In [2]:
!pip install torcheval



In [3]:
import json
import os
import torch
import torcheval

from torcheval.metrics.functional import binary_f1_score, binary_precision, binary_recall
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import List
from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader
from google.colab import drive

# Root of the mounted Google Drive; used below as the working directory.
PATH = '/content/drive/MyDrive/'
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Work from the Drive root so the relative paths used later in the notebook
# ('./checkpoints/...' and '<dataset_name>/<split>.json') resolve inside Drive.
os.chdir(PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
@dataclass
class HateSpeechExample:
  """A single classification example: the raw text and its binary label."""
  text: str
  label: int

  @staticmethod
  def from_list(sample):
    """
    Build an example from a raw record whose first two items are text and label.
    The label "normal" is mapped to 0; every other label value is mapped to 1.
    """
    raw_text = sample[0]
    raw_label = sample[1]
    binary_label = 0 if raw_label == "normal" else 1
    return HateSpeechExample(raw_text, binary_label)

In [5]:
class HateSpeechDataset(Dataset):
  """
  Dataset of HateSpeechExample items built from a list of raw records.

  The tokenizer is stored on the class rather than on the instance so that the
  static collate_fn can reach it; constructing a dataset therefore replaces the
  tokenizer seen by every instance.
  """
  tokenizer = None

  def __init__(self, raw_data_list, tokenizer):
    # Class-level assignment: this is the tokenizer collate_fn reads.
    HateSpeechDataset.tokenizer = tokenizer
    self.data = list(map(HateSpeechExample.from_list, raw_data_list))

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

  def __iter__(self):
    return iter(self.data)

  @staticmethod
  def collate_fn(samples: List[HateSpeechExample]):
    """
    Turn a list of examples into one batch.
    :param samples: Examples to batch together.
    :return: A dict with 'encoding' (the tokenizer output for the texts, padded
             and truncated to at most 512 tokens) and 'labels' (a long tensor).
    """
    batch_texts, batch_labels = [], []
    for example in samples:
      batch_texts.append(example.text)
      batch_labels.append(example.label)

    tokenized = HateSpeechDataset.tokenizer(
        batch_texts,
        padding=True,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(batch_labels, dtype=torch.long)

    return {'encoding': tokenized, 'labels': label_tensor}

In [6]:
def get_dataset(file, tokenizer):
  """
  Load a JSON file of raw samples and wrap it in a HateSpeechDataset.
  :param file: Path to a JSON file; its parsed content is passed to
               HateSpeechDataset as the raw sample list.
  :param tokenizer: Tokenizer forwarded to HateSpeechDataset.
  :return: The HateSpeechDataset built from the file.
  """
  # Decode explicitly as UTF-8 so non-ASCII text is read the same way
  # regardless of the platform's default encoding.
  with open(file, 'r', encoding='utf-8') as f:
    data = json.load(f)
  return HateSpeechDataset(data, tokenizer)

In [7]:
def initialize_datasets(tokenizer, dataset_name, data_names=('test', 'train')):
  """
  Load several splits of one dataset.
  :param tokenizer: Tokenizer forwarded to get_dataset for every split.
  :param dataset_name: Directory holding the split files; each split is read
                       from '<dataset_name>/<split>.json'.
  :param data_names: Names of the splits to load. Defaults to 'test' and
                     'train'; pass e.g. ('train', 'validation', 'test') when
                     the extra file exists.
  :return: A dict mapping each split name to its dataset.
  """
  return {
      data_name: get_dataset(f'{dataset_name}/{data_name}.json', tokenizer)
      for data_name in data_names
  }

In [11]:
import torch.nn as nn
from torch.optim import Optimizer
from tqdm import tqdm

def train_one_epoch(model: nn.Module, dataloader: DataLoader, optimizer: Optimizer, epoch: int):
    """
    Run one optimisation pass over the dataloader.
    :param model: A sequence-classification model loaded from transformers. It is called with the
                  batch encoding plus ``labels`` and its first output is used as the loss.
    :param dataloader: A train set dataloader whose batches carry 'encoding' and 'labels' entries
                       (the format produced by HateSpeechDataset.collate_fn).
    :param optimizer: An instance of Pytorch optimizer. (e.g., AdamW https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)
    :param epoch: An integer denoting current epoch; only used to label the progress bar.
    """
    model.train()

    progress = tqdm(dataloader, desc=f"Train Ep {epoch}", total=len(dataloader))
    with progress:
        for batch in progress:
            # Move the batch onto whatever device the model lives on.
            inputs = batch['encoding'].to(model.device)
            targets = batch['labels'].to(model.device)

            outputs = model(**inputs, labels=targets)
            loss = outputs[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Show the current batch loss next to the progress bar.
            progress.set_postfix({"loss": loss.detach().item()})

In [12]:
def evaluate(model: nn.Module, dataloader: DataLoader) -> dict:
    """
    Evaluate model on the dataloader and compute classification metrics.
    :param model: A language model loaded from transformers. (e.g., RobertaForSequenceClassification https://huggingface.co/docs/transformers/v4.37.0/en/model_doc/roberta#transformers.RobertaForSequenceClassification)
    :param dataloader: A dataloader whose batches carry 'encoding' and 'labels' entries
                       (the format produced by HateSpeechDataset.collate_fn).
    :return: A dict with the keys "Accuracy" (a float), "F1 score", "precision" and "recall"
             (the values returned by the torcheval binary metric functions). The dict is
             also printed.
    """
    model.eval()

    # Keep one CPU tensor per batch and concatenate once at the end. This avoids
    # iterating over the predictions element by element.
    prediction_batches = []
    label_batches = []
    with tqdm(dataloader, desc="Eval", total=len(dataloader)) as tq:
        for batch in tq:
            with torch.no_grad():
                text_encoding = batch['encoding'].to(model.device)
                label_encoding = batch['labels'].to(model.device)

                # Labels are passed so the output layout matches training; the second
                # output is used as the logits.
                logits = model(**text_encoding, labels=label_encoding)[1]

                prediction_batches.append(torch.argmax(logits, dim=-1).cpu())
                label_batches.append(label_encoding.cpu())

    if prediction_batches:
        all_predictions = torch.cat(prediction_batches).long()
        all_labels = torch.cat(label_batches).long()
    else:
        # Empty dataloader: fall back to empty tensors, as torch.cat rejects an empty list.
        all_predictions = torch.empty(0, dtype=torch.long)
        all_labels = torch.empty(0, dtype=torch.long)

    # The metric functions are called with the predictions first and the labels second.
    accuracy = compute_accuracy(all_predictions, all_labels)
    f1_score = binary_f1_score(all_predictions, all_labels)
    precision = binary_precision(all_predictions, all_labels)
    recall = binary_recall(all_predictions, all_labels)
    return_dict = {
        "Accuracy": accuracy,
        "F1 score": f1_score,
        "precision": precision,
        "recall": recall
    }

    print(return_dict)
    return return_dict


def compute_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    """
    Return the fraction of positions at which predictions and labels agree.
    :param predictions: torch.Tensor of size (N,)
    :param labels: torch.Tensor of size (N,)
    :return: The accuracy as a Python float.
    """
    assert predictions.size(-1) == labels.size(-1)

    # Element-wise match mask, converted to floating point so it can be averaged.
    is_correct = torch.eq(predictions, labels)
    return is_correct.to(torch.get_default_dtype()).mean().item()

In [9]:
# Scratch check: build a LongTensor directly from a Python list of ints.
torch.LongTensor([1, 2, 3])

tensor([1, 2, 3])

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Seed PyTorch's global random number generator so repeated runs start from the same random state.
torch.manual_seed(64)

def main(dataset_name='SWSR-CORE'):
    """
    Fine-tune a DistilBERT sequence classifier on a dataset and checkpoint it after every epoch.
    :param dataset_name: Directory (relative to the working directory) that holds the split
                         files; also used in the checkpoint file name.
    :return: A tuple (train_acc_history, val_acc_history). train_acc_history holds the metrics
             dict returned by evaluate() on the training dataloader after each epoch.
             val_acc_history is always empty because no validation split is evaluated.
    """
    # hyper-parameters (we provide initial set of values here, but you can modify them.)
    batch_size = 16
    learning_rate = 5e-5
    num_epochs = 10
    model_name = "Geotrend/distilbert-base-zh-cased"
    checkpoint_dir = './checkpoints'

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Use the GPU when one is available and fall back to the CPU otherwise,
    # instead of failing on machines without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, eps=1e-8)

    datasets = initialize_datasets(tokenizer, dataset_name)

    train_dataloader = DataLoader(datasets['train'],
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=HateSpeechDataset.collate_fn,
                                  num_workers=2)

    train_acc_history, val_acc_history = [], []

    # Make sure the checkpoint directory exists before the first save, so a full
    # epoch of training is not lost to a missing directory.
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(1, num_epochs + 1):
        train_one_epoch(model, train_dataloader, optimizer, epoch)
        train_metrics = evaluate(model, train_dataloader)
        train_acc_history.append(train_metrics)

        # TODO(review): validation is not evaluated, so no "best" epoch is selected.
        # The checkpoint is overwritten every epoch and ends up holding the
        # last epoch's model despite its file name.
        torch.save(model, f'{checkpoint_dir}/best_model_{dataset_name}.pth')

    return train_acc_history, val_acc_history

In [11]:
# Create the checkpoint directory on first use and report which case applied.
if os.path.exists('./checkpoints'):
    print("Directory already exists")
else:
    os.makedirs('./checkpoints')
    print("Directory created")

Directory already exists


## Train Model on HateExplain

In [18]:
from matplotlib import pyplot as plt
import numpy as np

train_acc_history, val_acc_history = main()

# main() returns one metrics dict per epoch; pull out the accuracy values so
# matplotlib receives plain numbers instead of dicts.
train_accuracies = [metrics['Accuracy'] for metrics in train_acc_history]
epochs = range(1, len(train_accuracies) + 1)

plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.plot(epochs, train_accuracies, label='train')
# NOTE(review): main() defaults to the 'SWSR-CORE' dataset while the title says
# 'HateExplain' - confirm which one is intended.
plt.title('HateExplain')
plt.legend()
plt.show()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Geotrend/distilbert-base-zh-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train Ep 1:   2%|▏         | 8/505 [00:02<02:09,  3.83it/s, loss=0.574]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7964b45cb880>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.10/multi

KeyboardInterrupt: 

In [19]:
# Display the per-epoch metric dicts collected by main().
train_acc_history

[{'Accuracy': 0.826684832572937,
  'F1 score': tensor(0.7445),
  'precision': tensor(0.7593),
  'recall': tensor(0.7302)},
 {'Accuracy': 0.8799554109573364,
  'F1 score': tensor(0.8177),
  'precision': tensor(0.8609),
  'recall': tensor(0.7786)},
 {'Accuracy': 0.9400396347045898,
  'F1 score': tensor(0.9121),
  'precision': tensor(0.9249),
  'recall': tensor(0.8997)},
 {'Accuracy': 0.9706392288208008,
  'F1 score': tensor(0.9584),
  'precision': tensor(0.9397),
  'recall': tensor(0.9778)},
 {'Accuracy': 0.9878592491149902,
  'F1 score': tensor(0.9826),
  'precision': tensor(0.9740),
  'recall': tensor(0.9914)},
 {'Accuracy': 0.9929385781288147,
  'F1 score': tensor(0.9898),
  'precision': tensor(0.9924),
  'recall': tensor(0.9871)},
 {'Accuracy': 0.9921952486038208,
  'F1 score': tensor(0.9886),
  'precision': tensor(0.9967),
  'recall': tensor(0.9807)},
 {'Accuracy': 0.9964073300361633,
  'F1 score': tensor(0.9948),
  'precision': tensor(0.9971),
  'recall': tensor(0.9925)},
 {'Accura

In [None]:
# train_acc_history holds one metrics dict per epoch; plot only the accuracy values.
train_accuracies = [metrics['Accuracy'] for metrics in train_acc_history]

plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='train')
plt.title('HateExplain')
plt.legend()
plt.show()

In [None]:
# Both histories hold metrics dicts (one per epoch); plot only the accuracy values.
# val_acc_history is empty when it comes from main(), in which case nothing is drawn for it.
train_accuracies = [metrics['Accuracy'] for metrics in train_acc_history]
val_accuracies = [metrics['Accuracy'] for metrics in val_acc_history]

plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='train')
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label='val')
plt.title('HateExplain')
plt.legend()
plt.show()

In [25]:
# Load the whole model object that main() saved with torch.save(model, ...).
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
best_model = torch.load('./checkpoints/best_model_SWSR-CORE.pth')

In [28]:

# NOTE(review): `tokenizer` is not defined by any cell above this one; it must
# already exist in the kernel for this cell to run.
# NOTE(review): initialize_datasets() as called here only loads the 'test' and
# 'train' splits, so `datasets['validation']` below raises KeyError.
datasets = initialize_datasets(tokenizer, 'HateExplain')
validation_dataloader = DataLoader(datasets['validation'],
                                    batch_size=64,
                                    shuffle=False,
                                    collate_fn=HateSpeechDataset.collate_fn,
                                    num_workers=2)
evaluate(best_model, validation_dataloader)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: 'HateExplain/test.json'

In [None]:
# Build a non-shuffled dataloader over the 'test' split of the `datasets` dict
# created in an earlier cell and evaluate `best_model` on it.
test_dataloader = DataLoader(datasets['test'],
                              batch_size=64,
                              shuffle=False,
                              collate_fn=HateSpeechDataset.collate_fn,
                              num_workers=2)
evaluate(best_model, test_dataloader)

In [13]:
# Reload the checkpoint written by main() for 'SWSR-CORE', rebuild the datasets
# with the same pretrained tokenizer name that main() uses, and report metrics
# on the test split.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
best_model = torch.load('./checkpoints/best_model_SWSR-CORE.pth')
tokenizer = AutoTokenizer.from_pretrained('Geotrend/distilbert-base-zh-cased')
datasets = initialize_datasets(tokenizer, "SWSR-CORE")
test_dataloader = DataLoader(datasets['test'],
                              batch_size=64,
                              shuffle=False,
                              collate_fn=HateSpeechDataset.collate_fn,
                              num_workers=2)
evaluate(best_model, test_dataloader)

Eval: 100%|██████████| 14/14 [00:11<00:00,  1.27it/s]


{'Accuracy': 0.7834821343421936, 'F1 score': tensor(0.6767), 'precision': tensor(0.6812), 'recall': tensor(0.6722)}


{'Accuracy': 0.7834821343421936,
 'F1 score': tensor(0.6767),
 'precision': tensor(0.6812),
 'recall': tensor(0.6722)}

In [None]:
# Re-run evaluation with the `best_model` and `test_dataloader` left in the kernel by earlier cells.
evaluate(best_model, test_dataloader)

## Train Chinese Baseline model

In [None]:
# Train on the 'SWSR' dataset directory; main() writes the checkpoint to
# './checkpoints/best_model_SWSR.pth'.
train_acc_history, val_acc_history = main('SWSR')

# Plot Chinese accuracy results

In [None]:
# Both histories hold metrics dicts (one per epoch); plot only the accuracy values.
# val_acc_history is empty when it comes from main(), in which case nothing is drawn for it.
train_accuracies = [metrics['Accuracy'] for metrics in train_acc_history]
val_accuracies = [metrics['Accuracy'] for metrics in val_acc_history]

plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='train')
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label='val')
plt.title('RBE-Chinese')
plt.legend()
plt.show()

## Evaluate Baseline Chinese model on test dataset

In [1]:
dataset_name = 'SWSR'
# Use the tokenizer that belongs to the checkpoint main() fine-tunes
# ("Geotrend/distilbert-base-zh-cased"). A tokenizer with a different vocabulary
# would feed the model token ids it was not trained on.
tokenizer = AutoTokenizer.from_pretrained('Geotrend/distilbert-base-zh-cased')
datasets = initialize_datasets(tokenizer, dataset_name)
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
best_model = torch.load(f'./checkpoints/best_model_{dataset_name}.pth')
# NOTE(review): initialize_datasets() as called here only loads the 'test' and
# 'train' splits, so `datasets['validation']` raises KeyError.
validation_dataloader = DataLoader(datasets['validation'],
                                   batch_size=64,
                                   shuffle=False,
                                   collate_fn=HateSpeechDataset.collate_fn,
                                   num_workers=2)
evaluate(best_model, validation_dataloader)

NameError: name 'AutoTokenizer' is not defined

In [None]:
# Build a non-shuffled dataloader over the 'test' split of the `datasets` dict
# created in an earlier cell and evaluate `best_model` on it.
test_dataloader = DataLoader(datasets['test'],
                              batch_size=64,
                              shuffle=False,
                              collate_fn=HateSpeechDataset.collate_fn,
                              num_workers=2)
evaluate(best_model, test_dataloader)