In [None]:
!pip install transformers

In [17]:
import re
import pandas as pd
import numpy as np
import json
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm, trange
from sklearn.metrics import classification_report

In [3]:
# load data
df = pd.read_csv("data_ground_truth.csv")
df.groupby('quarter').count()
df_q1 = df[(df['quarter']=='Q1_2020') | (df['quarter']=='Q1_2021') | (df['quarter']=='Q1_2022')]
df_q2 = df[(df['quarter']=='Q2_2020') | (df['quarter']=='Q2_2021') | (df['quarter']=='Q2_2022')]
df_q3 = df[(df['quarter']=='Q3_2020') | (df['quarter']=='Q3_2021') | (df['quarter']=='Q3_2022')]
df_q4 = df[(df['quarter']=='Q4_2020') | (df['quarter']=='Q4_2021') | (df['quarter']=='Q4_2022')]
# reset index for random sampling
df_q1.reset_index(inplace=True)
df_q2.reset_index(inplace=True)
df_q3.reset_index(inplace=True)
df_q4.reset_index(inplace=True)

In [4]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### from models.py
# from models import *
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/roberta-base-hate")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/roberta-base-hate").to(device)
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [5]:
# set the dataloader
from torch.utils.data import Dataset, DataLoader
class HateDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.loc[idx]['text']
        label = self.dataframe.loc[idx]['ground_truth']
        category = self.dataframe.loc[idx]['category']
        return text, label, category

def get_dataloader(train_data, test_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    return train_dataloader, test_dataloader

In [15]:
# define the training function
def train(model, train_dataloader, optimizer, device):

    for batch in train_dataloader:
        batch_loss = 0
        model.train()
        optimizer.zero_grad()
        text, label, category = batch
        texts = []
        for txt in text:
            texts.append(txt)
        labels = []
        for label in label:
            if label == 1:
                label = [0, 1]
            else:
                label = [1, 0]
            labels.append(label)
        labels = torch.Tensor(labels)
        # print(labels)
        encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
        output = model(**encoded_input)
        scores = output['logits']
        prediction_logits = torch.softmax(scores, dim=1)
        # print(prediction_logits)
        # print(labels)

        loss = F.cross_entropy(prediction_logits, labels.to(device))
        loss.backward()
        optimizer.step()

        # calculate the loss
        batch_loss += loss.item()

        # print('batch loss: ', batch_loss)

# define the evaluation function
def evaluate(model, test_dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            text, label, category = batch
            texts = []
            for txt in text:
                texts.append(txt)

            encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
            output = model(**encoded_input)
            scores = output['logits']
            prediction_logits = torch.softmax(scores, dim=1)
            prediction = prediction_logits.argmax(dim=1)
            # prediction = prediction_logits.argmax().item()
            correct += (prediction == label.to(device)).sum().item()
            total += len(label)
    return correct / total

In [6]:
# define the training process
def train_process(model, train_dataloader, test_dataloader, optimizer, device, epochs):
    train_acc = []
    test_acc = []
    for epoch in range(epochs):
        train(model, train_dataloader, optimizer, device)
        train_acc.append(evaluate(model, train_dataloader, device))
        test_acc.append(evaluate(model, test_dataloader, device))
        print('Epoch: {}, Train Accuracy: {}, Test Accuracy: {}'.format(epoch, train_acc[-1], test_acc[-1]))
    return train_acc, test_acc


In [7]:
# evaluate the model and save the predictions into a dataframe
def evaluate_and_save(model, test_dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            text, label, category = batch
            texts = []
            for txt in text:
                texts.append(txt)

            encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
            output = model(**encoded_input)
            scores = output['logits']
            prediction_logits = torch.softmax(scores, dim=1)

            prediction = prediction_logits.argmax(dim=1)
            # convert to cpu
            label = label.cpu().numpy()
            prediction = prediction.cpu().numpy()
            # print(prediction, label)
            # save the predictions into a dict
            for i in range(len(prediction)):
                predictions.append({'text': text[i], 'label': label[i], 'prediction': prediction[i], 'category': category[i]})
    # save the predictions into a dataframe
    predictions_df = pd.DataFrame(predictions)
    return predictions_df

### Evaluate the 1st quarter

In [8]:
train_dataset = HateDataset(df_q1)
test_dataset = HateDataset(df_q1)
train_dataloader, test_dataloader = get_dataloader(train_dataset, test_dataset, batch_size=32)

In [9]:
df_q1result = evaluate_and_save(model, test_dataloader, device) # evaluate

In [10]:
df_q1result.groupby(['category']).count()

Unnamed: 0_level_0,text,label,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ageism,186,186,186
asian,179,179,179
mask,16,16,16
rus_ukr,158,158,158
us_capitol,311,311,311
vaccine,78,78,78


In [11]:
def calculate(dataframe):
    for cate in dataframe.category.unique().tolist():
        df_sub = dataframe[dataframe['category']==cate]
        print(cate)
        print(classification_report(df_sub['label'], df_sub['prediction'], digits=3))
        print('====================')

In [12]:
calculate(df_q1result)

vaccine
              precision    recall  f1-score   support

           0      0.708     0.939     0.807        49
           1      0.769     0.345     0.476        29

    accuracy                          0.718        78
   macro avg      0.738     0.642     0.642        78
weighted avg      0.731     0.718     0.684        78

asian
              precision    recall  f1-score   support

           0      0.595     0.926     0.724        95
           1      0.774     0.286     0.417        84

    accuracy                          0.626       179
   macro avg      0.684     0.606     0.571       179
weighted avg      0.679     0.626     0.580       179

ageism
              precision    recall  f1-score   support

           0      0.810     0.959     0.879       147
           1      0.500     0.154     0.235        39

    accuracy                          0.790       186
   macro avg      0.655     0.557     0.557       186
weighted avg      0.745     0.790     0.744       186

### Train with 1st quarter and evaluate the 2nd quarter

In [13]:
train_dataset = HateDataset(df_q1) # we use the whole q1 data to train the model
test_dataset = HateDataset(df_q2)
train_dataloader, test_dataloader = get_dataloader(train_dataset, test_dataset, batch_size=32)

In [18]:
# define the optimizer
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)

# fine-tune the model with q1 data
train_acc, test_acc = train_process(model, train_dataloader, test_dataloader, optimizer, device, epochs=10)

Epoch: 0, Train Accuracy: 0.8383620689655172, Test Accuracy: 0.8040313549832027
Epoch: 1, Train Accuracy: 0.8933189655172413, Test Accuracy: 0.8230683090705487
Epoch: 2, Train Accuracy: 0.8825431034482759, Test Accuracy: 0.8040313549832027
Epoch: 3, Train Accuracy: 0.9267241379310345, Test Accuracy: 0.8365061590145577
Epoch: 4, Train Accuracy: 0.9288793103448276, Test Accuracy: 0.8342665173572228
Epoch: 5, Train Accuracy: 0.9428879310344828, Test Accuracy: 0.8398656215005599
Epoch: 6, Train Accuracy: 0.9536637931034483, Test Accuracy: 0.8421052631578947
Epoch: 7, Train Accuracy: 0.9536637931034483, Test Accuracy: 0.8387458006718925
Epoch: 8, Train Accuracy: 0.9579741379310345, Test Accuracy: 0.8432250839865622
Epoch: 9, Train Accuracy: 0.9655172413793104, Test Accuracy: 0.8309070548712206


In [19]:
df_q2result = evaluate_and_save(model, test_dataloader, device) # evaluate

In [20]:
df_q2result.groupby(['category']).count()

Unnamed: 0_level_0,text,label,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ageism,117,117,117
asian,296,296,296
mask,64,64,64
rus_ukr,190,190,190
us_capitol,112,112,112
vaccine,114,114,114


In [21]:
calculate(df_q2result)

ageism
              precision    recall  f1-score   support

           0      0.898     0.929     0.913        85
           1      0.793     0.719     0.754        32

    accuracy                          0.872       117
   macro avg      0.845     0.824     0.834       117
weighted avg      0.869     0.872     0.870       117

mask
              precision    recall  f1-score   support

           0      0.863     0.957     0.907        46
           1      0.846     0.611     0.710        18

    accuracy                          0.859        64
   macro avg      0.854     0.784     0.808        64
weighted avg      0.858     0.859     0.852        64

vaccine
              precision    recall  f1-score   support

           0      0.881     0.703     0.782        74
           1      0.600     0.825     0.695        40

    accuracy                          0.746       114
   macro avg      0.741     0.764     0.738       114
weighted avg      0.783     0.746     0.751       114


### Train with 2nd quarter and evaluate the 3rd quarter


In [22]:
train_dataset = HateDataset(df_q2) # we use the whole q1 data to train the model
test_dataset = HateDataset(df_q3)
train_dataloader, test_dataloader = get_dataloader(train_dataset, test_dataset, batch_size=32)

# fine-tune the model with q2 data
train_acc, test_acc = train_process(model, train_dataloader, test_dataloader, optimizer, device, epochs=10)
df_q3result = evaluate_and_save(model, test_dataloader, device) # evaluate

Epoch: 0, Train Accuracy: 0.8533034714445689, Test Accuracy: 0.813588850174216
Epoch: 1, Train Accuracy: 0.8868980963045913, Test Accuracy: 0.8214285714285714
Epoch: 2, Train Accuracy: 0.9193729003359462, Test Accuracy: 0.828397212543554
Epoch: 3, Train Accuracy: 0.9395296752519597, Test Accuracy: 0.8344947735191638
Epoch: 4, Train Accuracy: 0.9473684210526315, Test Accuracy: 0.8423344947735192
Epoch: 5, Train Accuracy: 0.9563269876819709, Test Accuracy: 0.837979094076655
Epoch: 6, Train Accuracy: 0.9585666293393057, Test Accuracy: 0.8397212543554007
Epoch: 7, Train Accuracy: 0.9630459126539753, Test Accuracy: 0.8353658536585366
Epoch: 8, Train Accuracy: 0.9652855543113102, Test Accuracy: 0.8344947735191638
Epoch: 9, Train Accuracy: 0.9675251959686451, Test Accuracy: 0.8440766550522648


In [23]:
df_q3result.groupby(['category']).count()

Unnamed: 0_level_0,text,label,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ageism,114,114,114
asian,331,331,331
mask,249,249,249
rus_ukr,192,192,192
us_capitol,158,158,158
vaccine,104,104,104


In [24]:
calculate(df_q3result)

ageism
              precision    recall  f1-score   support

           0      0.920     0.873     0.896        79
           1      0.744     0.829     0.784        35

    accuracy                          0.860       114
   macro avg      0.832     0.851     0.840       114
weighted avg      0.866     0.860     0.862       114

asian
              precision    recall  f1-score   support

           0      0.805     0.838     0.821       148
           1      0.864     0.836     0.850       183

    accuracy                          0.837       331
   macro avg      0.835     0.837     0.836       331
weighted avg      0.838     0.837     0.837       331

mask
              precision    recall  f1-score   support

           0      0.917     0.907     0.912       182
           1      0.754     0.776     0.765        67

    accuracy                          0.871       249
   macro avg      0.835     0.841     0.838       249
weighted avg      0.873     0.871     0.872       249

v

### Train with 3nd quarter and evaluate the 4rd quarter

In [25]:
train_dataset = HateDataset(df_q3) # we use the whole q1 data to train the model
test_dataset = HateDataset(df_q4)
train_dataloader, test_dataloader = get_dataloader(train_dataset, test_dataset, batch_size=32)

# fine-tune the model with q2 data
train_acc, test_acc = train_process(model, train_dataloader, test_dataloader, optimizer, device, epochs=10)
df_q4result = evaluate_and_save(model, test_dataloader, device) # evaluate

Epoch: 0, Train Accuracy: 0.8614982578397212, Test Accuracy: 0.8312318137730359
Epoch: 1, Train Accuracy: 0.8858885017421603, Test Accuracy: 0.8409311348205626
Epoch: 2, Train Accuracy: 0.8980836236933798, Test Accuracy: 0.8409311348205626
Epoch: 3, Train Accuracy: 0.9050522648083623, Test Accuracy: 0.8263821532492726
Epoch: 4, Train Accuracy: 0.9198606271777003, Test Accuracy: 0.8341416100872939
Epoch: 5, Train Accuracy: 0.9137630662020906, Test Accuracy: 0.8079534432589719
Epoch: 6, Train Accuracy: 0.9346689895470384, Test Accuracy: 0.8302618816682832
Epoch: 7, Train Accuracy: 0.936411149825784, Test Accuracy: 0.8234723569350145
Epoch: 8, Train Accuracy: 0.9329268292682927, Test Accuracy: 0.8176527643064986
Epoch: 9, Train Accuracy: 0.9416376306620209, Test Accuracy: 0.8283220174587779


In [26]:
df_q4result.groupby(['category']).count()

Unnamed: 0_level_0,text,label,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ageism,161,161,161
asian,262,262,262
mask,199,199,199
rus_ukr,60,60,60
us_capitol,123,123,123
vaccine,226,226,226


In [27]:
calculate(df_q4result)

ageism
              precision    recall  f1-score   support

           0      0.686     0.857     0.762        84
           1      0.786     0.571     0.662        77

    accuracy                          0.720       161
   macro avg      0.736     0.714     0.712       161
weighted avg      0.734     0.720     0.714       161

mask
              precision    recall  f1-score   support

           0      0.890     0.817     0.852       109
           1      0.798     0.878     0.836        90

    accuracy                          0.844       199
   macro avg      0.844     0.847     0.844       199
weighted avg      0.848     0.844     0.845       199

vaccine
              precision    recall  f1-score   support

           0      0.907     0.913     0.910       149
           1      0.829     0.818     0.824        77

    accuracy                          0.881       226
   macro avg      0.868     0.865     0.867       226
weighted avg      0.880     0.881     0.880       226
