In [None]:
# This notebook is written to run on Google Colab rather than locally, because it relies on Colab's GPU runtime.
import torch
from tqdm.notebook import tqdm

!pip install transformers
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

import pandas as pd
import numpy as np

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 13.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.1 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled 

In [None]:
# Load the cleaned dataset through Colab's file-upload flow.
import io
from google.colab import files

# Show floats without decimals in DataFrame displays.
pd.set_option('display.float_format', '{:.0f}'.format)

# files.upload() result is indexed by file name below; the entry is decoded
# from UTF-8 bytes before being parsed as CSV.
uploaded = files.upload()
csv_text = uploaded['clean_dataset.csv'].decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))

df.head(5)

Saving clean_dataset.csv to clean_dataset.csv


Unnamed: 0,id,text,Annotation,clean_text,clean_text_sl,Hate
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither,i just found the perfect rental why cant my le...,found perfect rental cant lease right,0
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither,every time they discover anything its either ...,every time discover anything either incorrect ...,0
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither,ok time to write code bbl\n\nmaking a new thing,ok time write code bbl making new thing,0
3,572344911002927104,Refined dessert! NOT #MKR,Neither,refined dessert not mkr,refined dessert mkr,0
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither,one of the best things anyone can do to impr...,one best thing anyone improve understanding gr...,0


In [None]:
# Distinct annotation strings as returned by Series.unique().
class_labels = df.Annotation.unique()

# Map each class label (string) to a numeric id (its position in class_labels).
# The loop variable is deliberately NOT called `class_labels`: reusing that
# name would rebind it to a single string once the cell has run.
label_dict = {label: index for index, label in enumerate(class_labels)}
label_dict

{'Homophobia': 3, 'Neither': 0, 'Racism': 2, 'Sexism': 1}

In [None]:
# Working frame restricted to the columns the model needs:
#   tweets      <- df['clean_text']
#   tweets_sl   <- df['clean_text_sl']
#   Annotation  <- original string label
#   class_label <- Annotation translated through label_dict
dataset = pd.DataFrame({
    'tweets': df['clean_text'],
    'tweets_sl': df['clean_text_sl'],
    'Annotation': df['Annotation'],
    'class_label': df.Annotation.replace(label_dict),
})

dataset.head(5)

Unnamed: 0,tweets,tweets_sl,Annotation,class_label
0,i just found the perfect rental why cant my le...,found perfect rental cant lease right,Neither,0
1,every time they discover anything its either ...,every time discover anything either incorrect ...,Neither,0
2,ok time to write code bbl\n\nmaking a new thing,ok time write code bbl making new thing,Neither,0
3,refined dessert not mkr,refined dessert mkr,Neither,0
4,one of the best things anyone can do to impr...,one best thing anyone improve understanding gr...,Neither,0


In [None]:
# Train/test split. Stratified vs. non-stratified splitting is an experiment
# on class imbalance; the non-stratified variant is the active one.

from sklearn.model_selection import train_test_split

row_index = dataset.index.values
row_labels = dataset.class_label.values

# Stratified alternative: add `stratify=row_labels` to the call below.

# Non-stratified split: 25% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    row_index,
    row_labels,
    test_size=0.25,
    random_state=12,
)

# Tag every row with the split it landed in.
for split_name, split_rows in (('train', X_train), ('test', X_test)):
    dataset.loc[split_rows, 'splits'] = split_name

# Row counts per class and split.
dataset.groupby(['Annotation', 'class_label', 'splits']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweets,tweets_sl
Annotation,class_label,splits,Unnamed: 3_level_1,Unnamed: 4_level_1
Homophobia,3,test,26,26
Homophobia,3,train,61,61
Neither,0,test,1427,1427
Neither,0,train,4292,4292
Racism,2,test,18,18
Racism,2,train,80,80
Sexism,1,test,233,233
Sexism,1,train,678,678


In [None]:
# Tokenizer checkpoint. This must name the same checkpoint as the one passed to
# BertForSequenceClassification.from_pretrained; the notebook trains
# "bert-base-uncased", so the tokenizer uses it too (it previously loaded
# 'bert-large-uncased' under a "base BERT" comment).
# For the large-model experiment, change this to 'bert-large-uncased' together
# with the model checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Column of `dataset` that is fed to the tokenizer:
#   'tweets'    -> no "sl" preprocessing (active)
#   'tweets_sl' -> "sl"-preprocessed text
TEXT_COLUMN = 'tweets'


def _encode_split(split_name):
    """Tokenize the TEXT_COLUMN values of every row whose `splits` equals split_name.

    Returns the tokenizer's batch encoding as PyTorch tensors, with special
    tokens added, attention masks included, and every sequence padded to the
    longest sequence of that split.
    """
    texts = dataset.loc[dataset.splits == split_name, TEXT_COLUMN].values
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='longest',
        return_tensors='pt'
    )


def _labels_for_split(split_name):
    """Return the class_label values of the given split as a tensor."""
    return torch.tensor(dataset.loc[dataset.splits == split_name, 'class_label'].values)


encoded_data_train = _encode_split('train')
encoded_data_test = _encode_split('test')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = _labels_for_split('train')

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = _labels_for_split('test')

# Each dataset item is a tuple (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

print("Example padded input ids: ", dataset_train[0][0])

Example padded input ids:  tensor([  101,  2296,  2051,  2027,  7523,  2505,  2049,  2593, 16542,  2030,
         2242,  4921,  2063,  2056,  7271,  2006,  2026,  9927,  2027,  2024,
        11809,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0])


In [None]:
# Classification model: pretrained BERT with one output per entry in label_dict.
# Attention maps and hidden states are not returned.
classifier_kwargs = dict(
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

# Base checkpoint (active). For the large experiment, pass "bert-large-uncased" instead.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_kwargs)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch for both loaders.
batch_size = 8

# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)

# Test batches keep the dataset order.
test_sampler = SequentialSampler(dataset_test)
dataloader_test = DataLoader(dataset_test, sampler=test_sampler, batch_size=batch_size)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# and is no longer importable from recent transformers releases, which breaks
# this cell because the install at the top of the notebook is unpinned.
# weight_decay=0.0 is set explicitly because torch.optim.AdamW defaults to 0.01;
# NOTE(review): this is meant to match the default of the transformers
# optimizer used previously -- confirm against the version you trained with.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

# Number of full passes over dataloader_train.
epochs = 10

# The scheduler is stepped once per batch in the training loop, so the total
# number of steps is (batches per epoch) * epochs. No warmup steps are used.
total_training_steps = len(dataloader_train) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_training_steps)

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

def f1_score_func(preds, labels):
    """Return the macro-averaged F1 score.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids (flattened before scoring).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='macro')

def recall_score_func(preds, labels):
    """Return the macro-averaged recall.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids (flattened before scoring).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return recall_score(true_classes, predicted_classes, average='macro')

def precision_score_func(preds, labels):
    """Return the macro-averaged precision.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids (flattened before scoring).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return precision_score(true_classes, predicted_classes, average='macro')


def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, correct predictions / total samples.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids; ids are translated back to class names
            through the inverse of the module-level label_dict.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Rows whose true class is class_id.
        in_class = actual == class_id
        n_correct = np.count_nonzero(predicted[in_class] == class_id)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
import random

# Run on the GPU when one is available, otherwise fall back to the CPU.
# (The fallback used to be overwritten by a hard-coded "cuda:0", which made
# model.to(device) fail on a CPU-only runtime.)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Seed every random number generator used by the notebook.
# Note: this runs after the model has already been instantiated, so it only
# affects randomness from this point on (e.g. batch shuffling, dropout).
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):
    """Run the module-level `model` over dataloader_val without gradient tracking.

    Each batch is expected to provide (input_ids, attention_mask, labels) in
    its first three positions. The model is switched to eval mode.

    Returns a tuple (loss_val_avg, predictions, true_vals):
      loss_val_avg -- sum of the per-batch losses divided by the number of batches
      predictions  -- per-batch logits concatenated along axis 0 (numpy array)
      true_vals    -- per-batch labels concatenated along axis 0 (numpy array)
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        # Move the batch tensors to the same device as the model.
        moved = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask, labels = moved[:3]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals
    
# Fine-tuning loop: for each epoch, train over dataloader_train, save a
# checkpoint, then report loss and macro metrics on dataloader_test.
# NOTE(review): the figures printed as "Validation loss" are computed on the
# test split; picking a checkpoint from them means the test set is also used
# for model selection.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # outputs[0] is the loss for this batch.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It used to be divided by len(batch), but
        # `batch` is the 3-tuple (input_ids, attention_mask, labels), so that
        # always divided by 3 rather than by the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # One checkpoint per epoch, so any epoch can be reloaded later.
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    test_loss, predictions, true_vals = evaluate(dataloader_test)
    test_f1 = f1_score_func(predictions, true_vals)
    test_recall = recall_score_func(predictions, true_vals)
    test_precision = precision_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {test_loss}')
    tqdm.write(f'F1 Score (Macro): {test_f1}')
    tqdm.write(f'Recall Score (Macro): {test_recall}')
    tqdm.write(f'Precision Score (Macro): {test_precision}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=639.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.4355372210850878
Validation loss: 0.33009136362578456
F1 Score (Macro): 0.41379750477124366
Recall Score (Macro): 0.40338911429181545
Precision Score (Macro): 0.4291884482978452


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=639.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.2688735027852904
Validation loss: 0.37149504816242684
F1 Score (Macro): 0.7035620543095626
Recall Score (Macro): 0.6559746158028518
Precision Score (Macro): 0.7794278527137608


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=639.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.19119396316636708
Validation loss: 0.34125958838032233
F1 Score (Macro): 0.7379273002113104
Recall Score (Macro): 0.7095310779827805
Precision Score (Macro): 0.7894157877867298


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=639.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.12747359263320077
Validation loss: 0.48350974408573005
F1 Score (Macro): 0.7406617225803346
Recall Score (Macro): 0.7417836732397127
Precision Score (Macro): 0.7728648279911579


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=639.0, style=ProgressStyle(description_widt…


Epoch 5
Training loss: 0.09177733645081215
Validation loss: 0.550401301317774
F1 Score (Macro): 0.7423027135991634
Recall Score (Macro): 0.744032815154665
Precision Score (Macro): 0.7609366968890906


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=639.0, style=ProgressStyle(description_widt…


Epoch 6
Training loss: 0.06648968274641165
Validation loss: 0.5911955690160864
F1 Score (Macro): 0.7339938155485622
Recall Score (Macro): 0.7594322712468767
Precision Score (Macro): 0.735333111463668


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=639.0, style=ProgressStyle(description_widt…


Epoch 7
Training loss: 0.049345964702192026
Validation loss: 0.6591810934227004
F1 Score (Macro): 0.7160596510814365
Recall Score (Macro): 0.7531696957185167
Precision Score (Macro): 0.7114341676841677


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=639.0, style=ProgressStyle(description_widt…


Epoch 8
Training loss: 0.040717354708331883
Validation loss: 0.6267392148626252
F1 Score (Macro): 0.7670401235313389
Recall Score (Macro): 0.7600458216888436
Precision Score (Macro): 0.7940355258776312


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=639.0, style=ProgressStyle(description_widt…


Epoch 9
Training loss: 0.03284022725681506
Validation loss: 0.6405541960772662
F1 Score (Macro): 0.7534871988458811
Recall Score (Macro): 0.758469087280995
Precision Score (Macro): 0.7723391944981841


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=639.0, style=ProgressStyle(description_wid…


Epoch 10
Training loss: 0.02630559062943772
Validation loss: 0.6664186407286704
F1 Score (Macro): 0.7523901801509149
Recall Score (Macro): 0.7575713186195876
Precision Score (Macro): 0.7679414198481995



In [None]:
# Epoch whose checkpoint is reloaded for the final report.
# NOTE(review): 8 appears to be chosen from the per-epoch metrics printed by
# the training loop (highest macro F1 there) -- confirm when re-training.
best_epoch = 8

# Rebuild the architecture, then overwrite its weights with the checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only runtime.
model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{best_epoch}.model',
                                 map_location=device))

# Per-class correct/total counts on the test split.
_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: Neither
Accuracy: 1383/1427

Class: Sexism
Accuracy: 144/233

Class: Racism
Accuracy: 13/18

Class: Homophobia
Accuracy: 19/26



In [None]:

# Macro-averaged metrics for the predictions produced by the reloaded checkpoint.
overall_f1 = f1_score_func(predictions, true_vals)
overall_recall = recall_score_func(predictions, true_vals)
overall_precision = precision_score_func(predictions, true_vals)

for metric_name, metric_value in (
    ("macro_f1", overall_f1),
    ("macro_recall", overall_recall),
    ("macro_precision", overall_precision),
):
    print(metric_name, metric_value)

macro_f1 0.7670401235313389
macro_recall 0.7600458216888436
macro_precision 0.7940355258776312
