In [1]:
cwk_dir ="drive/MyDrive/NLU Coursework/" #For running in Jack's Google Drive

# Setup

## Connect Google Drive Folder

In [2]:
from google.colab import drive
import os
import sys

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Imports

In [3]:
sys.path.append(cwk_dir)
from classes.evaluation import evaluate
from classes.preprocessing import load_data, augment_data

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler,random_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaModel, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from torch.nn import Linear, AvgPool2d, CrossEntropyLoss, Dropout, Tanh

import torch

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import typing
from PIL import Image
import json
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import string

## Primary Variables

Filepath variables

In [6]:
solution_dir = os.path.join(cwk_dir, "solution_C")
models_dir = os.path.join(solution_dir, "models/roberta")
results_dir = os.path.join(solution_dir, "results")

Training variables

In [7]:
INITIAL_LR: float = 2e-5
EPOCHS: int = 6
VALIDATION_SPLIT: float = 0.2
BATCH_SIZE: int = 16

# BERT_ID: str = 'bert-base-uncased'
BERT_ID: str = 'roberta-base'
NUM_LABELS: int = 2

BERT Keys

In [8]:
INPUTS_IDS_KEY: str = "input_ids"
ATTENTION_MASK_KEY: str = "attention_mask"
TOKEN_TYPE_KEY: str = "token_type_ids"

Other

In [9]:
MAX_SEQ_LENGTH: int = 512 #None is the value to denote that there is no max length. Max length is recommended
VOCAB_SIZE: int = None #None is the value to denote that there is no vocab size yet. This is set later, once we have the training data
EMBEDDING_SIZE: int = None

## Functions

In [10]:
def tokenize_data(tokenizer: RobertaTokenizer, premises: typing.List[str], hypotheses: typing.List[str], maxlen: int) ->typing.Tuple[np.array, np.array]:
  """
  Uses the input tokenizer to tokenizer the premises & hypotheses together. Will padd/truncate the sequences of tokens correctly. Formats the sequences together of the format below

      sample = [CLS] Premise [SEP] Hypothesis [SEP]
  """
  return tokenizer(premises, hypotheses, max_length=maxlen, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

In [11]:
def get_accuracy(preds, labels) -> float:
  """
  Gets the accuracy between the predictions and labels. Returns this float
  """
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [12]:
test = [1,2,3,4,5]
index = 3
test = test[:(index - 1)] + test[(index):]
test

[1, 2, 4, 5]

# Data Preprocessing

## Load Data

In [35]:
(train_premises, train_hypotheses, train_labels), (dev_premises, dev_hypotheses, dev_labels) = load_data(cwk_dir)

In [36]:
train_labels = torch.tensor([int(x) for x in train_labels])
dev_labels = torch.tensor([int(x) for x in dev_labels])

### Data Augmentation
Different data augmentations are inspired by the paper found [here](https://link.springer.com/article/10.1186/s40537-021-00492-0)

In [37]:
# print(f"{len(train_premises)} samples before augmentation")
# train_premises, train_hypotheses, train_labels = augment_data(train_premises, train_hypotheses, train_labels, premise_quantity = 3, hypothesis_quantity=3)
# train_labels = torch.tensor(train_labels)
# print(f"{len(train_premises)} samples after augmentation")

## Tokenize Data

In [38]:
tokenizer = RobertaTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

In [39]:
train_data = tokenize_data(tokenizer=tokenizer, premises=list(train_premises), hypotheses=list(train_hypotheses), maxlen=MAX_SEQ_LENGTH)
dev_data = tokenize_data(tokenizer=tokenizer, premises=list(dev_premises), hypotheses=list(dev_hypotheses), maxlen=MAX_SEQ_LENGTH) #Dev is used for evaluation

Example of a sentence:

In [40]:
print(f"Sentence: {tokenizer.convert_ids_to_tokens(train_data[INPUTS_IDS_KEY][0])}")
print(f"Tokens: {train_data[INPUTS_IDS_KEY]}")

Sentence: ['<s>', 'However', ',', 'ĠFort', 'ĠCharles', 'Ġwas', 'Ġrebuilt', 'Ġas', 'Ġa', 'Ġmilitary', 'Ġand', 'Ġnaval', 'Ġgarrison', ',', 'Ġand', 'Ġit', 'Ġprotected', 'ĠJamaica', 'Ġand', 'Ġmuch', 'Ġof', 'Ġthe', 'ĠEnglish', 'ĠCaribbean', 'Ġfor', 'Ġ250', 'Ġyears', 'Ġuntil', 'Ġthe', 'Ġadvent', 'Ġof', 'Ġste', 'ams', 'hips', 'Ġand', 'Ġyet', 'Ġanother', 'Ġearthquake', 'Ġin', 'Ġ1907', 'Ġsaw', 'Ġits', 'Ġdecline', '.', '</s>', '</s>', 'Fort', 'ĠCharles', 'Ġwas', 'Ġrebuilt', 'Ġas', 'Ġan', 'Ġamusement', 'Ġpark', 'Ġfor', 'Ġthe', 'Ġlocals', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

In [41]:
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 50265


## Format Dataset & Dataloader

In [42]:
dataset = TensorDataset(train_data[INPUTS_IDS_KEY], train_data[ATTENTION_MASK_KEY], train_labels)
test_dataset = TensorDataset(dev_data[INPUTS_IDS_KEY], dev_data[ATTENTION_MASK_KEY], dev_labels) #note here that the dev dataset is used for testing (evaluation) later

In [43]:
train_dataset, val_dataset = random_split(dataset, [(1 - VALIDATION_SPLIT), VALIDATION_SPLIT])

In [44]:
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

# Model Training

## Model Architecture
Inspiration:
- https://arxiv.org/pdf/2105.03791.pdf
- https://aclanthology.org/D15-1075.pdf

A larger network is better:
- https://arxiv.org/pdf/2110.01518.pdf

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [25]:
model = RobertaForSequenceClassification.from_pretrained(BERT_ID, num_labels=NUM_LABELS)

model = model.to(device)
model

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Learning Rate

In [26]:
OPTIM = torch.optim.AdamW(model.parameters(), lr=INITIAL_LR)
SCHEDULER = get_linear_schedule_with_warmup(OPTIM, num_warmup_steps = int(len(train_dataset) * 1), num_training_steps = int(len(train_dataset) * EPOCHS))

## Model Training

In [27]:
#Loss metric
loss_function = CrossEntropyLoss().to(device)

In [28]:
patience = 2
min_delta = 0.01
best_val_loss = 1000
current_patience = 0

best_model_filename: str = "best_solution_C.pt"

In [29]:
for epoch in range(EPOCHS):
  ## Training
  model.train()
  total_loss = 0
  total_accuracy = 0
  for batch in train_dataloader:
    OPTIM.zero_grad()

    input_ids, attention_mask, labels = [part.to(device) for part in batch]

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels)

    loss = outputs.loss
    total_loss += loss.item()
    loss.backward()

    total_accuracy += get_accuracy(outputs.logits.detach().cpu().numpy(), labels.to('cpu').numpy())

    OPTIM.step()
    SCHEDULER.step()

  avg_train_loss = total_loss / len(train_dataloader)
  avg_train_accuracy = total_accuracy / len(train_dataloader)
  print(f"Epoch {epoch+1}, Train Average Accuracy: {avg_train_accuracy}, Training Average Loss: {avg_train_loss}")

  ##Validation
  model.eval()
  total_val_accuracy = 0
  total_val_loss = 0

  for batch in val_dataloader:
    input_ids, attention_mask, labels = [part.to(device) for part in batch]

    with torch.no_grad():
      outputs = model(input_ids = input_ids,
                      attention_mask = attention_mask,
                      labels=labels)

    loss = outputs.loss.detach().cpu().numpy()
    logits = outputs.logits.detach().cpu().numpy()
    total_val_loss  += loss.item()

    total_val_accuracy += get_accuracy(logits, labels.to('cpu').numpy())

  avg_val_accuracy = total_val_accuracy / len(val_dataloader)
  avg_val_loss = total_val_loss / len(val_dataloader)

  print(f"Epoch {epoch+1}, Validation Average Accuracy: {avg_val_accuracy}, Validation Average Loss: {avg_val_loss}")

  #Early stopping
  if avg_val_loss < (best_val_loss - min_delta):
    best_val_loss = avg_val_loss
    current_patience = 0

    #Save the best model so far
    torch.save(model.state_dict(), os.path.join(models_dir, best_model_filename))
    print(f"Best model recorded at epoch {epoch+1}")
  else:
    current_patience += 1
    if current_patience >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

Epoch 1, Train Average Accuracy: 0.515676738410596, Training Average Loss: 0.6933099961784107
Epoch 1, Validation Average Accuracy: 0.6764427625354777, Validation Average Loss: 0.6401590467091428
Best model recorded at epoch 1
Epoch 2, Train Average Accuracy: 0.7984012831125827, Training Average Loss: 0.43885600846260786
Epoch 2, Validation Average Accuracy: 0.8370683538315988, Validation Average Loss: 0.37089154372201455
Best model recorded at epoch 2
Epoch 3, Train Average Accuracy: 0.8518211920529801, Training Average Loss: 0.34089778601161097
Epoch 3, Validation Average Accuracy: 0.8559307000946075, Validation Average Loss: 0.3300631187843862
Best model recorded at epoch 3
Epoch 4, Train Average Accuracy: 0.8877793874172185, Training Average Loss: 0.2723777720878912
Epoch 4, Validation Average Accuracy: 0.870727885525071, Validation Average Loss: 0.30752891174129005
Best model recorded at epoch 4
Epoch 5, Train Average Accuracy: 0.9160544288079471, Training Average Loss: 0.20748002

In [30]:
torch.save(model.state_dict(), os.path.join(models_dir, "solution_C.pt"))

# Model Evaluation
Here we have balanced data & both classes are equally important. Therefore it is best to look at the macro-averaged performance metrics. Below details the metrics of:
- Accuracy
- Loss
- Precision
  - Macro
  - Weighted Macro
- Recall
  - Macro
  - Weighted Macro
- F-Score
  - Macro
  - Weighted Macro
- MCC


In [45]:
#Load in the best saved model
model.load_state_dict(torch.load(os.path.join(models_dir, best_model_filename))) #Disabled loading in the best model so far. Found that the later model performs better

<All keys matched successfully>

In [46]:
#Get the predictions for all of the test cases
predicted_logits = []

for batch in test_dataloader:
  input_ids, attention_mask, labels = [part.to(device) for part in batch]

  with torch.no_grad():
    outputs = model(input_ids = input_ids,
                    attention_mask = attention_mask,
                    labels=labels)

  predicted_logits.extend(outputs.logits.detach().cpu())

In [47]:
test_metrics = evaluate(true_labels=np.array(dev_labels), predicted_logits=np.array(predicted_logits))
test_metrics.to_csv(os.path.join(results_dir, "roberta_metrics.csv"), index=False)
test_metrics.head()

Unnamed: 0,Accuracy,Precision,Macro Precision,Weighted Macro Precision,Recall,Macro Recall,Weighted Macro Recall,F1-Score,Macro F1-Score,Weighted Macro F1-Score,MCC,Loss
0,0.879917,0.879875,0.879919,0.879917,0.888729,0.879621,0.879917,0.88428,0.879746,0.879893,0.759539,1.751438
