In [1]:
cwk_dir ="drive/MyDrive/NLU Coursework/" #For running in Jack's Google Drive

# Setup

## Connect Google Drive Folder

In [2]:
from google.colab import drive
import os
import sys

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Imports

In [3]:
sys.path.append(cwk_dir)
from classes.evaluation import evaluate
from classes.preprocessing import load_data

In [36]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler,random_split
from transformers import BertForSequenceClassification, BertForPreTraining, BertTokenizer, BertModel, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from torch.nn import Linear, AvgPool2d, CrossEntropyLoss, Dropout, Tanh

import torch

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import typing
from PIL import Image
import json
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import string

## Primary Variables

Filepath variables

In [6]:
solution_dir = os.path.join(cwk_dir, "solution_C")
models_dir = os.path.join(solution_dir, "models")
results_dir = os.path.join(solution_dir, "results")

Training variables

In [7]:
INITIAL_LR: float = 2e-5
EPOCHS: int = 4
VALIDATION_SPLIT: float = 0.2
BATCH_SIZE: int = 16

BERT_ID: str = 'bert-base-uncased'
NUM_LABELS: int = 2

BERT Keys

In [8]:
INPUTS_IDS_KEY: str = "input_ids"
ATTENTION_MASK_KEY: str = "attention_mask"
TOKEN_TYPE_KEY: str = "token_type_ids"

Other

In [9]:
MAX_SEQ_LENGTH: int = 512 #None is the value to denote that there is no max length. Max length is recommended
VOCAB_SIZE: int = None #None is the value to denote that there is no vocab size yet. This is set later, once we have the training data
EMBEDDING_SIZE: int = None

## Functions

In [10]:
def tokenize_data(tokenizer: BertTokenizer, premises: typing.List[str], hypotheses: typing.List[str], maxlen: int) ->typing.Tuple[np.array, np.array]:
  """
  Uses the input tokenizer to tokenizer the premises & hypotheses together. Will padd/truncate the sequences of tokens correctly. Formats the sequences together of the format below

      sample = [CLS] Premise [SEP] Hypothesis [SEP]
  """
  return tokenizer(premises, hypotheses, max_length=maxlen, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

In [11]:
def get_accuracy(preds, labels) -> float:
  """
  Gets the accuracy between the predictions and labels. Returns this float
  """
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Data Preprocessing

## Load Data

In [12]:
(train_premises, train_hypotheses, train_labels), (dev_premises, dev_hypotheses, dev_labels) = load_data(cwk_dir)

In [13]:
train_labels = torch.tensor([int(x) for x in train_labels])
dev_labels = torch.tensor([int(x) for x in dev_labels])

## Tokenize Data

In [14]:
tokenizer = BertTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
train_data = tokenize_data(tokenizer=tokenizer, premises=list(train_premises), hypotheses=list(train_hypotheses), maxlen=MAX_SEQ_LENGTH)
dev_data = tokenize_data(tokenizer=tokenizer, premises=list(dev_premises), hypotheses=list(dev_hypotheses), maxlen=MAX_SEQ_LENGTH) #Dev is used for evaluation

Example of a sentence:

In [16]:
print(f"Sentence: {tokenizer.convert_ids_to_tokens(train_data[INPUTS_IDS_KEY][0])}")
print(f"Tokens: {train_data[INPUTS_IDS_KEY]}")

Sentence: ['[CLS]', 'however', ',', 'fort', 'charles', 'was', 'rebuilt', 'as', 'a', 'military', 'and', 'naval', 'garrison', ',', 'and', 'it', 'protected', 'jamaica', 'and', 'much', 'of', 'the', 'english', 'caribbean', 'for', '250', 'years', 'until', 'the', 'advent', 'of', 'steamship', '##s', 'and', 'yet', 'another', 'earthquake', 'in', '1907', 'saw', 'its', 'decline', '.', '[SEP]', 'fort', 'charles', 'was', 'rebuilt', 'as', 'an', 'amusement', 'park', 'for', 'the', 'locals', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [17]:
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 30522


## Format Dataset & Dataloader

In [18]:
dataset = TensorDataset(train_data[INPUTS_IDS_KEY], train_data[ATTENTION_MASK_KEY], train_data[TOKEN_TYPE_KEY], train_labels)
test_dataset = TensorDataset(dev_data[INPUTS_IDS_KEY], dev_data[ATTENTION_MASK_KEY], dev_data[TOKEN_TYPE_KEY], dev_labels) #note here that the dev dataset is used for testing (evaluation) later

In [19]:
train_dataset, val_dataset = random_split(dataset, [(1 - VALIDATION_SPLIT), VALIDATION_SPLIT])

In [20]:
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

# Model Training

## Model Architecture
Inspiration:
- https://arxiv.org/pdf/2105.03791.pdf
- https://aclanthology.org/D15-1075.pdf

In [37]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [38]:
class BERT_NLI(torch.nn.Module):
  """
  Class for extended BERT models
  """
  def __init__(self, bert_model: BertForSequenceClassification, output_dim: int, hidden_dim: int = 100, dropout: float = 0.2):
    super().__init__()
    self.bert = bert_model
    embedding_dim = bert_model.config.to_dict()['hidden_size']

    self.hidden_linear = Linear(embedding_dim, hidden_dim)
    self.tanh = Tanh()
    self.dropout = Dropout(p=dropout)
    self.out = Linear(hidden_dim, output_dim)

  def forward(self, input_ids, attention_mask, token_type_ids):
    #Embed with BERT
    embedded = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]

    #Pass on to further layers
    x = self.dropout(embedded)
    x = self.hidden_linear(x)
    x = self.tanh(x)
    x = self.dropout(x)
    output = self.out(x)
    return output

In [39]:
bert_model = BertModel.from_pretrained(BERT_ID)
model = BERT_NLI(bert_model=bert_model, output_dim=NUM_LABELS, hidden_dim = 100)
model = model.to(device)
model

BERT_NLI(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

## Learning Rate

In [40]:
EPOCHS = 6

In [41]:
OPTIM = torch.optim.AdamW(model.parameters(), lr=INITIAL_LR)
# SCHEDULER = get_constant_schedule_with_warmup(OPTIM, num_warmup_steps = int((len(train_dataset)//BATCH_SIZE) * EPOCHS))
SCHEDULER = get_linear_schedule_with_warmup(OPTIM, num_warmup_steps = int((len(train_dataset)//BATCH_SIZE) * 2), num_training_steps = int((len(train_dataset)//BATCH_SIZE) * EPOCHS))

## Model Training

In [42]:
#Loss metric
loss_function = CrossEntropyLoss().to(device)

In [43]:
patience = 2
min_delta = 0.05
best_val_loss = 1000
current_patience = 0

best_model_filename: str = "best_solution_C.pt"

In [44]:
for epoch in range(EPOCHS):
  ## Training
  model.train()
  total_loss = 0
  total_accuracy = 0
  for batch in train_dataloader:
    OPTIM.zero_grad()

    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

    loss = loss_function(outputs, labels.squeeze())
    total_loss += loss.item()
    loss.backward()

    total_accuracy += get_accuracy(outputs.detach().cpu().numpy(), labels.to('cpu').numpy())

    OPTIM.step()
    SCHEDULER.step()

  avg_train_loss = total_loss / len(train_dataloader)
  avg_train_accuracy = total_accuracy / len(train_dataloader)
  print(f"Epoch {epoch+1}, Train Average Accuracy: {avg_train_accuracy}, Training Average Loss: {avg_train_loss}")

  ##Validation
  model.eval()
  total_val_accuracy = 0
  total_val_loss = 0

  for batch in val_dataloader:
    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    with torch.no_grad():
      outputs = model(input_ids = input_ids,
                      attention_mask = attention_mask,
                      token_type_ids=token_type_ids)

    loss = loss_function(outputs, labels.squeeze())
    total_val_loss  += loss.item()

    total_val_accuracy += get_accuracy(outputs.detach().cpu().numpy(), labels.to('cpu').numpy())

  avg_val_accuracy = total_val_accuracy / len(val_dataloader)
  avg_val_loss = total_val_loss / len(val_dataloader)

  print(f"Epoch {epoch+1}, Validation Average Accuracy: {avg_val_accuracy}, Validation Average Loss: {avg_val_loss}")

  #Early stopping
  if avg_val_loss < (best_val_loss - min_delta):
    best_val_loss = avg_val_loss
    current_patience = 0

    #Save the best model so far
    torch.save(model.state_dict(), os.path.join(models_dir, best_model_filename))
    print(f"Best model recorded at epoch {epoch+1}")
  else:
    current_patience += 1
    if current_patience >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

Epoch 1, Train Average Accuracy: 0.6249072700296736, Training Average Loss: 0.6171493630093531
Epoch 1, Validation Average Accuracy: 0.8034124629080118, Validation Average Loss: 0.4436585069675474
Best model recorded at epoch 1
Epoch 2, Train Average Accuracy: 0.8318341988130564, Training Average Loss: 0.38674304228640805
Epoch 2, Validation Average Accuracy: 0.8275222551928784, Validation Average Loss: 0.3788495776561318
Best model recorded at epoch 2
Epoch 3, Train Average Accuracy: 0.921411350148368, Training Average Loss: 0.209876539520035
Epoch 3, Validation Average Accuracy: 0.8449554896142433, Validation Average Loss: 0.4355736363330961
Epoch 4, Train Average Accuracy: 0.9742210682492581, Training Average Loss: 0.07968740076764877
Epoch 4, Validation Average Accuracy: 0.8417408506429278, Validation Average Loss: 0.5385105169201353
Early stopping at epoch 4


In [45]:
torch.save(model.state_dict(), os.path.join(models_dir, "solution_C.pt"))

# Model Evaluation
Here we have balanced data & both classes are equally important. Therefore it is best to look at the macro-averaged performance metrics. Below details the metrics of:
- Accuracy
- Loss
- Precision
  - Macro
  - Weighted Macro
- Recall
  - Macro
  - Weighted Macro
- F-Score
  - Macro
  - Weighted Macro
- MCC


In [46]:
best_model = torch.load(os.path.join(models_dir, best_model_filename))

In [47]:
#Get the predictions for all of the test cases
predicted_labels = []

for batch in test_dataloader:
  input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

  with torch.no_grad():
    outputs = model(input_ids = input_ids,
                    attention_mask = attention_mask,
                    token_type_ids=token_type_ids)

  predicted_labels.extend(torch.argmax(outputs.detach().cpu(), dim=1).numpy())

In [48]:
test_metrics = evaluate(true_labels=np.array(dev_labels), predicted_labels=np.array(predicted_labels))
test_metrics.to_csv(os.path.join(results_dir, "metrics.csv"), index=False)
test_metrics.head()

Unnamed: 0,Accuracy,Precision,Macro Precision,Weighted Macro Precision,Recall,Macro Recall,Weighted Macro Recall,F1-Score,Macro F1-Score,Weighted Macro F1-Score,MCC,Loss
0,0.838949,0.84215,0.838822,0.83893,0.846751,0.838687,0.838949,0.844444,0.838748,0.838933,0.677509,5.804863
