# Setup

## Connect Google Drive Folder

In [1]:
from google.colab import drive
import os
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Imports

In [80]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler,random_split
from transformers import BertForSequenceClassification, BertForPreTraining, BertTokenizer, BertModel
from torch.nn import Linear, AvgPool2d, CrossEntropyLoss

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, precision_recall_fscore_support

import torch

In [72]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import typing
from PIL import Image
import json
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import string
import pandas as pd
from dataclasses import dataclass

## Primary Variables

Filepath variables

In [36]:
cwk_dir ="drive/MyDrive/NLU Coursework/"

data_dir = os.path.join(cwk_dir, "data")

solution_dir = os.path.join(cwk_dir, "solution_C")
models_dir = os.path.join(solution_dir, "models")
results_dir = os.path.join(solution_dir, "results")

In [6]:
TRAIN_FILEPATH: str = os.path.join(data_dir, "training_data/training_data/NLI")
TRAIN_DATASET: str = os.path.join(TRAIN_FILEPATH, "train.csv")
DEV_DATASET: str = os.path.join(TRAIN_FILEPATH, "dev.csv")

TRIAL_FILEPATH: str = os.path.join(data_dir, "trial_data/trial_data")
TRIAL_DATASET: str = os.path.join(TRIAL_FILEPATH, "NLI_trial.csv")

Label variables

In [7]:
PREMISE_KEY: str = "premise"
HYPOTHESIS_KEY: str = "hypothesis"
LABEL_KEY: str = "label"

Training variables

In [9]:
INITIAL_LR: float = 2e-5
EPOCHS: int = 4
VALIDATION_SPLIT: float = 0.2
BATCH_SIZE: int = 16

BERT_ID: str = 'bert-base-uncased'
NUM_LABELS: int = 2

BERT Keys

In [10]:
INPUTS_IDS_KEY: str = "input_ids"
ATTENTION_MASK_KEY: str = "attention_mask"
TOKEN_TYPE_KEY: str = "token_type_ids"

Other

In [11]:
MAX_SEQ_LENGTH: int = 512 #None is the value to denote that there is no max length. Max length is recommended
VOCAB_SIZE: int = None #None is the value to denote that there is no vocab size yet. This is set later, once we have the training data
EMBEDDING_SIZE: int = None

## Functions

In [12]:
def load_data_csv(filepath: str) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[int]]:
  """
  Will load in data from the filepath specified. Expects the string filepath to a csv file. Returns tuple of the premises, hypotheses and labels
  """
  dataset = pd.read_csv(filepath).to_dict()
  premises = list(map(str, dataset[PREMISE_KEY].values()))
  hypotheses = list(map(str, dataset[HYPOTHESIS_KEY].values()))
  labels = list(map(int, dataset[LABEL_KEY].values()))
  return (premises, hypotheses,labels)

In [13]:
def tokenize_data(tokenizer: BertTokenizer, premises: typing.List[str], hypotheses: typing.List[str], maxlen: int) ->typing.Tuple[np.array, np.array]:
  """
  Uses the input tokenizer to tokenizer the premises & hypotheses together. Will padd/truncate the sequences of tokens correctly. Formats the sequences together of the format below

      sample = [CLS] Premise [SEP] Hypothesis [SEP]
  """
  return tokenizer(premises, hypotheses, max_length=maxlen, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

In [85]:
class MacroMetric:
  """
  Dataclass for metrics that can be turned into macro & weighted macro
  """
  NORMAL_KEY: str
  MACRO_KEY: str
  WEIGHTED_KEY: str

  def __init__(self, key: str):
    self.NORMAL_KEY: str = key
    self.MACRO_KEY: str = f"Macro {key}"
    self.WEIGHTED_KEY: str = f"Weighted Macro {key}"

@dataclass(frozen=True)
class MetricKeys:
  """
  Dataclass associated with keys for the evaluation metrics
  """
  ACCURACY_KEY: str = "Accuracy"
  PRECISION: MacroMetric = MacroMetric(key="Precision")
  F1: MacroMetric = MacroMetric(key="F1-Score")
  RECALL: MacroMetric = MacroMetric(key="Recall")
  MCC_KEY: str = "MCC"

def get_metrics(true_labels: np.array, predicted_labels: np.array) -> pd.DataFrame:
  """
  Uses the true and predicted labels & sklearn to create extensive evaluation metrics. Formats into a dataframe that it displays & returns
  """
  accuracy = accuracy_score(true_labels, predicted_labels)

  precision, recall, f1, support = precision_recall_fscore_support(true_labels, predicted_labels)

  weighted_precision = np.average(precision, weights=support)
  weighted_recall = np.average(recall, weights=support)
  weighted_f1 = np.average(f1, weights=support)

  precision = precision_score(true_labels, predicted_labels)
  recall = recall_score(true_labels, predicted_labels)
  f1 = f1_score(true_labels, predicted_labels)

  macro_precision = precision_score(true_labels, predicted_labels, average='macro')
  macro_recall = recall_score(true_labels, predicted_labels, average='macro')
  macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

  mcc = matthews_corrcoef(true_labels, predicted_labels)

  #Format into dataframe for easier viewing
  df = pd.DataFrame([[accuracy,
                      precision, macro_precision, weighted_precision,
                      recall, macro_recall, weighted_recall,
                      f1, macro_f1, weighted_f1,
                      mcc]],
                    columns=[MetricKeys.ACCURACY_KEY,
                             MetricKeys.PRECISION.NORMAL_KEY, MetricKeys.PRECISION.MACRO_KEY, MetricKeys.PRECISION.WEIGHTED_KEY,
                             MetricKeys.RECALL.NORMAL_KEY, MetricKeys.RECALL.MACRO_KEY, MetricKeys.RECALL.WEIGHTED_KEY,
                             MetricKeys.F1.NORMAL_KEY, MetricKeys.F1.MACRO_KEY, MetricKeys.F1.WEIGHTED_KEY,
                             MetricKeys.MCC_KEY])
  return df

# Data Preprocessing

## Load Data

In [17]:
train_premises, train_hypotheses, train_labels = load_data_csv(filepath=TRAIN_DATASET)
dev_premises, dev_hypotheses, dev_labels = load_data_csv(filepath=DEV_DATASET)

In [18]:
train_labels = torch.tensor(train_labels)
dev_labels = torch.tensor(dev_labels)

## Tokenize Data

In [20]:
tokenizer = BertTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [21]:
train_data = tokenize_data(tokenizer=tokenizer, premises=list(train_premises), hypotheses=list(train_hypotheses), maxlen=MAX_SEQ_LENGTH)
dev_data = tokenize_data(tokenizer=tokenizer, premises=list(dev_premises), hypotheses=list(dev_hypotheses), maxlen=MAX_SEQ_LENGTH) #Dev is used for evaluation

Example of a sentence:

In [22]:
print(f"Sentence: {tokenizer.convert_ids_to_tokens(train_data[INPUTS_IDS_KEY][0])}")
print(f"Tokens: {train_data[INPUTS_IDS_KEY]}")

Sentence: ['[CLS]', 'however', ',', 'fort', 'charles', 'was', 'rebuilt', 'as', 'a', 'military', 'and', 'naval', 'garrison', ',', 'and', 'it', 'protected', 'jamaica', 'and', 'much', 'of', 'the', 'english', 'caribbean', 'for', '250', 'years', 'until', 'the', 'advent', 'of', 'steamship', '##s', 'and', 'yet', 'another', 'earthquake', 'in', '1907', 'saw', 'its', 'decline', '.', '[SEP]', 'fort', 'charles', 'was', 'rebuilt', 'as', 'an', 'amusement', 'park', 'for', 'the', 'locals', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [23]:
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 30522


## Format Dataset & Dataloader

In [24]:
dataset = TensorDataset(train_data[INPUTS_IDS_KEY], train_data[ATTENTION_MASK_KEY], train_data[TOKEN_TYPE_KEY], train_labels)
test_dataset = TensorDataset(dev_data[INPUTS_IDS_KEY], dev_data[ATTENTION_MASK_KEY], dev_data[TOKEN_TYPE_KEY], dev_labels) #note here that the dev dataset is used for testing (evaluation) later

In [25]:
train_dataset, val_dataset = random_split(dataset, [(1 - VALIDATION_SPLIT), VALIDATION_SPLIT])

In [26]:
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

# Model Training

## Model Architecture

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [28]:
class BERT_NLI(torch.nn.Module):
  """
  Class for extended BERT models
  """
  def __init__(self, bert_model: BertForSequenceClassification, output_dim: int, hidden_dim: int = 100):
    super().__init__()
    self.bert = bert_model
    embedding_dim = bert_model.config.to_dict()['hidden_size']

    self.hidden_linear = Linear(embedding_dim, hidden_dim)
    self.out = Linear(hidden_dim, output_dim)

  def forward(self, input_ids, attention_mask, token_type_ids):
    #Embed with BERT
    embedded = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]

    #Pass on to further layers
    x = self.hidden_linear(embedded)
    output = self.out(x)
    return output

In [29]:
bert_model = BertModel.from_pretrained(BERT_ID)
model = BERT_NLI(bert_model=bert_model, output_dim=NUM_LABELS)
model = model.to(device)
model

BERT_NLI(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

## Learning Rate

In [30]:
OPTIM = torch.optim.AdamW(model.parameters(), lr=INITIAL_LR)

## Model Training

In [31]:
#Loss metric
loss_function = CrossEntropyLoss().to(device)

In [33]:
for epoch in range(EPOCHS):
  ## Training
  model.train()
  total_loss = 0
  total_accuracy = 0
  for batch in train_dataloader:
    OPTIM.zero_grad()

    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

    loss = loss_function(outputs, labels.squeeze())
    total_loss += loss.item()
    loss.backward()

    total_accuracy += get_accuracy(outputs.detach().cpu().numpy(), labels.to('cpu').numpy())

    OPTIM.step()

  avg_train_loss = total_loss / len(train_dataloader)
  avg_train_accuracy = total_accuracy / len(train_dataloader)
  print(f"Epoch {epoch+1}, Train Average Accuracy: {avg_train_accuracy}, Training Average Loss: {avg_train_loss}")

  ##Validation
  model.eval()
  total_val_accuracy = 0
  best_val_accuracy = 0
  total_val_loss = 0

  for batch in val_dataloader:
    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    with torch.no_grad():
      outputs = model(input_ids = input_ids,
                      attention_mask = attention_mask,
                      token_type_ids=token_type_ids)

    loss = loss_function(outputs, labels.squeeze())
    total_val_loss  += loss.item()

    total_val_accuracy += get_accuracy(outputs.detach().cpu().numpy(), labels.to('cpu').numpy())

  avg_val_accuracy = total_val_accuracy / len(val_dataloader)
  avg_val_loss = total_val_loss / len(val_dataloader)
  print(f"Epoch {epoch+1}, Validation Average Accuracy: {avg_val_accuracy}, Validation Average Loss: {avg_val_loss}")

Epoch 1, Train Average Accuracy: 0.7535701038575667, Training Average Loss: 0.4871030089940091
Epoch 1, Validation Average Accuracy: 0.8286350148367952, Validation Average Loss: 0.3840154116249226
Epoch 2, Train Average Accuracy: 0.8952151335311572, Training Average Loss: 0.26323975030697594
Epoch 2, Validation Average Accuracy: 0.8379080118694362, Validation Average Loss: 0.39922536002024345
Epoch 3, Train Average Accuracy: 0.9610070474777448, Training Average Loss: 0.11089881508385221
Epoch 3, Validation Average Accuracy: 0.8403808110781404, Validation Average Loss: 0.4901121566417552
Epoch 4, Train Average Accuracy: 0.9796921364985163, Training Average Loss: 0.05999692285494555
Epoch 4, Validation Average Accuracy: 0.8350642927794264, Validation Average Loss: 0.6275315359773705


In [37]:
torch.save(model.state_dict(), os.path.join(models_dir, "solution_C.pt"))

# Model Evaluation
Here we have balanced data & both classes are equally important. Therefore it is best to look at the macro-averaged performance metrics. Below details the metrics of:
- Accuracy
- Loss
- Precision
  - Macro
  - Weighted Macro
- Recall
  - Macro
  - Weighted Macro
- F-Score
  - Macro
  - Weighted Macro
- MCC


In [52]:
#Get the predictions for all of the test cases
predicted_labels = []
total_test_loss = 0

for batch in test_dataloader:
  input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

  with torch.no_grad():
    outputs = model(input_ids = input_ids,
                    attention_mask = attention_mask,
                    token_type_ids=token_type_ids)

  loss = loss_function(outputs, labels)
  total_test_loss  += loss.item()

  predicted_labels.extend(torch.argmax(outputs.detach().cpu(), dim=1).numpy())

In [87]:
#Loss
avg_test_loss = total_val_loss / len(test_dataloader)
print(f"Average Test loss: {avg_test_loss}")

#Other metrics
test_metrics = get_metrics(true_labels=np.array(dev_labels), predicted_labels=np.array(predicted_labels))
test_metrics.head()
test_metrics.to_csv(os.path.join(results_dir, "metrics.csv"), index=False)

Average Test loss: 0.5011330038492272


OSError: Cannot save file into a non-existent directory: 'drive/MyDrive/NLU Coursework/solution_C/results'