In [1]:
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,garbage_collection_threshold:0.8

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,garbage_collection_threshold:0.8


# Setup

## Connect Google Drive Folder

In [2]:
from google.colab import drive
import os
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Imports

In [3]:
from tensorflow.keras import utils
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, Add, Layer, LSTM, Bidirectional, Embedding, concatenate, BatchNormalization, SimpleRNN, Attention, GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D, MaxPooling1D, TimeDistributed
from tensorflow.keras.optimizers.legacy import SGD, Adam, RMSprop
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.metrics import SparseCategoricalAccuracy as Acc
from tensorflow.keras.metrics import SparseTopKCategoricalAccuracy as KAcc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

import keras

In [4]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler,random_split
import torch
from transformers import BertForSequenceClassification, BertForPreTraining

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import typing
from PIL import Image
import json
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import string

In [6]:
from transformers import BertTokenizer, BertModel

## Primary Variables

Filepath variables

In [7]:
cwk_dir ="drive/MyDrive/NLU Coursework/"

data_dir = os.path.join(cwk_dir, "data")

solution_dir = os.path.join(cwk_dir, "solution C")
models_dir = os.path.join(solution_dir, "models")
results_dir = os.path.join(solution_dir, "results")

In [8]:
TRAIN_FILEPATH: str = os.path.join(data_dir, "training_data/training_data/NLI")
TRAIN_DATASET: str = os.path.join(TRAIN_FILEPATH, "train.csv")
DEV_DATASET: str = os.path.join(TRAIN_FILEPATH, "dev.csv")

TRIAL_FILEPATH: str = os.path.join(data_dir, "trial_data/trial_data")
TRIAL_DATASET: str = os.path.join(TRIAL_FILEPATH, "NLI_trial.csv")

Label variables

In [9]:
PREMISE_KEY: str = "premise"
HYPOTHESIS_KEY: str = "hypothesis"
LABEL_KEY: str = "label"

Preprocessing variables

In [10]:
LOWER: bool = True
PADDING: str = "post"

nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training variables

In [11]:
INITIAL_LR: float = 2e-5
EPOCHS: int = 4
VALIDATION_SPLIT: float = 0.2
BATCH_SIZE: int = 16

DROPOUT: float = 0.25

# OPTIMIZER = RMSprop(INITIAL_LR)
OPTIMIZER = "adam"

BERT_ID: str = 'bert-base-uncased'
NUM_LABELS: int = 2

Other

In [12]:
MAX_SEQ_LENGTH: int = 512 #None is the value to denote that there is no max length. Max length is recommended
VOCAB_SIZE: int = None #None is the value to denote that there is no vocab size yet. This is set later, once we have the training data
EMBEDDING_SIZE: int = None

## Functions

In [13]:
def load_data_csv(filepath: str) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[int]]:
  """
  Will load in data from the filepath specified. Expects the string filepath to a csv file. Returns tuple of the premises, hypotheses and labels
  """
  dataset = pd.read_csv(filepath).to_dict()
  premises = list(map(str, dataset[PREMISE_KEY].values()))
  hypotheses = list(map(str, dataset[HYPOTHESIS_KEY].values()))
  labels = list(map(int, dataset[LABEL_KEY].values()))
  return (premises, hypotheses,labels)

In [14]:
def show_model_architecture(model: Model, filename: str) -> None:
  """
  Takes a model architecture and will a diagram of the architecture. Saves this diagram also, to the filename specified
  """
  model.summary()
  filename = os.path.join(results_dir, filename)
  plot_model(model, to_file=filename)
  img = Image.open(filename)
  fig, ax = plt.subplots(figsize=(15, 15))
  plt.imshow(img, aspect='equal')

In [15]:
def tokenize_data(tokenizer: BertTokenizer, premises: typing.List[str], hypotheses: typing.List[str]) ->typing.Tuple[np.array, np.array]:
  """
  Uses the input tokenizer to tokenizer the premises & hypotheses together. Will padd/truncate the sequences of tokens correctly. Formats the sequences together of the format below

      sample = [CLS] Premise [SEP] Hypothesis [SEP]
  """
  return tokenizer(premises, hypotheses, max_length=MAX_SEQ_LENGTH, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

In [16]:
def clean_sentences(sentences: typing.List[str]) -> typing.List[str]:
  """
  Takes a list of sentences & cleans them. Remove stopwords, reduces to lower case, removes punctuation
  """
  sentences = [[word.lower().translate(str.maketrans("","",string.punctuation)) for word in sentence.split(" ") if word not in STOP_WORDS] for sentence in sentences]
  sentences = np.array([" ".join(sentence) for sentence in sentences])
  return sentences


In [17]:
def plot_history(history, export_path, legend: typing.List[str] = ['train', 'test']):
  """
  Will plot the history of a model, labelling it appropriately
  """
  plt.figure(figsize=(20, 10))
  plt.subplot(1, 2, 1)

  plt.plot(history.history['accuracy'])
  plt.plot(history.history['val_accuracy'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(legend, loc='upper left')

  plt.subplot(1, 2, 2)
  plt.plot(history.history['loss'])
  plt.plot(history.history['val_loss'])
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(legend, loc='upper left')

  plt.savefig(os.path.join(results_dir, export_path))

  plt.show()


In [18]:
def get_accuracy(preds, labels) -> float:
  """
  Gets the accuracy between the predictions and labels. Returns this float
  """
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Data Preprocessing

## Load Data

In [19]:
train_premises, train_hypotheses, train_labels = load_data_csv(filepath=TRAIN_DATASET)
dev_premises, dev_hypotheses, dev_labels = load_data_csv(filepath=DEV_DATASET)

In [20]:
train_labels = torch.reshape(torch.tensor(train_labels), (len(train_labels),1))
dev_labels = torch.reshape(torch.tensor(dev_labels), (len(dev_labels),1))

In [21]:
#Removes stop words, reduces to lower case and will shorten to the max length
train_premises = clean_sentences(sentences=train_premises)
train_hypotheses = clean_sentences(sentences=train_hypotheses)

dev_premises = clean_sentences(sentences=dev_premises)
dev_hypotheses = clean_sentences(sentences=dev_hypotheses)

## Tokenize Data

In [22]:
tokenizer = BertTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [23]:
train_data = tokenize_data(tokenizer=tokenizer, premises=list(train_premises), hypotheses=list(train_hypotheses))
dev_data = tokenize_data(tokenizer=tokenizer, premises=list(dev_premises), hypotheses=list(dev_hypotheses))

Example of a sentence:

In [24]:
print(f"Sentence: {tokenizer.convert_ids_to_tokens(train_data['input_ids'][0])}")
print(f"Tokens: {train_data['input_ids']}")

Sentence: ['[CLS]', 'however', 'fort', 'charles', 'rebuilt', 'military', 'naval', 'garrison', 'protected', 'jamaica', 'much', 'english', 'caribbean', '250', 'years', 'advent', 'steamship', '##s', 'yet', 'another', 'earthquake', '1907', 'saw', 'decline', '[SEP]', 'fort', 'charles', 'rebuilt', 'amusement', 'park', 'locals', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [25]:
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 30522


## Format Dataset & Data Split

In [26]:
dataset = TensorDataset(train_data["input_ids"], train_data["attention_mask"], train_data["token_type_ids"], train_labels)
test_dataset = TensorDataset(dev_data["input_ids"], dev_data["attention_mask"], dev_data["token_type_ids"], dev_labels)

In [27]:
train_dataset, val_dataset = random_split(dataset, [(1 - VALIDATION_SPLIT), VALIDATION_SPLIT])

# Model Training

In [28]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Dataloader

In [29]:
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

## Model Architecture

In [30]:
model = BertForSequenceClassification.from_pretrained(BERT_ID, num_labels = 2, output_attentions = False, output_hidden_states = False)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# class BERTNLIModel(torch.nn.Module):
#   def __init__(self, bert_model, output_dim):
#     super().__init__()
#     self.bert = bert_model
#     embedding_dim = bert_model.config.to_dict()['hidden_size']
#     self.out = torch.nn.Linear(embedding_dim, output_dim)
#   def forward(self, input_ids, attention_mask, token_type_ids):
#     embedded = self.bert(input_ids = input_ids, attention_mask =
#                       attention_mask, token_type_ids = token_type_ids)[1]
#     output = self.out(embedded)
#     return output
# model = BERTNLIModel(bert_model=bert_model, output_dim=NUM_LABELS)
# model = model.to(device)

## Learning Rate

In [32]:
optimizer = torch.optim.AdamW(model.parameters(), lr=INITIAL_LR)

In [33]:
# criterion = torch.nn.BCEWithLogitsLoss()
# criterion = torch.nn.CrossEntropyLoss().to(device)

## Model Training

In [34]:
for epoch in range(EPOCHS):
  ## Training
  model.train()
  total_loss = 0
  total_accuracy = 0
  for batch in train_dataloader:
    optimizer.zero_grad()

    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels)

    loss = outputs.loss
    total_loss += loss.item()
    loss.backward()

    total_accuracy += get_accuracy(outputs.logits.detach().cpu().numpy(), labels.to('cpu').numpy())

    optimizer.step()

  avg_train_loss = total_loss / len(train_dataloader)
  avg_train_accuracy = total_accuracy / len(train_dataloader)
  print(f"Epoch {epoch+1}, Train Average Accuracy: {avg_train_accuracy}, Training Average Loss: {avg_train_loss}")

  ##Validation
  model.eval()
  total_val_accuracy = 0
  best_val_accuracy = 0
  total_val_loss = 0

  for batch in val_dataloader:
    input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

    with torch.no_grad():
      output = model(input_ids = input_ids,
                      attention_mask = attention_mask,
                      token_type_ids=token_type_ids,
                      labels=labels)

    loss = output.loss
    total_val_loss  += loss.item()

    total_val_accuracy += get_accuracy(output.logits.detach().cpu().numpy(), labels.to('cpu').numpy())

  avg_val_accuracy = total_val_accuracy / len(val_dataloader)
  avg_val_loss = total_val_loss / len(val_dataloader)
  print(f"Epoch {epoch+1}, Validation Average Accuracy: {avg_val_accuracy}, Validation Average Loss: {avg_val_loss}")

Epoch 1, Train Average Accuracy: 0.6658475519287834, Training Average Loss: 0.6001692865629755
Epoch 1, Validation Average Accuracy: 0.7385633036597429, Validation Average Loss: 0.5228771577514597
Epoch 2, Train Average Accuracy: 0.8179247032640949, Training Average Loss: 0.4158046602517455
Epoch 2, Validation Average Accuracy: 0.7438798219584569, Validation Average Loss: 0.5291005850766816
Epoch 3, Train Average Accuracy: 0.916682121661721, Training Average Loss: 0.2193283186368755
Epoch 3, Validation Average Accuracy: 0.7485163204747775, Validation Average Loss: 0.6654952963266952
Epoch 4, Train Average Accuracy: 0.9656899109792285, Training Average Loss: 0.09807763361193804
Epoch 4, Validation Average Accuracy: 0.7469090009891197, Validation Average Loss: 0.8864083666940294


In [35]:
torch.save(model.state_dict(), os.path.join(models_dir, "solution_C.pt"))

## Model Evaluation

In [37]:
total_test_accuracy = 0
total_test_loss = 0
best_test_accuracy = 0

for batch in test_dataloader:
  input_ids, attention_mask, token_type_ids, labels = [part.to(device) for part in batch]

  with torch.no_grad():
    output = model(input_ids = input_ids,
                    attention_mask = attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels)

  loss = output.loss
  total_test_loss  += loss.item()

  total_test_accuracy += get_accuracy(output.logits.detach().cpu().numpy(), labels.to('cpu').numpy())


avg_test_accuracy = total_test_accuracy / len(test_dataloader)
avg_test_loss = total_val_loss / len(test_dataloader)
print(f"Test Average Accuracy: {avg_test_accuracy}")
print(f"Test Average Loss: {avg_test_loss}")

Test Average Accuracy: 0.757553317535545
Test Average Loss: 0.7078663970992605
