In [42]:
from google.colab import drive
import os
import sys
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [43]:
cwk_dir ="drive/MyDrive/NLU Coursework/" #For running in Jack's Google Drive

logits_path = os.path.join(cwk_dir, "demo", "solution_C_predicted_logits.csv") #Path to evaluation logits
model_filepath = os.path.join(cwk_dir, "solution_C/models/roberta/best_solution_C.pt") #Path to the model to be demoed

dataset_path = os.path.join(cwk_dir, "data/training_data/training_data/NLI/dev.csv") #Path to the dataset used for evaluation
labels_path = os.path.join(cwk_dir, "demo", "Group_17_C.csv") #Path to output the predicted labels

# Setup

## Imports

In [44]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader

In [45]:
import numpy as np
import typing
import json
from random import randint
import typing
import pandas as pd
import os
from dataclasses import dataclass
import random
from nltk.corpus import wordnet, stopwords
from itertools import chain
import nltk

## Functions

In [46]:
@dataclass(frozen=True)
class GeneralKeys:
    """
    Dataclass for general keys for the process
    """

    PREMISE_KEY: str = "Premise"
    HYPOTHESIS_KEY: str = "Hypothesis"
    LABEL_KEY: str = "Label"
    LOSS_KEY: str = "Loss"
    PREDICTED_KEY: str = "Predicted Label"
    TRUE_KEY: str = "True Label"

@dataclass(frozen=True)
class DatasetKeys:
    """
    Dataclass associated with keys for the data csvs
    """

    PREMISE_KEY: str = GeneralKeys.PREMISE_KEY.lower()
    HYPOTHESIS_KEY: str = GeneralKeys.HYPOTHESIS_KEY.lower()
    LABEL_KEY: str = GeneralKeys.LABEL_KEY.lower()

def load_data_csv(
    filepath: str,
) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[int]]:
    """
    Will load in data from the csv filepath specified. Expects the string filepath to a csv file. Returns tuple of the premises, hypotheses and labels
    """
    dataset = pd.read_csv(filepath)
    premises = dataset[DatasetKeys.PREMISE_KEY].astype(str).tolist()
    hypotheses = dataset[DatasetKeys.HYPOTHESIS_KEY].astype(str).tolist()
    labels = dataset[DatasetKeys.LABEL_KEY].astype(str).tolist()
    return (premises, hypotheses, labels)

In [47]:
def tokenize_data(tokenizer: RobertaTokenizer, premises: typing.List[str], hypotheses: typing.List[str], maxlen: int) ->typing.Tuple[np.array, np.array]:
  """
  Uses the input tokenizer to tokenizer the premises & hypotheses together. Will padd/truncate the sequences of tokens correctly. Formats the sequences together of the format below

      sample = [CLS] Premise [SEP] Hypothesis [SEP]
  """
  return tokenizer(premises, hypotheses, max_length=maxlen, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

## Variables

In [48]:
INPUTS_IDS_KEY: str = "input_ids"
ATTENTION_MASK_KEY: str = "attention_mask"
TOKEN_TYPE_KEY: str = "token_type_ids"

In [49]:
NUM_LABELS = 2

# Data Preprocessing

## Load Data

In [50]:
premises, hypotheses, labels = load_data_csv(filepath=dataset_path)

In [51]:
labels = torch.tensor([int(x) for x in labels])

### Example

In [52]:
index = randint(0, len(premises))
print(f"Premise: {premises[index]}")
print(f"Hypothesis: {hypotheses[index]}")
print(f"Label: {labels[index]}")

Premise: Louis XIII had hoped to make his favorite hunting lodge a modest retirement home.
Hypothesis: Louis XIII did not retire to his favorite hunting lodge.
Label: 1


## Tokenize Data

In [53]:
BERT_ID: str = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

In [54]:
demo_data = tokenize_data(tokenizer=tokenizer, premises=list(premises), hypotheses=list(hypotheses), maxlen=512)

In [55]:
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 50265


### Example

In [56]:
print(f"Sentence: {tokenizer.convert_ids_to_tokens(demo_data[INPUTS_IDS_KEY][0])}")
print(f"Tokens: {demo_data[INPUTS_IDS_KEY]}")

Sentence: ['<s>', 'Mon', 'ĠDie', 'u', '!', 'Ġ', '</s>', '</s>', 'This', 'Ġperson', 'Ġis', 'Ġspeaking', 'ĠEnglish', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

## Format Dataset & Dataloader

In [57]:
dataset = TensorDataset(demo_data[INPUTS_IDS_KEY], demo_data[ATTENTION_MASK_KEY], labels) #note here that the dev dataset is used for testing (evaluation) later
dataloader = DataLoader(dataset, batch_size = 16)

# Demo


## Model Architecture

In [58]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [59]:
model = RobertaForSequenceClassification.from_pretrained(BERT_ID, num_labels=NUM_LABELS)

model = model.to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [60]:
model.load_state_dict(torch.load(model_filepath))

<All keys matched successfully>

## Get Predictions

In [61]:
#Get the predictions for all of the test cases
predicted_logits = []

for batch in dataloader:
  input_ids, attention_mask, labels = [part.to(device) for part in batch]

  with torch.no_grad():
    outputs = model(input_ids = input_ids,
                    attention_mask = attention_mask,
                    labels=labels)

  predicted_logits.extend(outputs.logits.detach().cpu())

In [62]:
predicted_labels = np.argmax(predicted_logits, axis=1)

In [63]:
output_logits = pd.DataFrame(
    np.array(predicted_logits),
)
output_logits.to_csv(logits_path, index=False)
output_logits

Unnamed: 0,0,1
0,-2.542238,2.042342
1,2.397689,-2.344449
2,0.664036,-0.578207
3,-0.770370,0.579794
4,-2.549602,2.461269
...,...,...
6732,1.500810,-1.194499
6733,1.350769,-1.196378
6734,-3.167675,3.005512
6735,2.819711,-2.776475


In [64]:
output_labels = pd.DataFrame(
    predicted_labels,
    columns=["prediction"]
)
output_labels.to_csv(labels_path, index=False)
output_labels

Unnamed: 0,prediction
0,1
1,0
2,0
3,1
4,1
...,...
6732,0
6733,0
6734,1
6735,0
