In [1]:
from google.colab import drive
import os
import sys
# Mount the user's Google Drive into the Colab filesystem so the model
# checkpoint and dataset stored there can be read by the cells below.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# User-editable locations of the inputs and output of this demo.
# NOTE(review): these paths are relative, while Drive is mounted at the absolute
# path '/content/drive/'; they resolve only when the working directory is
# /content — confirm before running from anywhere else.
model_filepath = "drive/MyDrive/NLU Coursework/solution_C/models_C/solution_C.pt" # Path to the saved model weights to be demoed
dataset_path = "drive/MyDrive/NLU Coursework/data/test_data/test.csv" # Path to the CSV dataset that predictions are made for
labels_path = "drive/MyDrive/NLU Coursework/submissions/Group_17_C.csv" # Path the predicted labels are written to (CSV)

# Setup

## Imports

In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader

In [4]:
import numpy as np
import typing
import json
from random import randint
import typing
import pandas as pd
import os
import random
from itertools import chain
import nltk

## Functions

In [5]:
def load_data_csv(
    filepath: str,
) -> typing.Tuple[typing.List[str], typing.List[str]]:
    """
    Load premise/hypothesis pairs from a CSV file.

    Args:
        filepath: Path to a CSV file that has a "premise" and a "hypothesis"
            column. Any other columns (e.g. labels) are ignored.

    Returns:
        A 2-tuple ``(premises, hypotheses)`` of equally long lists of strings,
        in file order. No labels are returned.

    Raises:
        KeyError: If either required column is missing from the file.
    """
    dataset = pd.read_csv(filepath)
    # Cast to str so that missing or numeric cells still reach the tokenizer as text.
    premises = dataset["premise"].astype(str).tolist()
    hypotheses = dataset["hypothesis"].astype(str).tolist()
    return (premises, hypotheses)

In [6]:
def tokenize_data(tokenizer: RobertaTokenizer, premises: typing.List[str], hypotheses: typing.List[str], maxlen: int) -> typing.Mapping[str, torch.Tensor]:
  """
  Uses the input tokenizer to tokenize the premises & hypotheses together as sentence pairs.
  Every pair is padded or truncated to exactly `maxlen` tokens. With the RoBERTa tokenizer
  used in this notebook each encoded sample has the layout

      sample = <s> Premise </s></s> Hypothesis </s> <pad> ... <pad>

  Args:
      tokenizer: The tokenizer to call on the sentence pairs.
      premises: The first sentence of each pair.
      hypotheses: The second sentence of each pair, aligned with `premises`.
      maxlen: The fixed sequence length every sample is padded/truncated to.

  Returns:
      The tokenizer's output, unchanged. This notebook indexes it by the keys
      "input_ids" and "attention_mask"; values are PyTorch tensors because
      return_tensors="pt" is requested.
  """
  return tokenizer(premises, hypotheses, max_length=maxlen, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True)

## Variables

In [7]:
# Keys used to look up individual fields in the tokenizer output.
INPUTS_IDS_KEY: str = "input_ids"
ATTENTION_MASK_KEY: str = "attention_mask"
# NOTE(review): not referenced by any other cell shown in this notebook.
TOKEN_TYPE_KEY: str = "token_type_ids"

In [8]:
# Number of output classes; passed as num_labels when the classifier is built.
NUM_LABELS = 2

# Data Preprocessing

## Load Data

In [9]:
# Pull the premise and hypothesis columns out of the evaluation CSV.
premises, hypotheses = load_data_csv(dataset_path)

### Example

In [10]:
# Show one randomly chosen premise/hypothesis pair as a sanity check.
# randint is inclusive at BOTH ends, so the upper bound has to be len - 1;
# passing len(premises) would occasionally index one past the end of the list.
index = randint(0, len(premises) - 1)
print(f"Premise: {premises[index]}")
print(f"Hypothesis: {hypotheses[index]}")

Premise: A woman wearing white and black carries a serving tray.
Hypothesis: A woman wearing white and black grips a steering wheel with both hands as she evades the police.


## Tokenize Data

In [11]:
# Hugging Face checkpoint id used for both the tokenizer and the model.
# Despite the BERT_ID name, it points at a RoBERTa checkpoint.
BERT_ID: str = 'roberta-base'
# NOTE(review): do_lower_case looks like a BERT-style option; the example output
# in this notebook still shows cased tokens ('Boy', 'The'), so it appears to have
# no effect here — confirm against the training setup before removing it.
tokenizer = RobertaTokenizer.from_pretrained(BERT_ID, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [12]:
# Encode every premise/hypothesis pair, padded or truncated to 512 tokens each.
demo_data = tokenize_data(
    tokenizer=tokenizer,
    premises=list(premises),
    hypotheses=list(hypotheses),
    maxlen=512,
)

In [13]:
# Record and report the vocabulary size exposed by the tokenizer.
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 50265


### Example

In [14]:
# Decode the ids of the first sample back into token strings to show the
# sentence-pair layout (special tokens and padding included).
print(f"Sentence: {tokenizer.convert_ids_to_tokens(demo_data[INPUTS_IDS_KEY][0])}")
# NOTE(review): this prints the id tensor for the whole dataset, not only the
# first sample decoded above — confirm that is intended.
print(f"Tokens: {demo_data[INPUTS_IDS_KEY]}")

Sentence: ['<s>', 'Boy', 'Ġwearing', 'Ġred', 'Ġhat', ',', 'Ġblue', 'Ġjacket', 'Ġpushing', 'Ġpl', 'ow', 'Ġin', 'Ġsnow', '.', '</s>', '</s>', 'The', 'Ġboy', 'Ġis', 'Ġsurrounded', 'Ġby', 'Ġsnow', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'

## Format Dataset & Dataloader

In [15]:
# Bundle the token ids and attention masks into a dataset. No labels are loaded
# for this data, so an all-zero (n_samples, 2) tensor fills the third slot as a
# placeholder.
dataset = TensorDataset(
    demo_data[INPUTS_IDS_KEY],
    demo_data[ATTENTION_MASK_KEY],
    torch.zeros(len(premises), 2),
)
# Serve the samples in batches of 16.
dataloader = DataLoader(dataset, batch_size=16)

# Demo


## Load Model

In [16]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [17]:
# Build the RoBERTa sequence classifier with a NUM_LABELS-way head and place it
# on the selected device. The weights loaded here come from the base checkpoint;
# the fine-tuned weights are loaded from model_filepath in a later cell.
model = RobertaForSequenceClassification.from_pretrained(
    BERT_ID,
    num_labels=NUM_LABELS,
).to(device)
model

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [18]:
# Load the fine-tuned weights into the model. map_location remaps the stored
# tensors onto the device selected above, so a checkpoint that was saved from a
# GPU still loads when this notebook falls back to the CPU.
state_dict = torch.load(model_filepath, map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

## Get Predictions

In [19]:
# Get the predictions for all of the test cases.
# The samples are fed through the model in the batches produced by `dataloader`
# instead of one at a time; the loader does not shuffle, so the collected logits
# stay in the same order as the input rows.
predicted_logits = []

# Make inference mode explicit (dropout off) rather than relying on how the
# model happened to be constructed.
model.eval()
with torch.no_grad():
  for batch in dataloader:
    # Each batch is (input_ids, attention_mask, placeholder labels); only the
    # first two fields are needed for prediction.
    input_ids, attention_mask = batch[0], batch[1]
    output = model(input_ids = input_ids.to(device),
                    attention_mask = attention_mask.to(device))
    # Store one 1-D logits tensor per sample so downstream cells see the same
    # list-of-tensors structure as before.
    predicted_logits.extend(output.logits.detach().cpu().unbind(0))

In [20]:
# Stack the per-sample logits into one (n_samples, n_classes) tensor and take
# the index of the largest logit in each row as the predicted label.
predicted_labels = torch.stack(predicted_logits).argmax(dim=1).numpy()

In [21]:
# Put the predicted labels in a single-column table, write it to labels_path
# without the row index, and display it as the cell result.
output_labels = pd.DataFrame({"prediction": predicted_labels})
output_labels.to_csv(labels_path, index=False)
output_labels

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,1
...,...
3297,1
3298,0
3299,1
3300,0
