In [1]:
from google.colab import drive
import os
import sys
# Mount Google Drive at /content/drive/ so the Drive-hosted files configured in
# the following cell can be read.
# NOTE(review): those paths are relative ("drive/MyDrive/..."), so they only
# resolve when the working directory is /content — confirm.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Every location below lives under one coursework folder on the mounted Drive.
# Keeping that root in a single constant means relocating the folder is a
# one-line change instead of five coordinated edits.
# The paths are relative, so they resolve against the current working directory.
COURSEWORK_DIR = "drive/MyDrive/NLU Coursework"
MODEL_DIR = f"{COURSEWORK_DIR}/solution_B/models_B"

# Path to the classes folder (helper modules are imported from here)
cls_filepath = f"{COURSEWORK_DIR}/classes"

# Path to the model weights / model architecture to be demoed
model_weight_filepath = f"{MODEL_DIR}/solution_B.hdf5"
model_arch_filepath = f"{MODEL_DIR}/solution_B.json"

# Path to the dataset used for evaluation
dataset_path = f"{COURSEWORK_DIR}/data/training_data/training_data/NLI/dev.csv"
# Path to output the predicted labels
labels_path = f"{COURSEWORK_DIR}/submissions/Group_17_B.csv"

# Evaluation Setup

In [3]:
# Make the helper methods & classes importable: `cls_filepath` must point at
# the folder that contains the `evaluation` and `preprocessing` modules.
# The membership check keeps sys.path free of duplicate entries when this cell
# is re-run in the same kernel.
if cls_filepath not in sys.path:
    sys.path.append(cls_filepath)

from evaluation import evaluate, draw_confusion_matrix, most_confused_samples
from preprocessing import load_data_csv

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import keras
from transformers import XLNetTokenizer, XLNetModel

In [5]:
import json
from random import randint
import os
import random
from nltk.corpus import wordnet, stopwords
from itertools import chain
import nltk
import typing
import numpy as np
import pandas as pd

In [6]:
PADDING: str = "post"  # pad at the end of each sequence
def tokenize_data(
    tokenizer: XLNetTokenizer,
    premises: typing.List[str],
    hypotheses: typing.List[str],
    max_premise_length: typing.Optional[int] = None,
    max_hypothesis_length: typing.Optional[int] = None,
) -> typing.Tuple[np.ndarray, np.ndarray]:
  """
  Tokenize the premises & hypotheses with the given tokenizer and pad them to fixed lengths.

  Each sentence is encoded on its own (special tokens included) and then padded
  with id 0 on the side given by PADDING, up to the requested length.

  Args:
    tokenizer: tokenizer whose `encode` turns one sentence into a list of token ids.
    premises: the premise sentences.
    hypotheses: the hypothesis sentences.
    max_premise_length: padded length of every premise. When None (the default),
      the notebook-level MAX_PREMISE_LENGTH is used; it is looked up at call
      time, so it only has to exist before the first call.
    max_hypothesis_length: padded length of every hypothesis. When None (the
      default), the notebook-level MAX_HYPOTHESIS_LENGTH is used.

  Returns:
    A (premises, hypotheses) tuple of NumPy arrays holding the padded token ids.

  NOTE(review): the pad value 0 is displayed as '<unk>' when decoded with this
  tokenizer (see the example cells' output). It is kept as-is because the saved
  model was presumably trained with this padding — confirm before changing.
  NOTE(review): sequences longer than the limit are shortened by pad_sequences
  using its default truncation side — confirm that this is the intended side.
  """
  # Fall back to the notebook-level limits so existing three-argument calls
  # behave exactly as before.
  if max_premise_length is None:
    max_premise_length = MAX_PREMISE_LENGTH
  if max_hypothesis_length is None:
    max_hypothesis_length = MAX_HYPOTHESIS_LENGTH

  premise_ids = [tokenizer.encode(premise, add_special_tokens=True) for premise in premises]
  hypothesis_ids = [tokenizer.encode(hypothesis, add_special_tokens=True) for hypothesis in hypotheses]

  premise_ids = pad_sequences(premise_ids, maxlen=max_premise_length, padding=PADDING, value=0)
  hypothesis_ids = pad_sequences(hypothesis_ids, maxlen=max_hypothesis_length, padding=PADDING, value=0)

  return (np.array(premise_ids), np.array(hypothesis_ids))

# Data Preprocessing

In [7]:
# Read the evaluation CSV through the project's loader, then cast every gold
# label to int and collect them in a NumPy array.
premises, hypotheses, true = load_data_csv(filepath=dataset_path)
true = np.array(list(map(int, true)))

In [8]:
# Keep handles on the data as loaded: `premises` / `hypotheses` are reassigned
# to the tokenizer's output further down, while the most-confused-samples cell
# at the end is given these *_og values.
premises_og, hypotheses_og = premises, hypotheses

In [9]:
# Fixed sequence lengths that tokenize_data pads premises and hypotheses to.
MAX_PREMISE_LENGTH: int = 110
MAX_HYPOTHESIS_LENGTH: int = 60

# Echo both limits, one per line.
print(
    f"MAX_PREMISE_LENGTH: {MAX_PREMISE_LENGTH}",
    f"MAX_HYPOTHESIS_LENGTH: {MAX_HYPOTHESIS_LENGTH}",
    sep="\n",
)

MAX_PREMISE_LENGTH: 110
MAX_HYPOTHESIS_LENGTH: 60


In [10]:
# One-hot encode the integer gold labels (a single row prints as e.g. [1. 0.]
# in the example cell that follows).
# NOTE(review): the number of columns is inferred from the labels present in
# this file — confirm every class occurs in the evaluation set.
labels = to_categorical(true)

### Example

In [11]:
# Pick one random sample to illustrate the data.
# random.randint() includes BOTH endpoints, so the upper bound has to be
# len(premises) - 1; randint(0, len(premises)) could return len(premises),
# which is one past the last valid position.
index = randint(0, len(premises) - 1)
print(f"Premise: {premises[index]}")
print(f"Hypothesis: {hypotheses[index]}")
print(f"Label: {labels[index]}")

Premise: With funding from the Lawyers Trust Fund of Illinois, the Technology Working Group and representatives from CARPLS (the Chicago-based hotline and referral services), Legal Assistance Foundation of Metropolitan Chicago, Prairie State Legal Services and Land of Lincoln Legal Assistance Foundation have formed a Best Practices group.
Hypothesis: The Lawyers Trust Fund of Illinois gives no funding to the Technology Working Group.
Label: [1. 0.]


## Tokenize Data

In [12]:
# Identifier of the pretrained checkpoint to load. Despite the BERT_* naming,
# this is an XLNet checkpoint and the XLNet tokenizer/model classes are used.
BERT_ID = 'xlnet-base-cased'
# Cased checkpoint, so the tokenizer is told not to lowercase its input.
tokenizer = XLNetTokenizer.from_pretrained(BERT_ID, do_lower_case=False)
# NOTE(review): `bert_model` is not referenced again in the visible notebook,
# yet this line fetches the full checkpoint (pytorch_model.bin in the output
# below) — confirm whether it is still needed.
bert_model = XLNetModel.from_pretrained(BERT_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [13]:
# Tokenize from the untouched *_og copies rather than from `premises` /
# `hypotheses` themselves: those two names are overwritten here with token-id
# arrays, so reading from them would make a second run of this cell try to
# tokenize already-tokenized data. On a fresh top-to-bottom run the inputs are
# the same objects as before.
premises, hypotheses = tokenize_data(
    tokenizer=tokenizer,
    premises=list(premises_og),
    hypotheses=list(hypotheses_og),
)

In [14]:
# Vocabulary size as reported by the loaded tokenizer.
VOCAB_SIZE: int = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 32000


### Example

In [15]:
# Premise of the sample picked earlier (`index`): first the ids mapped back to
# token strings, then the raw id array.
# The trailing positions are padding; the pad id 0 is rendered as '<unk>' by
# this tokenizer, as the output shows.
print(f"Sentence: {tokenizer.convert_ids_to_tokens(premises[index])}")
print(f"Tokens: {premises[index]}")

Sentence: ['▁With', '▁funding', '▁from', '▁the', '▁Lawyers', '▁Trust', '▁Fund', '▁of', '▁Illinois', ',', '▁the', '▁Technology', '▁Working', '▁Group', '▁and', '▁representatives', '▁from', '▁', 'CAR', 'PL', 'S', '▁', '(', 'the', '▁Chicago', '-', 'based', '▁hot', 'line', '▁and', '▁referral', '▁services', ')', ',', '▁Legal', '▁Assistance', '▁Foundation', '▁of', '▁Metropolitan', '▁Chicago', ',', '▁Prairie', '▁State', '▁Legal', '▁Services', '▁and', '▁Land', '▁of', '▁Lincoln', '▁Legal', '▁Assistance', '▁Foundation', '▁have', '▁formed', '▁a', '▁Best', '▁Practice', 's', '▁group', '.', '<sep>', '<cls>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<u

In [16]:
# Hypothesis of the sample picked earlier (`index`): first the ids mapped back
# to token strings, then the raw id array.
# The trailing positions are padding; the pad id 0 is rendered as '<unk>' by
# this tokenizer, as the output shows.
print(f"Sentence: {tokenizer.convert_ids_to_tokens(hypotheses[index])}")
print(f"Tokens: {hypotheses[index]}")

Sentence: ['▁The', '▁Lawyers', '▁Trust', '▁Fund', '▁of', '▁Illinois', '▁gives', '▁no', '▁funding', '▁to', '▁the', '▁Technology', '▁Working', '▁Group', '.', '<sep>', '<cls>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']
Tokens: [   32 17497  5037  3185    20  3900  1849   116  2576    22    18  3506
  9418  1140     9     4     3     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]


# Load Model

In [17]:
# Rebuild the network from its saved architecture, then restore the trained
# weights into it.
# NOTE(review): model_from_json receives whatever the JSON file decodes to —
# presumably the file stores the architecture as a JSON-encoded string; confirm.
with open(model_arch_filepath, "r") as arch_file:
    model_architecture = json.load(arch_file)

model = keras.models.model_from_json(model_architecture)
model.load_weights(model_weight_filepath)

# Make Predictions

In [None]:
# Run the model on the two tokenized inputs (premises, hypotheses), then take
# the index of the highest-scoring column in each row as the predicted class.
# NOTE(review): whether the model emits raw logits or probabilities is not
# visible here; the argmax is the same either way.
predicted_logits = model.predict(x=[premises, hypotheses])
predicted_labels = np.argmax(predicted_logits, axis=1)

  1/211 [..............................] - ETA: 6:34

In [None]:
# Write the predictions as a single-column CSV ("prediction") to labels_path.
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists first instead of failing after the slow prediction step.
_labels_dir = os.path.dirname(labels_path)
if _labels_dir:  # empty when labels_path has no directory component
    os.makedirs(_labels_dir, exist_ok=True)

output_labels = pd.DataFrame(
    predicted_labels,
    columns=["prediction"]
)
output_labels.to_csv(labels_path, index=False)
output_labels

# Evaluation

In [None]:
# Evaluation metrics: score the predictions against the gold labels with the
# project's `evaluate` helper. The result exposes `.to_csv` (presumably a
# DataFrame — confirm); it is saved as "solution_B_metrics.csv" in the current
# working directory and then displayed.
test_metrics = evaluate(true_labels=true, predicted_logits=np.array(predicted_logits))
test_metrics.to_csv("solution_B_metrics.csv", index=False)
test_metrics

In [None]:
# Confusion matrix of gold labels vs. model output, produced by the project's
# `draw_confusion_matrix` helper; whatever it returns is kept in `conf_mat`.
conf_mat = draw_confusion_matrix(true_labels=true, predicted_logits=np.array(predicted_logits))

In [None]:
# Show the most confused samples, using the project's `most_confused_samples`
# helper. The *_og values (the data as loaded, before tokenization) are passed
# so the result can show readable text.
# NOTE(review): num=10 presumably limits the number of samples returned — confirm.
df = most_confused_samples(
    true_logits=to_categorical(true),
    predicted_logits=np.array(predicted_logits),
    premises=premises_og,
    hypotheses=hypotheses_og,
    num=10,
)
df