In [None]:
from google.colab import drive
import os
import sys
# Mount Google Drive at /content/drive/. The file paths configured below are relative
# ("drive/MyDrive/..."), so they presumably rely on the working directory being /content — TODO confirm.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
model_arch_filepath = "drive/MyDrive/NLU Coursework/solution_B/models_B/combined.json" #Path to the architecture (JSON) of the model to be demoed
model_weight_filepath = "drive/MyDrive/NLU Coursework/solution_B/models_B/combined.hdf5" #Path to the weights (HDF5) of the model to be demoed

dataset_path = "drive/MyDrive/NLU Coursework/data/training_data/training_data/NLI/dev.csv" #Path to the dataset used for evaluation
labels_path = "drive/MyDrive/NLU Coursework/submissions/Group_17_B.csv" #Path the predicted labels are written to (CSV)

# Setup

## Imports

In [None]:
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
import numpy as np
import typing
import json
from random import randint
import typing
import pandas as pd
import os
from dataclasses import dataclass
import random
from nltk.corpus import wordnet, stopwords
from itertools import chain
import nltk

In [None]:
from transformers import XLNetTokenizer, XLNetModel

## Functions

In [None]:
@dataclass(frozen=True)
class GeneralKeys:
    """
    Frozen dataclass grouping the display-cased string keys used across the process.

    Every field has a string default, so the values can be read straight off the
    class (e.g. ``GeneralKeys.PREMISE_KEY``) without creating an instance.
    """

    # Keys for the two sentences of a pair and for their label
    PREMISE_KEY: str = "Premise"
    HYPOTHESIS_KEY: str = "Hypothesis"
    LABEL_KEY: str = "Label"
    # The keys below are not referenced by any other visible cell of this notebook
    LOSS_KEY: str = "Loss"
    PREDICTED_KEY: str = "Predicted Label"
    TRUE_KEY: str = "True Label"

@dataclass(frozen=True)
class DatasetKeys:
    """
    Frozen dataclass holding the column names used to index the data CSVs.

    Each name is the lower-cased form of the matching ``GeneralKeys`` value,
    i.e. "premise", "hypothesis" and "label". The defaults are evaluated once,
    when this class body runs.
    """

    PREMISE_KEY: str = GeneralKeys.PREMISE_KEY.lower()
    HYPOTHESIS_KEY: str = GeneralKeys.HYPOTHESIS_KEY.lower()
    LABEL_KEY: str = GeneralKeys.LABEL_KEY.lower()

def load_data_csv(
    filepath: str,
) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[int]]:
    """
    Load the premises, hypotheses and labels stored in a CSV file.

    Args:
        filepath: Path to a CSV file containing the columns named by
            ``DatasetKeys`` (premise, hypothesis and label).

    Returns:
        A tuple ``(premises, hypotheses, labels)`` of equally long lists in row
        order. Premises and hypotheses are strings; labels are ints, as the
        return annotation states.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
        ValueError: If a label cannot be converted to an integer (for example
            a missing value).
    """
    dataset = pd.read_csv(filepath)
    # astype(str) also turns missing cells into text, so every entry handed to
    # the tokenizer is a string.
    premises = dataset[DatasetKeys.PREMISE_KEY].astype(str).tolist()
    hypotheses = dataset[DatasetKeys.HYPOTHESIS_KEY].astype(str).tolist()
    # Labels are class indices: cast to int so the result matches the declared
    # List[int] instead of returning numeric strings.
    labels = dataset[DatasetKeys.LABEL_KEY].astype(int).tolist()
    return (premises, hypotheses, labels)

In [None]:
def pad_lists(lists, value, length: int):
  """
  Return a new list in which every list from `lists` has exactly `length` items:
  shorter lists are extended on the right with `value`, longer ones are cut off
  after `length` items. The input lists are left unmodified.
  """
  filler = [value] * length
  padded = []
  for row in lists:
    # Append a full run of filler first, then slice: the slice both trims rows
    # that were already too long and drops the surplus filler.
    padded.append((row + filler)[:length])
  return padded

In [None]:
# Side on which pad_sequences adds the padding value
PADDING: str = "post"
def tokenize_data(
    tokenizer: XLNetTokenizer,
    premises: typing.List[str],
    hypotheses: typing.List[str],
    max_premise_length: typing.Optional[int] = None,
    max_hypothesis_length: typing.Optional[int] = None,
) -> typing.Tuple[np.ndarray, np.ndarray]:
  """
  Uses the input tokenizer to encode the premises & hypotheses into token ids, then
  pads/truncates every sequence to a fixed length.

  Args:
    tokenizer: Tokenizer whose ``encode`` method converts one sentence into token ids.
    premises: Premise sentences.
    hypotheses: Hypothesis sentences.
    max_premise_length: Length of every premise row. Defaults to the module-level
      MAX_PREMISE_LENGTH, looked up when the function is called.
    max_hypothesis_length: Length of every hypothesis row. Defaults to the module-level
      MAX_HYPOTHESIS_LENGTH, looked up when the function is called.

  Returns:
    Tuple ``(premise_ids, hypothesis_ids)`` of arrays, one row per input sentence.
  """
  # Fall back to the notebook-wide constants so existing three-argument calls
  # behave exactly as before.
  if max_premise_length is None:
    max_premise_length = MAX_PREMISE_LENGTH
  if max_hypothesis_length is None:
    max_hypothesis_length = MAX_HYPOTHESIS_LENGTH

  premise_ids = [tokenizer.encode(premise, add_special_tokens=True) for premise in premises]
  hypothesis_ids = [tokenizer.encode(hypothesis, add_special_tokens=True) for hypothesis in hypotheses]

  # NOTE(review): the padding id is 0, which the example output further down this
  # notebook decodes as '<unk>' rather than a dedicated pad token. Left as-is because
  # the saved model presumably saw the same ids during training — confirm.
  # NOTE(review): `truncating` is left at the pad_sequences default, which the Keras
  # documentation gives as 'pre' (over-long inputs lose their leading tokens) — confirm
  # this matches the training-time preprocessing.
  premise_ids = pad_sequences(premise_ids, maxlen=max_premise_length, padding=PADDING, value=0)
  hypothesis_ids = pad_sequences(hypothesis_ids, maxlen=max_hypothesis_length, padding=PADDING, value=0)

  return (np.array(premise_ids), np.array(hypothesis_ids))

# Data Preprocessing

## Load Data

In [None]:
# Read the evaluation set from dataset_path: two lists of sentences plus their labels.
premises, hypotheses, labels = load_data_csv(filepath=dataset_path)

In [None]:
# Keep references to the raw sentence lists: `premises` / `hypotheses` are rebound
# to token-id arrays by the tokenization cell further down.
premises_og = premises
hypotheses_og = hypotheses

In [None]:
# Fixed row lengths (in token ids) that tokenize_data pads/truncates each sequence to.
# NOTE(review): presumably these must match the input shapes of the saved model — confirm.
MAX_PREMISE_LENGTH: int = 110
MAX_HYPOTHESIS_LENGTH: int = 60

print(f"MAX_PREMISE_LENGTH: {MAX_PREMISE_LENGTH}")
print(f"MAX_HYPOTHESIS_LENGTH: {MAX_HYPOTHESIS_LENGTH}")

MAX_PREMISE_LENGTH: 110
MAX_HYPOTHESIS_LENGTH: 60


In [None]:
# One-hot encode the labels. `dev_labels` is not read by any later cell visible in this notebook.
dev_labels = to_categorical(labels)

### Example

In [None]:
# Pick one random example to display. randint includes BOTH endpoints, so the upper
# bound has to be the last valid position (len - 1); using len(premises) itself can
# produce an index one past the end and raise IndexError.
index = randint(0, len(premises) - 1)
print(f"Premise: {premises[index]}")
print(f"Hypothesis: {hypotheses[index]}")
print(f"Label: {labels[index]}")

Premise: Thus, the existence of supranational bodies with significant functions of governance is no longer the issue.
Hypothesis: The biggest issue we have is still the existence of supranational bodies.
Label: 0


## Tokenize Data

In [None]:
# Despite the BERT_* / bert_* names, the identifier and classes used here are XLNet ones.
BERT_ID = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(BERT_ID, do_lower_case=False)
# NOTE(review): `bert_model` is not referenced by any later cell visible in this notebook —
# confirm whether loading the pretrained encoder is needed for the demo.
bert_model = XLNetModel.from_pretrained(BERT_ID)

In [None]:
# Tokenize from the untouched string lists (premises_og / hypotheses_og) rather than from
# `premises` / `hypotheses`, which this cell rebinds to token-id arrays. That keeps the cell
# re-runnable: a second run no longer feeds already-tokenized arrays back into the tokenizer.
premises, hypotheses = tokenize_data(tokenizer=tokenizer, premises=list(premises_og), hypotheses=list(hypotheses_og))

In [None]:
# Vocabulary size reported by the tokenizer; printed for reference only in the visible cells.
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 32000


### Example

In [None]:
#Premise
# Show the padded token ids of the example chosen above, both decoded back to tokens and raw.
print(f"Sentence: {tokenizer.convert_ids_to_tokens(premises[index])}")
print(f"Tokens: {premises[index]}")

Sentence: ['▁Thus', ',', '▁the', '▁existence', '▁of', '▁supra', 'national', '▁bodies', '▁with', '▁significant', '▁functions', '▁of', '▁governance', '▁is', '▁no', '▁longer', '▁the', '▁issue', '.', '<sep>', '<cls>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<u

In [None]:
#Hypothesis
# Read from `hypotheses`, the token-id array produced by the tokenization cell. The name used
# here before, `dev_hypotheses`, is not defined anywhere in the visible notebook, so the cell
# raised NameError on a fresh kernel (or silently showed stale data from an older session).
print(f"Sentence: {tokenizer.convert_ids_to_tokens(hypotheses[index])}")
print(f"Tokens: {hypotheses[index]}")

Sentence: ['▁National', '▁security', '▁was', '▁not', '▁in', '▁danger', '.', '<sep>', '<cls>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']
Tokens: [ 360  470   30   50   25 3985    9    4    3    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


# Load Model

In [None]:
# Decode the architecture file straight from the open handle.
# NOTE(review): the decoded value is passed to model_from_json unchanged; that function is
# documented as taking a JSON string, so this presumably relies on the file holding a
# JSON-encoded string rather than a JSON object — verify against how the file was saved.
with open(model_arch_filepath, "r") as f:
    model_architecture = json.load(f)

# Rebuild the network from its description, then restore the trained weights into it.
model = keras.models.model_from_json(model_architecture)
model.load_weights(model_weight_filepath)

# Demo

In [None]:
# Score every (premise, hypothesis) pair, then keep the index of the highest score per row.
# NOTE(review): the variable is called "logits", but the visible code does not show whether the
# model emits raw logits or probabilities; argmax gives the same label either way.
predicted_logits = model.predict(x=[premises, hypotheses])
predicted_labels = np.argmax(predicted_logits, axis=1)



In [None]:
# Single-column frame of predicted class indices, one row per evaluated pair.
output_labels = pd.DataFrame({"prediction": predicted_labels})
# index=False: the file contains only the "prediction" column, no row numbers.
output_labels.to_csv(labels_path, index=False)
# Bare last expression so the notebook renders the frame.
output_labels

Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,1
...,...
6732,0
6733,0
6734,1
6735,0
