# Model demo



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from preprocess import *
from spacy.lang.en import English
import os
import numpy as np
import statistics
from utils import save_file, load_file, load_model
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is later handed to load_model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


The following five models can be tested by changing MODEL_FLAG:

- 'bow': for the bag of words model, where glove embeddings of the sequence are averaged to obtain sentence representations.

- 'lstm': for the LSTM model where the final hidden state is used as a sentence representation.

- 'lstm_max': for the LSTM model with max pooling over the hidden states of the sequence (for extra research question). 

- 'bilstm': for the BiLSTM model where the concatenation of the final hidden states (of the forward and backward LSTM) is used as sentence representation.

- 'bilstm_max': for the BiLSTM model with max pooling over the concatenated hidden states (of forward and backward LSTM) of the sequence. 

In [2]:
# Selects which trained model is loaded; it is also used to build the
# checkpoint path and is passed to every helper below.
MODEL_FLAG = 'bilstm_max'
# Checkpoint location handed to load_model.
state_file_path = f'weights/{MODEL_FLAG}/{MODEL_FLAG}_best.pth'

# Class names passed to load_model (make_prediction keeps its own copy of this list).
labels = ['neutral', 'entailment', 'contradiction']
vocab_file = 'NLI_vocab.pkl'
folder = 'saved_files'
# Reuse the cached vocabulary when it is on disk; otherwise build it from the
# training split and cache it for the next run.
# NOTE(review): the existence check uses folder/vocab_file, while load_file and
# save_file receive only vocab_file -- presumably they prepend the same folder
# themselves; confirm in utils.
if os.path.exists(f'{folder}/{vocab_file}'):
    print('loading vocab from file')
    vocab = load_file(vocab_file)
else:
    print('creating vocab with training set')
    train_split = preprocess(split='train')
    vocab = create_vocab(train_split)
    save_file(vocab, vocab_file)

vocab_size = len(vocab.mapping)
# embeddings need to be saved to file once. Not uploaded because of file size
embeddings = align_vocab_with_glove(vocab)
model = load_model(embeddings, labels, vocab_size, device, MODEL_FLAG, state_file_path)
# Count only the parameters that require gradients.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"{MODEL_FLAG} has {total_params} trainable parameters")
# Reference counts from earlier runs -- lstm: 23710211, bilstm 47155715 and bow: 879107

loading vocab from file
saved_files/NLI_vocab.pkl
loaded embeddings from file
loading bilstm_max
bilstm_max has 47155715 trainable parameters


In [3]:
_TOKENIZER = None  # spaCy English tokenizer, built on first use and then reused


def transform_sentence(sent, vocab, tokenize=True):
    """Map a sentence to vocabulary ids.

    Args:
        sent: a raw string when ``tokenize`` is True; otherwise an iterable of
            already-tokenized items, each either a plain string or an object
            exposing its string as ``.text``.
        vocab: object whose ``mapping`` dict maps token strings to ids.
        tokenize: when True, lower-case ``sent`` and split it with spaCy's
            English tokenizer before the lookup.

    Returns:
        ``(sent_ids, [length])``: the list of ids and a one-element list
        holding the number of tokens. Tokens missing from ``vocab.mapping``
        fall back to id 0.

    Example:
        transform_sentence('This function maps tokens to ids and returns length', vocab)
    """
    global _TOKENIZER
    if tokenize:
        # Building the spaCy pipeline is costly, so do it once rather than on every call.
        if _TOKENIZER is None:
            _TOKENIZER = English().tokenizer
        sent = _TOKENIZER(sent.lower())
    # getattr lets pre-tokenized input be either token objects or plain strings.
    sent_ids = [vocab.mapping.get(getattr(token, 'text', token), 0) for token in sent]
    return sent_ids, [len(sent_ids)]

def _model_device(model):
    """Return the device of the model's first parameter (CPU when it has none)."""
    parameters = getattr(model, 'parameters', None)
    first = next(iter(parameters()), None) if callable(parameters) else None
    return first.device if first is not None else torch.device('cpu')


def make_prediction(sent1, sent2, vocab, model, model_flag, printing=True, tokenize=True):
    """Predict the relation between a premise and a hypothesis.

    Args:
        sent1: premise; a string, or pre-tokenized input when ``tokenize`` is False.
        sent2: hypothesis, in the same form as ``sent1``.
        vocab: vocabulary passed on to ``transform_sentence``.
        model: loaded model matching ``model_flag``.
        model_flag: 'bow' (called with the two id tensors) or one of 'lstm',
            'lstm_max', 'bilstm', 'bilstm_max' (called with id tensors and lengths).
        printing: print the inputs, the class probabilities and the prediction.
            Set to False when looping over a whole data split.
        tokenize: set to False if the sentences are already tokenized.

    Returns:
        ``(relation, predicted_label, length1, length2)``: the predicted label
        name, the argmax as a one-element tensor, and the one-element length
        lists of premise and hypothesis.

    Raises:
        ValueError: if ``model_flag`` is not one of the supported flags.
    """
    sequence_flags = ('lstm', 'lstm_max', 'bilstm', 'bilstm_max')
    if model_flag != 'bow' and model_flag not in sequence_flags:
        raise ValueError(
            f"unknown model_flag {model_flag!r}; expected 'bow' or one of {sequence_flags}")

    if printing:
        print(f'premise: {sent1}')
        print(f'hypothesis: {sent2}')
        print("\n")
    sent_ids1, length1 = transform_sentence(sent1, vocab, tokenize)
    sent_ids2, length2 = transform_sentence(sent2, vocab, tokenize)
    # Inputs must sit on the same device as the model, otherwise the forward
    # pass fails as soon as the model lives on the GPU.
    target = _model_device(model)
    sent1 = torch.tensor([sent_ids1], device=target)
    sent2 = torch.tensor([sent_ids2], device=target)

    # Pure inference: no autograd graph is needed.
    # NOTE(review): assumes load_model already put the model in eval mode -- confirm.
    with torch.no_grad():
        if model_flag == 'bow':
            logits = model(sent1, sent2)
        else:
            logits = model(sent1, length1, sent2, length2)

    probabilities = F.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1)
    labels = ['neutral', 'entailment', 'contradiction']
    # The batch holds exactly one sentence pair, so the argmax has one element.
    relation = labels[predicted_label.item()]
    # output tensor to readable probabilities
    prob_list = [round(prob, 3) for prob in probabilities.tolist()[0]]
    if printing:
        for label, prob in zip(labels, prob_list):
            print(f'predicted {label} with {prob} probability')
        print('\n')
        print(f'Therefore, predicted relation: {relation}')
    return relation, predicted_label, length1, length2

# Demo: classify one hand-written premise/hypothesis pair with the loaded model.
premise = 'a woman with a green headscarf , blue shirt and a very big grin.'
hypothesis = 'the woman is young.'
# make_prediction prints its own report; the two returned length lists are discarded.
relation, numeric, _, _ = make_prediction(premise, hypothesis, vocab, model, MODEL_FLAG)

premise: a woman with a green headscarf , blue shirt and a very big grin.
hypothesis: the woman is young.


predicted neutral with 0.425 probability
predicted entailment with 0.568 probability
predicted contradiction with 0.007 probability


Therefore, predicted relation: entailment


In [4]:
def extract_examples(vocab, model, MODEL_FLAG, num_examples=1, start=3):
    """Print model predictions next to the gold labels for test-set entries.

    One example is a group of three consecutive test entries.
    NOTE(review): this assumes the test split stores the three hypotheses for
    a premise back to back -- confirm against the data file.

    Args:
        vocab: vocabulary passed on to ``make_prediction``.
        model: loaded model matching ``MODEL_FLAG``.
        MODEL_FLAG: model flag passed on to ``make_prediction``.
        num_examples: number of three-entry groups to show.
        start: index of the first test entry to show; the default of 3 skips
            the first group.

    Returns:
        None. Everything is written to stdout.
    """
    end = start + num_examples * 3
    test_split = preprocess(split='test')
    for example in test_split[start:end]:
        # The split is already tokenized, hence tokenize=False.
        prediction, _, _, _ = make_prediction(
            example['sentence_1'], example['sentence_2'], vocab, model,
            MODEL_FLAG, printing=True, tokenize=False)
        print('actual label:', example['gold_label'])
        print("Correct prediction: ", prediction == example['gold_label'])
        print('================================')
# Show the default selection of test-set entries with the model's predictions and gold labels.
extract_examples(vocab, model, MODEL_FLAG)

done reading test json
premise: [a, woman, with, a, green, headscarf, ,, blue, shirt, and, a, very, big, grin, .]
hypothesis: [the, woman, is, young, .]


predicted neutral with 0.425 probability
predicted entailment with 0.568 probability
predicted contradiction with 0.007 probability


Therefore, predicted relation: entailment
actual label: neutral
Correct prediction:  False
premise: [a, woman, with, a, green, headscarf, ,, blue, shirt, and, a, very, big, grin, .]
hypothesis: [the, woman, is, very, happy, .]


predicted neutral with 0.249 probability
predicted entailment with 0.751 probability
predicted contradiction with 0.001 probability


Therefore, predicted relation: entailment
actual label: entailment
Correct prediction:  True
premise: [a, woman, with, a, green, headscarf, ,, blue, shirt, and, a, very, big, grin, .]
hypothesis: [the, woman, has, been, shot, .]


predicted neutral with 0.57 probability
predicted entailment with 0.408 probability
predicted contradiction with 0.02

In [10]:
def sent_length_performance(vocab, model, MODEL_FLAG, printing=True, short_max=16, long_min=31):
    """Measure test-set accuracy per combined (premise + hypothesis) length bucket.

    A pair is "short" when its combined token count is <= ``short_max``,
    "long" when it is >= ``long_min`` and "medium" otherwise. If the two
    thresholds overlap, "short" takes precedence.
    NOTE(review): the defaults 16 and 31 presumably correspond to mean -/+ one
    standard deviation of the test-set lengths -- confirm against the printed stats.

    Args:
        vocab: vocabulary passed on to ``make_prediction``.
        model: loaded model matching ``MODEL_FLAG``.
        MODEL_FLAG: model flag passed on to ``make_prediction``.
        printing: print length statistics and the accuracy of each bucket.
        short_max: largest combined length still counted as short.
        long_min: smallest combined length counted as long.

    Returns:
        A 9-tuple ``(short_correct, short_total, short_accuracy,
        medium_correct, medium_total, medium_accuracy,
        long_correct, long_total, long_accuracy)``. The accuracy of a bucket
        without any examples is reported as 0.0.
    """
    buckets = ('short', 'medium', 'long')
    correct = dict.fromkeys(buckets, 0)
    total = dict.fromkeys(buckets, 0)
    lengths = []

    test_split = preprocess(split='test')
    for example in test_split:
        prediction, _, length1, length2 = make_prediction(
            example['sentence_1'], example['sentence_2'], vocab, model,
            MODEL_FLAG, printing=False, tokenize=False)
        combined_length = length1[0] + length2[0]
        lengths.append(combined_length)
        if combined_length <= short_max:
            bucket = 'short'
        elif combined_length >= long_min:
            bucket = 'long'
        else:
            bucket = 'medium'
        correct[bucket] += int(prediction == example['gold_label'])
        total[bucket] += 1

    # An empty bucket has no defined accuracy; report 0.0 instead of dividing by zero.
    accuracy = {
        name: (correct[name] / float(total[name]) if total[name] else 0.0)
        for name in buckets
    }

    if printing:
        print('Combined sentence length stats of test set')
        if lengths:
            # statistics.stdev needs at least two data points.
            stdev = statistics.stdev(lengths) if len(lengths) > 1 else 0.0
            print("mean:", round(statistics.mean(lengths), 2))
            print("standard deviation", round(stdev, 2))
            print('minimal length', min(lengths))
            print('maximum length', max(lengths))
        for name in buckets:
            print(f'{MODEL_FLAG} accuracy on {name} length input: {round(accuracy[name] * 100, 2)} %')

    return (
        correct['short'], total['short'], accuracy['short'],
        correct['medium'], total['medium'], accuracy['medium'],
        correct['long'], total['long'], accuracy['long'],
    )
  
# Report the loaded model's accuracy per combined-sentence-length bucket on the test split.
sent_length_performance(vocab, model, MODEL_FLAG)

done reading test json
