In [None]:
# for using collab without an environment
# !pip install datasets
# !pip install wandb

Load the results of SentEval:

In [2]:
import json
result_path = './results/'
with open(result_path + 'baseline', 'r') as f:
    result_baseline = json.load(f)
with open(result_path + 'lstm', 'r') as f:
    result_lstm = json.load(f)
with open(result_path + 'bilstm', 'r') as f:
    result_bilstm = json.load(f)
with open(result_path + 'poollstm', 'r') as f:
    result_poollstm = json.load(f)

Loading pretrained models and evaluating them on eval and test dataset:

In [34]:
import torch

from model import USC
from dataset import get_data
from train import eval

# checkpoints need to be loaded with GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint_path_base = './saves/baseline/checkpoint.pth'
checkpoint_path_lstm = './saves/lstm/checkpoint.pth'
checkpoint_path_bilstm = './saves/bilstm/checkpoint.pth'
checkpoint_path_poollstm = './saves/poollstm/checkpoint.pth'


train_loader, val_loader, test_loader, vectors, new_vocab = get_data(save_path=None, glove_path=None)
model_baseline = USC(encoder='baseline', vocab_size=len(vectors), sentence_length=150, vector_embeddings=vectors, embedding_dim=300,)
model_lstm = USC(encoder='lstm', vocab_size=len(vectors), sentence_length=150, vector_embeddings=vectors, embedding_dim=300,)
model_bilstm = USC(encoder='bilstm', vocab_size=len(vectors), sentence_length=150, vector_embeddings=vectors, embedding_dim=300,)
model_poollstm = USC(encoder='poollstm', vocab_size=len(vectors), sentence_length=150, vector_embeddings=vectors, embedding_dim=300,)

model_baseline = model_baseline.to(device)
model_lstm = model_lstm.to(device)
model_bilstm = model_bilstm.to(device)
model_poollstm = model_poollstm.to(device)

model_baseline.load_state_dict(torch.load(checkpoint_path_base), strict=False)
model_lstm.load_state_dict(torch.load(checkpoint_path_lstm), strict=False)
model_bilstm.load_state_dict(torch.load(checkpoint_path_bilstm), strict=False)
model_poollstm.load_state_dict(torch.load(checkpoint_path_poollstm), strict=False)

# just small eval (~2 min)
test_acc_base = eval(model_baseline, test_loader)
val_acc_base = eval(model_baseline, val_loader)

test_acc_lstm = eval(model_lstm, test_loader)
val_acc_lstm = eval(model_lstm, val_loader)

test_acc_bilstm = eval(model_bilstm, test_loader)
val_acc_bilstm = eval(model_bilstm, val_loader)

test_acc_poollstm = eval(model_poollstm, test_loader)
val_acc_poollstm = eval(model_poollstm, val_loader)

test_acc = [test_acc_base.cpu().item(), test_acc_lstm.cpu().item(), test_acc_bilstm.cpu().item(), test_acc_poollstm.cpu().item()]
val_acc = [val_acc_base.cpu().item(), val_acc_lstm.cpu().item(), val_acc_bilstm.cpu().item(), val_acc_poollstm.cpu().item()]

Loading data


Compute micro and macro metrics:

In [35]:
import numpy as np

datasets = [result_baseline, result_lstm, result_bilstm, result_poollstm]
micro = []
macro = []

for dataset in datasets:
  table_data = []
  for task in dataset:
      if 'devacc' in dataset[task]:
          table_data.append([dataset[task]['devacc'], dataset[task]['ndev']])
  table_data = np.array(table_data)
  micro.append(np.mean(table_data[:,0]))
  macro.append(np.sum(table_data[:,0]*table_data[:,1]) / np.sum(table_data[:,1]))

In [36]:
import pandas as pd

# recreate the table 3
dim = [1200, 2048, 4096, 4096]

data = {'dim': dim,
        'dev_acc': val_acc,
        'test_acc': test_acc,
        'micro': micro,
        'macro': macro}

df = pd.DataFrame(data=data, index=['Base', 'LSTM', 'BiLSTM', 'BiLSTM_Max'])
format_func = lambda x: f"{x:.2f}" if isinstance(x, (float)) else x
df = df.applymap(format_func)

print(df)

             dim dev_acc test_acc  micro  macro
Base        1200    0.54     0.53  74.86  76.44
LSTM        2048    0.40     0.40  59.25  62.17
BiLSTM      4096    0.62     0.62  74.17  75.80
BiLSTM_Max  4096    0.76     0.77  79.98  81.24


Create table showing performance on downstream tasks of SentEval:

In [37]:
datasets = [result_baseline, result_lstm, result_bilstm, result_poollstm]
tasks = set()
for dataset in datasets:
    tasks.update(dataset.keys())
tasks = sorted(tasks)
df = pd.DataFrame(columns=tasks)
for i, dataset in enumerate(datasets):
    row = {}
    for task in tasks:
        if task in dataset and 'devacc' in dataset[task]:
            row[task] = dataset[task]['devacc']
    df.loc[i] = row
df.drop('STS14', axis=1, inplace=True)
df = df.rename(columns={'SICKEntailment': 'SICKE'})
df = df.rename(index={0: 'Baseline', 1: 'LSTM', 2: 'BiLSTM', 3: 'BiLSTM_MAX'})
print(df)

               CR   MPQA     MR   MRPC  SICKE   SST2   SUBJ   TREC
Baseline    75.23  75.95  74.20  72.69   72.4  77.06  89.37  62.01
LSTM        65.22  77.62  56.62  67.54   56.4  57.22  70.46  22.93
BiLSTM      71.30  84.16  66.38  68.47   75.4  70.87  83.47  73.31
BiLSTM_MAX  79.91  85.52  75.52  72.67   81.4  80.39  91.30  73.13


Answering the assignment question and demonstrating models:

In [38]:
from utils import prepare_sentence_batch

model = model_poollstm

p1 = ['Two men sitting in the sun']
h1 = ['Nobody is sitting in the shade']

# Label - Neutral (likely predicts contradiction)

p2 = ['A man is walking a dog']
h2 = ['No cat is outside']

# Label - Neutral (likely predicts contradiction)

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([1], device='cuda:0')
Prediction for first example is: tensor([1], device='cuda:0')


For this tricky example our model is acctually correct. Next we will test with a weaker model:

In [39]:
model = model_baseline

p1 = ['Two men sitting in the sun']
h1 = ['Nobody is sitting in the shade']

# Label - Neutral (likely predicts contradiction)

p2 = ['A man is walking a dog']
h2 = ['No cat is outside']

# Label - Neutral (likely predicts contradiction)

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([1], device='cuda:0')
Prediction for first example is: tensor([2], device='cuda:0')


This time our model is wrong for the second example. It predicts contradiction instead of neutral. This is understandable as it makes sense as the embeddings for dog and cat are likely to be close. And one is outside while the hypothesis states that the other (which the model assumes to be almost the same) is not. The same is likely for the first example but here our model is correct.


# Research question


I want to know if this model can handle race appropiatley with two short examples:

In [40]:
model = model_poollstm

p1 = ['No man is outside']
h1 = ['Two afro-americans are sitting outside']

# Label - contradiction

p2 = ['All man die']
h2 = ['White people are immortal']

# Label - contradiction

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([1], device='cuda:0')
Prediction for first example is: tensor([1], device='cuda:0')


Interestingly enough it is wrong on both accounts. I will pose the same question without race differences again:

In [41]:
model = model_poollstm

p1 = ['No man is outside']
h1 = ['Two persons are sitting outside']

# Label - contradiction

p2 = ['All man die']
h2 = ['People are immortal']

# Label - contradiction

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([1], device='cuda:0')
Prediction for first example is: tensor([1], device='cuda:0')


The same result. This might mean that my model can not pick up the contradiction. Rather than having to do anything with race.

I will try again with more continous examples:

In [45]:
model = model_poollstm

p1 = ['Two black man walk outside']
h1 = ['The man talk deeply']

# Label - entailment

p2 = ['Two white man walk outside']
h2 = ['The man talk deeply']

# Label - entailment

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([1], device='cuda:0')
Prediction for first example is: tensor([1], device='cuda:0')


The answer is wrong on both accounts

Again with another model:

In [44]:
model = model_baseline

p1 = ['Two black man walk outside']
h1 = ['The man talk deeply']

# Label - contradiction

p2 = ['Two white man walk outside']
h2 = ['The man talk deeply']

# Label - contradiction

p1, p1_len = prepare_sentence_batch(p1, new_vocab, 150)
h1, h1_len = prepare_sentence_batch(h1, new_vocab, 150)
p1 = p1.to(device)
h1 = h1.to(device)
p1_len = p1_len.squeeze(1).to(device)
h1_len = h1_len.squeeze(1).to(device)

prediction1 = torch.argmax(model(p1, p1_len, h1, h1_len), dim=1)

p2, p2_len = prepare_sentence_batch(p2, new_vocab, 150)
h2, h2_len = prepare_sentence_batch(h2, new_vocab, 150)
p2 = p2.to(device)
h2 = h2.to(device)
p2_len = p2_len.squeeze(1).to(device)
h2_len = h2_len.squeeze(1).to(device)

prediction2 = torch.argmax(model(p2, p2_len, h2, h2_len), dim=1)

print('Labels: 0: entailment, 1: neutral, 2: contradiction')
print(f'Prediction for first example is: {prediction1}')
print(f'Prediction for second example is: {prediction2}')

Labels: 0: entailment, 1: neutral, 2: contradiction
Prediction for first example is: tensor([0], device='cuda:0')
Prediction for first example is: tensor([0], device='cuda:0')


The answer is correct for the baseline model. This is interesting as it means that although the poollstm model performs better it can also be worse. I found no bias in race but further study is needed.