In [1]:
import os
import time
import io
import string
import pickle as pkl
from collections import Counter
import tqdm
import numpy as np
import pandas as pd
import spacy
import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import imp
import utils
import data
import models
import train_helpers

# Re-import the local helper modules so that edits to their .py files take effect
# without restarting the kernel. reload() returns the module, which is printed.
# NOTE(review): `imp` is deprecated in favour of `importlib` (importlib.reload);
# consider migrating the import cell.
for _module in (utils, data, models, train_helpers):
    print(imp.reload(_module))

<module 'utils' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/utils.py'>
<module 'data' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/data.py'>
<module 'models' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/models.py'>
<module 'train_helpers' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/train_helpers.py'>


## Data Preprocessing

In [2]:
# PARAMS:
# -- special vocabulary symbols (padding / out-of-vocabulary)
PAD = '<pad>'
UNK = '<unk>'
# -- vocabulary and embedding sizes
# NOTE(review): VOCAB_SIZE is not referenced by any cell visible in this notebook.
VOCAB_SIZE = 15000
emb_dim = 300
# -- batching (BATCH_SIZE is assigned again in the data-loader cell)
BATCH_SIZE = 32
# -- directories checked for the raw/cached data and for the vocabulary pickles
data_dir = "hw2_data"
vocab_dir = "vocab"

In [3]:
# When True, the cached preprocessing cells below (tokenization, vocabulary,
# index conversion) recompute and re-save their results even if the pickles exist.
RUN_EVERYTHING = False

In [4]:
# Read the raw train / validation splits (tab-separated files inside data_dir).
train_data_raw, val_data_raw = (
    pd.read_csv(os.path.join(data_dir, tsv_name), sep='\t')
    for tsv_name in ("snli_train.tsv", "snli_val.tsv")
)

In [5]:
# Class name -> integer id used as the classification target.
labels_ix = dict(contradiction=0, entailment=1, neutral=2)


def label_to_ix(df):
    """Return the 'label' column of `df` encoded as integer class ids.

    Each value is looked up in `labels_ix`; a value that is not one of its
    keys raises KeyError. The result keeps the index and name of df['label'].
    """
    return df['label'].map(labels_ix.__getitem__)

# Integer-encode the labels of both splits and persist them as pickles.
train_target = label_to_ix(train_data_raw)
val_target = label_to_ix(val_data_raw)
utils.save_pkl_data(train_target, 'snli_train_target.p')
utils.save_pkl_data(val_target, 'snli_val_target.p')

# Keep only the two sentence columns, renamed to premise / hypothesis.
sentence_cols = {'sentence1': 'premise', 'sentence2': 'hypothesis'}
train_data = train_data_raw[list(sentence_cols)].rename(columns=sentence_cols)
val_data = val_data_raw[list(sentence_cols)].rename(columns=sentence_cols)

In [6]:
# Preview the first rows of the raw training frame (sentence1, sentence2, label).
train_data_raw.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [7]:
# Preview the renamed premise / hypothesis frame.
train_data.head()

Unnamed: 0,premise,hypothesis
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .
1,A woman is smiling while the man next to her i...,Two people are next to each other .
2,"Across the river , you can see a large building .",The large building is full of apartments and t...
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...


In [8]:
# First five integer targets; compare with the string labels previewed above.
train_target[:5]

0    2
1    1
2    2
3    0
4    0
Name: label, dtype: int64

In [9]:
# Number of examples per split; n_train is reused when sampling an example below.
n_train, n_val = map(len, (train_data, val_data))
print("#examples train: ", n_train)
print("#examples val: ", n_val)

#examples train:  100000
#examples val:  1000


In [10]:
# Print one training example picked with a fixed seed, so the same row is
# shown on every run.
np.random.seed(4364)
rx = np.random.randint(0, n_train, 1)[0]
sample_row = train_data.iloc[rx]
print("ix: ", rx)
print("Premise:    ", sample_row.iloc[0])
print("Hypothesis: ", sample_row.iloc[1])
print("Label:      ", train_target[rx])

ix:  21110
Premise:     Six men shielding their eyes from the sun
Hypothesis:  People shield their eyes from the bright light .
Label:       2


In [11]:
def validate_data(df):
    """Print a message for every premise / hypothesis cell that is not a string.

    Args:
        df: DataFrame with 'premise' and 'hypothesis' columns.

    The reported index is the row's index label. Rows are walked through the
    columns directly rather than with `df.loc[i]` for i in range(len(df)), so
    frames whose index is not 0..n-1 (e.g. after filtering) are handled too,
    and no per-cell label lookup is done. Nothing is returned.
    """
    rows = zip(df.index, df['premise'], df['hypothesis'])
    for ix, premise, hypothesis in rows:
        if not isinstance(premise, str):
            print("Bad premise at index: ", ix)
        if not isinstance(hypothesis, str):
            print("Bad hypothesis at index: ", ix)
            
# Check both splits; any non-string cell is reported on stdout.
for _frame in (train_data, val_data):
    validate_data(_frame)

In [12]:
# spaCy pipeline used as the tokenizer: tokenize() calls it on each raw sentence
# and reads the `.text` of the tokens it yields.
tokenizer = spacy.load("en_core_web_sm")

def remove_punctuation(tokens):
    """Return the lower-cased text of every token that is not punctuation.

    Args:
        tokens: iterable of objects exposing a `.text` string.

    A token is dropped when its text is a substring of `string.punctuation`,
    so single punctuation characters are removed while multi-character
    tokens such as '...' are kept.
    """
    kept = []
    for token in tokens:
        text = token.text
        if text in string.punctuation:
            continue
        kept.append(text.lower())
    return kept
    
def tokenize(data):
    """Tokenize every premise / hypothesis pair of a split.

    Args:
        data: frame with 'premise' and 'hypothesis' columns; rows are fetched
            with `data.loc[i, ...]` for i in range(len(data)).

    Returns:
        (pairs, flat_tokens): `pairs[i]` is [premise_tokens, hypothesis_tokens]
        for row i, and `flat_tokens` is the concatenation of all of those
        token lists in row order (premise first).
    """
    pairs = []
    flat_tokens = []
    n_rows = len(data)
    for row in tqdm.tqdm_notebook(range(n_rows)):
        premise_toks = remove_punctuation(tokenizer(data.loc[row, 'premise']))
        hypothesis_toks = remove_punctuation(tokenizer(data.loc[row, 'hypothesis']))
        flat_tokens.extend(premise_toks)
        flat_tokens.extend(hypothesis_toks)
        pairs.append([premise_toks, hypothesis_toks])
    return pairs, flat_tokens

# Tokenization is slow, so its three outputs are cached as pickles
# (presumably written under data_dir by utils.save_pkl_data; the existence
# check below already relies on that).
f_train = 'snli_train_tok.p'
f_val = 'snli_val_tok.p'
f_all_tokens = 'snli_all_tokens.p'

# Reuse the cache only when every file is there: all three are loaded in the
# else-branch, so checking a single one would crash on a partial cache.
tok_cache_complete = all(
    os.path.isfile(os.path.join(data_dir, fname))
    for fname in (f_train, f_val, f_all_tokens)
)

if RUN_EVERYTHING or not tok_cache_complete:
    print('running')
    train_data_tok, all_tokens = tokenize(train_data)
    val_data_tok, _ = tokenize(val_data)
    utils.save_pkl_data(train_data_tok, f_train)
    utils.save_pkl_data(all_tokens, f_all_tokens)
    utils.save_pkl_data(val_data_tok, f_val)
else:
    train_data_tok = utils.load_pkl_data(f_train)
    all_tokens = utils.load_pkl_data(f_all_tokens)
    val_data_tok = utils.load_pkl_data(f_val)


In [13]:
# Corpus statistics for the training split: token occurrences vs. distinct tokens.
n_tokens_total = len(all_tokens)
n_tokens_unique = len(set(all_tokens))
print("number tokens: ", n_tokens_total)
print("number unique tokens: ", n_tokens_unique)

number tokens:  2038281
number unique tokens:  19643


In [14]:
def load_vectors(fname):
    """Load word vectors from a text file into a {word: float32 array} dict.

    Each data line is `word v1 v2 ... vd`, separated by single spaces.

    Args:
        fname: path of the vector file (read as UTF-8, undecodable bytes ignored).

    Returns:
        dict mapping each word to a 1-D np.float32 array of its vector.

    A leading fastText-style header line ("<count> <dim>", two integers) is
    skipped so that it does not end up as a bogus word with a 1-element
    vector. Blank lines are skipped, and the file is closed on exit.
    """
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_no, line in enumerate(fin):
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                continue  # blank line
            is_header = (
                line_no == 0
                and len(tokens) == 2
                and tokens[0].isdigit()
                and tokens[1].isdigit()
            )
            if is_header:
                continue
            data[tokens[0]] = np.array(tokens[1:], dtype=np.float32)
    return data

def build_vocab(all_tokens, tok2vec):
    """Build the vocabulary and its embedding lookup table.

    Args:
        all_tokens: flat list of corpus tokens.
        tok2vec: mapping token -> pretrained vector of length `emb_dim`.

    Returns:
        (ind2tok, tok2ind, ind2vec): the token list (PAD, UNK, then every
        corpus token that has a pretrained vector, most frequent first), its
        inverse mapping, and a float32 torch tensor of shape
        (len(ind2tok), emb_dim). The PAD row stays all zeros; the UNK row is
        drawn uniformly from [-1, 1).

    NOTE(review): VOCAB_SIZE from the config cell is not applied here; every
    token with a pretrained vector is kept.
    """
    ranked = Counter(all_tokens).most_common()
    known_tokens = [tok for tok, _ in ranked if tok in tok2vec]
    ind2tok = [PAD, UNK] + known_tokens
    tok2ind = dict(zip(ind2tok, range(len(ind2tok))))

    ind2vec = np.zeros((len(ind2tok), emb_dim), dtype=np.float32)
    ind2vec[tok2ind[UNK], :] = np.random.uniform(low=-1, size=(emb_dim,))
    for ix, tok in tqdm.tqdm_notebook(enumerate(ind2tok)):
        # The first two slots are PAD / UNK and have no pretrained vector.
        if ix >= 2:
            ind2vec[ix, :] = tok2vec[tok]
    return ind2tok, tok2ind, torch.from_numpy(ind2vec)
    
# The vocabulary and lookup table are cached as three pickles in vocab_dir.
# Rebuild unless all of them exist: the else-branch loads every one, so
# checking only 'ind2vec.p' would crash on a partial cache. The same
# `vocab_dir` is used for the check and for save/load so they cannot drift.
vocab_files = ('ind2tok.p', 'tok2ind.p', 'ind2vec.p')
vocab_cache_complete = all(
    os.path.isfile(os.path.join(vocab_dir, fname)) for fname in vocab_files
)

if RUN_EVERYTHING or not vocab_cache_complete:
    print('running')
    tok2vec = load_vectors('wiki-news-300d-1M.vec')
    ind2tok, tok2ind, ind2vec = build_vocab(all_tokens, tok2vec)
    utils.save_pkl_data(ind2tok, 'ind2tok.p', data_dir=vocab_dir)
    utils.save_pkl_data(tok2ind, 'tok2ind.p', data_dir=vocab_dir)
    utils.save_pkl_data(ind2vec, 'ind2vec.p', data_dir=vocab_dir)
else:
    ind2tok = utils.load_pkl_data('ind2tok.p', data_dir=vocab_dir)
    tok2ind = utils.load_pkl_data('tok2ind.p', data_dir=vocab_dir)
    ind2vec = utils.load_pkl_data('ind2vec.p', data_dir=vocab_dir)

In [15]:
# Shape of the embedding table: (number of vocabulary entries, embedding dimension).
print("Size of lookup table: ", ind2vec.shape)

Size of lookup table:  torch.Size([18107, 300])


In [16]:
def data_indices(data):
    """Convert tokenized sentence pairs into vocabulary-index pairs.

    Args:
        data: sequence of (premise_tokens, hypothesis_tokens) pairs.

    Returns:
        list of [premise_indices, hypothesis_indices], one entry per input
        pair and in the same order. Tokens missing from `tok2ind` map to the
        index of UNK.

    A progress line is printed every 5000 pairs. Its total is taken from
    `len(data)` rather than a hard-coded 100000, so it is also right for the
    validation split.
    """
    unk_ix = tok2ind[UNK]
    total = len(data)
    data_ind = []
    for ix, (premise, hypothesis) in enumerate(data):
        if ix % 5000 == 4999:
            print(f"[{ix}/{total}]")
        premise_ind = [tok2ind.get(tok, unk_ix) for tok in premise]
        hypothesis_ind = [tok2ind.get(tok, unk_ix) for tok in hypothesis]
        data_ind.append([premise_ind, hypothesis_ind])
    return data_ind

# Index-converted splits are cached as pickles. Rebuild unless both files
# exist: the else-branch loads both, so checking only the train file would
# crash when the validation pickle is missing.
ind_files = ('snli_train_ind.p', 'snli_val_ind.p')
ind_cache_complete = all(
    os.path.isfile(os.path.join(data_dir, fname)) for fname in ind_files
)

if RUN_EVERYTHING or not ind_cache_complete:
    print('running')
    train_ind = data_indices(train_data_tok)
    val_ind = data_indices(val_data_tok)
    utils.save_pkl_data(train_ind, 'snli_train_ind.p')
    utils.save_pkl_data(val_ind, 'snli_val_ind.p')
else:
    train_ind = utils.load_pkl_data('snli_train_ind.p')
    val_ind = utils.load_pkl_data('snli_val_ind.p')

In [17]:
# Invariant: index conversion must not add or drop tokens, so each indexed
# sentence has the same length as its tokenized source.
for ix, (premise_ind, hypothesis_ind) in enumerate(train_ind):
    premise_tok, hypothesis_tok = train_data_tok[ix]
    assert len(premise_ind) == len(premise_tok)
    assert len(hypothesis_ind) == len(hypothesis_tok)

In [18]:
# SMALL restricts training to the first 5*32 examples for quick smoke runs;
# the validation set is always used in full.
SMALL = True
train_dataset = data.SNLI_Dataset(
    train_ind[:5*32] if SMALL else train_ind,
    train_target,
)
val_dataset = data.SNLI_Dataset(val_ind, val_target)

In [None]:
# Data loaders: training batches are shuffled, validation keeps dataset order.
BATCH_SIZE = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data.collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data.collate_fn,
)

# model
ind2vec = utils.load_pkl_data('ind2vec.p', data_dir='vocab')
embed_size = 300
premise_hidden_size = hypo_hidden_size = 300
linear_hidden_size = 100
num_layers = 1
bidirectional = False
snli_model_rnn = models.SNLI_Model_RNN(
    ind2vec,
    embed_size,
    premise_hidden_size,
    hypo_hidden_size,
    linear_hidden_size,
    num_layers,
    bidirectional,
)

# optim: only parameters that require gradients are handed to Adam
lr = 0.001
trainable_params = [p for p in snli_model_rnn.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=lr)

loss_fn = nn.CrossEntropyLoss()

In [21]:
# Names of the per-batch fields handed to the training helper
# (presumably matching what data.collate_fn produces; confirm in train_helpers).
batch_params_key = ['premise', 'hypo', 'premise_len', 'hypo_len', 'targets']

# Train the model the optimizer was built for. The previous argument,
# `snli_model_rnn_pack`, is not defined anywhere in this notebook and only
# resolved through leftover kernel state; the optimizer above holds the
# parameters of `snli_model_rnn`, so a different model would never be updated.
train_helper = train_helpers.TrainHelper(snli_model_rnn, loss_fn, optimizer, batch_params_key, 'rnn_model')
# Positional 2 = number of epochs (the log reads "Epoch [1/2]").
out = train_helper.train_loop(train_loader, val_loader, 2, save_freq=2)

evaluate time:
		-elapsed: 1.2391560077667236
Epoch [1/2]; Batch [1/5]:
	Train loss  : 1.100
	Val loss    : 1.099
	Train acc   : 0.312
	Val acc     : 33.9
	Best val acc: 33.9
	Best model  : epoch_1_batch_1.pt
	Elapsed     : 1.322725534439087 s

evaluate time:
		-elapsed: 1.1616828441619873
Epoch [1/2]; Batch [3/5]:
	Train loss  : 1.100
	Val loss    : 1.098
	Train acc   : 0.328
	Val acc     : 34.0
	Best val acc: 34.0
	Best model  : epoch_1_batch_3.pt
	Elapsed     : 1.2340919971466064 s

evaluate time:
		-elapsed: 1.1452140808105469
Epoch [2/2]; Batch [1/5]:
	Train loss  : 1.097
	Val loss    : 1.099
	Train acc   : 0.344
	Val acc     : 33.6
	Best val acc: 34.0
	Best model  : epoch_1_batch_3.pt
	Elapsed     : 1.233088493347168 s

evaluate time:
		-elapsed: 1.3716437816619873
Epoch [2/2]; Batch [3/5]:
	Train loss  : 1.101
	Val loss    : 1.098
	Train acc   : 0.312
	Val acc     : 34.0
	Best val acc: 34.0
	Best model  : epoch_1_batch_3.pt
	Elapsed     : 1.410137414932251 s



In [68]:
# Scratch experiment: a random (B, S, C) = (2, 3, 4) tensor used by the next
# cells to inspect how transpose reorders axes.
B = 2
S = 3
C = 4
a = torch.rand(B, S, C)
a

tensor([[[ 0.3624,  0.5362,  0.2572,  0.1362],
         [ 0.1073,  0.2563,  0.9236,  0.1551],
         [ 0.5120,  0.8264,  0.9184,  0.9055]],

        [[ 0.9326,  0.7553,  0.1436,  0.5508],
         [ 0.0753,  0.2618,  0.5464,  0.6401],
         [ 0.0592,  0.4117,  0.6657,  0.4487]]])

In [69]:
# Swap axes 1 and 2: the result has shape (B, C, S).
print(a.transpose(1,2))

tensor([[[ 0.3624,  0.1073,  0.5120],
         [ 0.5362,  0.2563,  0.8264],
         [ 0.2572,  0.9236,  0.9184],
         [ 0.1362,  0.1551,  0.9055]],

        [[ 0.9326,  0.0753,  0.0592],
         [ 0.7553,  0.2618,  0.4117],
         [ 0.1436,  0.5464,  0.6657],
         [ 0.5508,  0.6401,  0.4487]]])


In [70]:
# Argument order does not matter: transpose(2,1) prints the same tensor as transpose(1,2).
print(a.transpose(2,1))

tensor([[[ 0.3624,  0.1073,  0.5120],
         [ 0.5362,  0.2563,  0.8264],
         [ 0.2572,  0.9236,  0.9184],
         [ 0.1362,  0.1551,  0.9055]],

        [[ 0.9326,  0.0753,  0.0592],
         [ 0.7553,  0.2618,  0.4117],
         [ 0.1436,  0.5464,  0.6657],
         [ 0.5508,  0.6401,  0.4487]]])


In [48]:
# Reset both names to None. Note that `net2` is read again by the final
# evaluation cell of this notebook.
net1=None
net2=None

In [49]:
# Confirm that net1 no longer references a model.
print(net1)

None


In [64]:
# Sweep the saved checkpoints (epochs 1-6, every 500th batch) and report
# validation accuracy and loss for each one.
hidden_size = 500
for epoch in range(1, 7):
    for i in range(499, 3000, 500):
        print('model: ' + f'epoch_{epoch}_batch_{i}.pt')
        # Fresh model of the checkpoint's architecture, then restore its weights.
        net = models.SNLI_Model_RNN(ind2vec, 300, hidden_size, hidden_size, 80, 1, True)
        state_dict = torch.load(f'rnn_model_500_cat/epoch_{epoch}_batch_{i}.pt')
        net.load_state_dict(state_dict)
        helper = train_helpers.TrainHelper(net, loss_fn, optimizer, batch_params_key)
        val_acc = helper.evaluate(val_loader, accuracy=True)
        print("\tAcc: ", val_acc)
        val_loss = helper.evaluate(val_loader, accuracy=False)
        print("\tLoss: ", val_loss)
        print()

model: epoch_1_batch_499.pt
	Acc:  35.2
	Loss:  1.0965064764022827

model: epoch_1_batch_999.pt
	Acc:  39.1
	Loss:  1.09356689453125

model: epoch_1_batch_1499.pt
	Acc:  46.3
	Loss:  1.0887209177017212

model: epoch_1_batch_1999.pt
	Acc:  44.6
	Loss:  1.0817081928253174

model: epoch_1_batch_2499.pt
	Acc:  46.4
	Loss:  1.0731196403503418

model: epoch_1_batch_2999.pt
	Acc:  48.1
	Loss:  1.064650058746338

model: epoch_2_batch_499.pt
	Acc:  48.0
	Loss:  1.049410343170166

model: epoch_2_batch_999.pt
	Acc:  47.5
	Loss:  1.0386450290679932

model: epoch_2_batch_1499.pt
	Acc:  48.7
	Loss:  1.0296701192855835

model: epoch_2_batch_1999.pt
	Acc:  50.5
	Loss:  1.017623782157898

model: epoch_2_batch_2499.pt
	Acc:  49.0
	Loss:  1.0154445171356201

model: epoch_2_batch_2999.pt
	Acc:  51.2
	Loss:  1.0078098773956299

model: epoch_3_batch_499.pt
	Acc:  49.6
	Loss:  1.0009291172027588

model: epoch_3_batch_999.pt
	Acc:  51.4
	Loss:  1.0061918497085571

model: epoch_3_batch_1499.pt
	Acc:  50.9
	Los

KeyboardInterrupt: 

Acc:  35.7
Loss:  35.5


In [60]:
# NOTE(review): hidden state. In this notebook `net2` is only ever assigned
# None, and this cell's execution count (60) precedes the cell that creates
# `helper` (64). The recorded output came from objects that are no longer
# defined here; on a fresh Restart & Run All, helper.model would be None.
# Confirm which checkpoint `net2` is meant to hold and load it explicitly.
helper.model = net2
print("Acc: ", helper.evaluate(val_loader, accuracy=True))
print("Loss: ", helper.evaluate(val_loader))

Acc:  51.6
Loss:  0.983454167842865
