In [6]:
import os
import io
import string
import pickle as pkl
from collections import Counter
import tqdm
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import importlib
import utils
import data
# Re-import the local helper modules so that edits made to utils.py / data.py
# on disk are picked up without restarting the kernel. The `imp` module was
# deprecated in Python 3.4 and removed in 3.12; importlib.reload is the
# supported equivalent and returns the same reloaded module object.
print(importlib.reload(utils))
print(importlib.reload(data))

<module 'utils' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/utils.py'>
<module 'data' from '/Users/diogomesquita/Documents/nyu/2y/nlp/homeworks/hw2/data.py'>


## Data Preprocessing

In [54]:
# PARAMS:
VOCAB_SIZE = 15000  # NOTE(review): not referenced by any cell visible here — confirm whether build_vocab should cap to it
PAD = '<pad>'  # special token placed first in the vocabulary by build_vocab
UNK = '<unk>'  # special token placed second; data_indices maps out-of-vocabulary tokens to it
emb_dim = 300  # number of columns in the embedding matrix allocated by build_vocab
BATCH_SIZE = 32  # NOTE(review): the DataLoader cell uses data.BATCH_SIZE instead — confirm which is intended
data_dir = "hw2_data"  # directory the SNLI .tsv files are read from

In [3]:
# Read the tab-separated SNLI train and validation splits from data_dir,
# in that order, into two raw DataFrames.
train_data_raw, val_data_raw = (
    pd.read_csv(os.path.join(data_dir, split_file), sep='\t')
    for split_file in ("snli_train.tsv", "snli_val.tsv")
)

In [4]:
# Integer class id assigned to each SNLI label string.
labels_ix = dict(contradiction=0, entailment=1, neutral=2)


def label_to_ix(df):
    """Convert the 'label' column of ``df`` to integer class ids.

    Args:
        df: DataFrame with a 'label' column whose values are keys of
            ``labels_ix``.

    Returns:
        A Series (same index and name as ``df['label']``) holding the id of
        each label.

    Raises:
        KeyError: if a label is not a key of ``labels_ix``.
    """
    label_column = df['label']
    return label_column.apply(lambda name: labels_ix[name])

# Integer targets for each split.
train_target, val_target = label_to_ix(train_data_raw), label_to_ix(val_data_raw)

# Keep only the two sentence columns and expose them under the names
# 'premise' and 'hypothesis' used by the rest of the notebook.
train_data = train_data_raw[['sentence1', 'sentence2']].rename(
    columns={'sentence1': 'premise', 'sentence2': 'hypothesis'}
)
val_data = val_data_raw[['sentence1', 'sentence2']].rename(
    columns={'sentence1': 'premise', 'sentence2': 'hypothesis'}
)

In [5]:
# Preview the first rows of the raw training frame (all original columns).
train_data_raw.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [6]:
# Preview the first rows after selecting and renaming the sentence columns.
train_data.head()

Unnamed: 0,premise,hypothesis
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .
1,A woman is smiling while the man next to her i...,Two people are next to each other .
2,"Across the river , you can see a large building .",The large building is full of apartments and t...
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...


In [7]:
# First five integer targets; compare against the 'label' column shown above.
train_target[:5]

0    2
1    1
2    2
3    0
4    0
Name: label, dtype: int64

In [8]:
# Number of sentence pairs in each split.
n_train, n_val = map(len, (train_data, val_data))
print("#examples train: ", n_train)
print("#examples val: ", n_val)

#examples train:  100000
#examples val:  1000


In [9]:
# Spot-check one randomly chosen training example. The seed is fixed so the
# same row is shown on every run.
np.random.seed(4364)
rx = np.random.randint(0, n_train, 1)[0]
print("ix: ", rx)
print("Premise:    ", train_data.iloc[rx,0])
print("Hypothesis: ", train_data.iloc[rx,1])
# rx is a row *position*, so read the target positionally as well; plain
# train_target[rx] is a label lookup that only agrees with iloc while the
# Series still has its default 0..n-1 index.
print("Label:      ", train_target.iloc[rx])

ix:  21110
Premise:     Six men shielding their eyes from the sun
Hypothesis:  People shield their eyes from the bright light .
Label:       2


In [10]:
def validate_data(df):
    """Print the position of every row whose premise or hypothesis is not a string.

    Rows are visited by position, so the frame's index labels do not matter
    (a filtered or re-indexed frame is handled the same as a fresh one).

    Args:
        df: DataFrame with 'premise' and 'hypothesis' columns.

    Returns:
        None. Findings are reported on stdout, one line per bad cell.
    """
    sentence_pairs = zip(df['premise'], df['hypothesis'])
    for ix, (premise, hypothesis) in enumerate(sentence_pairs):
        # Missing cells come back from pandas as float NaN / None, not str.
        if not isinstance(premise, str):
            print("Bad premise at index: ", ix)
        if not isinstance(hypothesis, str):
            print("Bad hypothesis at index: ", ix)
            
# Scan both splits for non-string premise/hypothesis cells; anything found is printed.
validate_data(train_data)
validate_data(val_data)

In [11]:
# Load spaCy's "en_core_web_sm" pipeline; tokenize() below calls it on every raw sentence.
tokenizer = spacy.load("en_core_web_sm")

def remove_punctuation(tokens):
    """Lower-case the tokens and drop those made up solely of punctuation.

    A token is dropped when every character of its text is in
    ``string.punctuation`` (so ",", "..." and "--" are all removed), while
    tokens that merely contain punctuation ("n't", "u.s.") are kept. The
    previous ``text not in string.punctuation`` test was a substring check and
    therefore let multi-character punctuation such as "..." through.

    Args:
        tokens: iterable of objects exposing a ``.text`` string attribute.

    Returns:
        List of lower-cased token strings, in input order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token.text.lower()
        for token in tokens
        # issuperset is True when all characters of the text are punctuation
        # (and for empty text, which the substring test also discarded).
        if not punctuation_chars.issuperset(token.text)
    ]
    
def tokenize(data):
    """Tokenize every premise/hypothesis pair of a DataFrame.

    Each sentence is run through the module-level ``tokenizer`` and then
    through ``remove_punctuation``. Rows are read by position, so the frame's
    index labels do not matter.

    Args:
        data: DataFrame with string columns 'premise' and 'hypothesis'.

    Returns:
        (data_tok, all_tokens) where ``data_tok`` is a list with one
        ``[premise_tokens, hypothesis_tokens]`` entry per row and
        ``all_tokens`` is the flat list of every token in row order.
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.auto picks the notebook widget
    # when one is available and falls back to the console bar otherwise.
    from tqdm.auto import tqdm as progress_bar

    all_tokens = []
    data_tok = []
    sentence_pairs = zip(data['premise'], data['hypothesis'])
    for premise_text, hypothesis_text in progress_bar(sentence_pairs, total=len(data)):
        premise = remove_punctuation(tokenizer(premise_text))
        hypothesis = remove_punctuation(tokenizer(hypothesis_text))
        # extend in place instead of building a temporary concatenated list
        all_tokens.extend(premise)
        all_tokens.extend(hypothesis)
        data_tok.append([premise, hypothesis])
    return data_tok, all_tokens

# Tokenize both splits. Only the training tokens are kept in all_tokens (the
# flat list later passed to build_vocab); the validation token list is discarded.
train_data_tok, all_tokens = tokenize(train_data)
val_data_tok, _ = tokenize(val_data)







In [46]:
# Reload the tokenized training pairs from 'snli_train_tok.p'.
# NOTE(review): the cell that writes this file appears *below* this one, so on
# a fresh checkout this load presumably fails until the file exists — confirm
# the intended cell order. val_data_tok is not reloaded here although later
# cells use it.
train_data_tok = utils.load_pkl_data('snli_train_tok.p')

In [49]:
# Rebuild the flat training-token list from the tokenized pairs:
# premise tokens first, then hypothesis tokens, pair by pair.
all_tokens = []
for premise, hypo in train_data_tok:
    all_tokens.extend(premise)
    all_tokens.extend(hypo)

In [50]:
# Total number of training tokens (with repeats).
len(all_tokens)

2038281

In [25]:
# Persist the tokenized splits and the flat token list via utils.save_pkl_data
# so the spaCy pass does not have to be repeated.
# NOTE(review): presumably these are written under utils' default data
# directory — confirm against utils.save_pkl_data.
utils.save_pkl_data(train_data_tok, 'snli_train_tok.p')
utils.save_pkl_data(all_tokens, 'snli_all_tokens.p')
utils.save_pkl_data(val_data_tok, 'snli_val_tok.p')

In [52]:
def load_vectors(fname):
    """Load word vectors from a whitespace-separated text embedding file.

    Each data line is ``<token> <v1> <v2> ...``. fastText ``.vec`` files start
    with a ``<num_words> <dim>`` header line; that line is detected (exactly
    two all-digit fields on the first line) and skipped so it is not stored as
    a bogus one-dimensional "word". Files without a header are read unchanged.

    Args:
        fname: path to the embedding file (UTF-8; undecodable bytes are ignored).

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array.
    """
    vectors = {}
    # Context manager so the (large) file handle is always closed.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_no, line in enumerate(fin):
            tokens = line.rstrip().split(' ')
            is_header = (
                line_no == 0
                and len(tokens) == 2
                and all(field.isdigit() for field in tokens)
            )
            if is_header:
                continue
            vectors[tokens[0]] = np.array(tokens[1:], dtype=np.float32)
    return vectors

# Token -> vector lookup read from the 'wiki-news-300d-1M.vec' file in the working directory.
tok2vec = load_vectors('wiki-news-300d-1M.vec')

In [None]:
# Number of distinct training tokens, before filtering to those that have a pretrained vector.
len(np.unique(all_tokens))

In [55]:
def build_vocab(all_tokens, tok2vec):
    """Build the vocabulary and its embedding matrix.

    The vocabulary is ``[PAD, UNK]`` followed by every corpus token that has a
    pretrained vector, ordered from most to least frequent. Corpus tokens that
    happen to equal ``PAD`` or ``UNK`` are not appended again, so the two
    special tokens always keep indices 0 and 1.

    Reads the module-level ``PAD``, ``UNK`` and ``emb_dim``.

    Args:
        all_tokens: iterable of token strings (with repeats) to count.
        tok2vec: mapping token -> vector of length ``emb_dim``.

    Returns:
        (ind2tok, tok2ind, ind2vec): the index->token list, the token->index
        dict, and a float32 tensor with one row per vocabulary entry. The PAD
        row is all zeros and the UNK row is drawn uniformly from [-1, 1).
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.auto selects the right frontend.
    from tqdm.auto import tqdm as progress_bar

    specials = (PAD, UNK)
    counts = Counter(all_tokens).most_common()
    ind2tok = list(specials) + [
        token for token, _ in counts
        if token in tok2vec and token not in specials
    ]
    tok2ind = {tok: ind for ind, tok in enumerate(ind2tok)}

    ind2vec = np.zeros((len(ind2tok), emb_dim), dtype=np.float32)
    ind2vec[tok2ind[UNK], :] = np.random.uniform(low=-1, size=(emb_dim,))

    # Copy pretrained vectors for everything after the special tokens; pass
    # total explicitly because enumerate() hides the length from the bar.
    first_real = len(specials)
    pretrained = enumerate(ind2tok[first_real:], start=first_real)
    for ix, tok in progress_bar(pretrained, total=len(ind2tok) - first_real):
        ind2vec[ix, :] = tok2vec[tok]
    return ind2tok, tok2ind, torch.from_numpy(ind2vec)
    
# Build the vocabulary from the training tokens, then hand all three artifacts
# to utils.save_pkl_data with data_dir='vocab' so later cells can reload them.
ind2tok, tok2ind, ind2vec = build_vocab(all_tokens, tok2vec)

utils.save_pkl_data(ind2tok, 'ind2tok.p', data_dir='vocab')
utils.save_pkl_data(tok2ind, 'tok2ind.p', data_dir='vocab')
utils.save_pkl_data(ind2vec, 'ind2vec.p', data_dir='vocab')




In [56]:
# Vocabulary size, including the PAD and UNK entries.
print(len(ind2tok))

18107


In [None]:
# Preview the first 10 embedding rows (first 50 dimensions each). The stray
# leading `for` made this cell a syntax error; a bare slice is what displays.
ind2vec[:10, :50]

In [42]:
def data_indices(data):
    """Map tokenized sentence pairs to vocabulary indices.

    Reads the module-level ``tok2ind`` and ``UNK``; tokens missing from
    ``tok2ind`` are mapped to the UNK index.

    Args:
        data: iterable of ``(premise_tokens, hypothesis_tokens)`` pairs.

    Returns:
        List with one ``[premise_indices, hypothesis_indices]`` entry per
        pair, each index list as long as the corresponding token list.
    """
    unk_ix = tok2ind[UNK]
    # Report progress against the real input size instead of a hard-coded
    # 100000; fall back to '?' for inputs that have no length.
    total = len(data) if hasattr(data, '__len__') else '?'
    data_ind = []
    for ix, (premise, hypothesis) in enumerate(data):
        if ix % 5000 == 4999:
            print(f"[{ix}/{total}]")
        premise_ind = [tok2ind.get(tok, unk_ix) for tok in premise]
        hypothesis_ind = [tok2ind.get(tok, unk_ix) for tok in hypothesis]
        data_ind.append([premise_ind, hypothesis_ind])
    return data_ind

# Convert both tokenized splits to index sequences using the vocabulary built above.
train_ind = data_indices(train_data_tok)
val_ind = data_indices(val_data_tok)

[4999/100000]
[9999/100000]
[14999/100000]
[19999/100000]
[24999/100000]
[29999/100000]
[34999/100000]
[39999/100000]
[44999/100000]
[49999/100000]
[54999/100000]
[59999/100000]
[64999/100000]
[69999/100000]
[74999/100000]
[79999/100000]
[84999/100000]
[89999/100000]
[94999/100000]
[99999/100000]


In [44]:
# Sanity check: indexing must not change sentence lengths. side 0 is the
# premise, side 1 the hypothesis.
for ix in range(len(train_ind)):
    for side in (0, 1):
        assert len(train_ind[ix][side]) == len(train_data_tok[ix][side])

In [45]:
# Persist the index sequences and integer targets for both splits; the next
# cell reloads exactly these four files.
utils.save_pkl_data(train_ind, 'snli_train_ind.p')
utils.save_pkl_data(train_target, 'snli_train_target.p')
utils.save_pkl_data(val_ind, 'snli_val_ind.p')
utils.save_pkl_data(val_target, 'snli_val_target.p')

In [2]:
# Reload the index sequences and targets written by the save cell above, so
# the modelling cells can start from here without redoing the preprocessing.
train_ind = utils.load_pkl_data('snli_train_ind.p')
train_target = utils.load_pkl_data('snli_train_target.p')
val_ind = utils.load_pkl_data('snli_val_ind.p')
val_target = utils.load_pkl_data('snli_val_target.p')

# Wrap each split in the project's dataset class (defined in data.py).
train_dataset = data.SNLIDataset(train_ind, train_target)
val_dataset = data.SNLIDataset(val_ind, val_target)

In [3]:
# Validation batches in fixed order, collated by data.collate_fn.
# NOTE(review): batch size comes from data.BATCH_SIZE, not the notebook-level BATCH_SIZE — confirm they agree.
val_loader = DataLoader(val_dataset, batch_size=data.BATCH_SIZE, shuffle=False, collate_fn=data.collate_fn)

In [9]:
# Debug pass over the validation loader: print every batch and check that the
# premise tensor's second dimension is always 85.
# NOTE(review): 85 is presumably the fixed padded length produced by
# data.collate_fn — confirm and replace the literal with that constant.
# NOTE(review): this prints full tensors for every batch; consider breaking
# after the first batch to keep the output short.
for ix, (premise, hypo, premise_len, hypo_len, targets) in enumerate(val_loader):
    print("premise: ", premise)
    print("hypo:", hypo)
    assert(premise.shape[1] == 85)
    print("-----")

premise:  tensor([[    48,     43,      7,  ...,      0,      0,      0],
        [   120,     14,    170,  ...,      0,      0,      0],
        [   757,   6794,     27,  ...,      0,      0,      0],
        ...,
        [    48,   9703,    388,  ...,      0,      0,      0],
        [     2,   1813,      4,  ...,      0,      0,      0],
        [     2,    229,    629,  ...,      0,      0,      0]])
hypo: tensor([[    49,      9,     13,  ...,      0,      0,      0],
        [  1001,     14,      9,  ...,      0,      0,      0],
        [    14,    308,    121,  ...,      0,      0,      0],
        ...,
        [     3,    388,      9,  ...,      0,      0,      0],
        [     3,   1813,      5,  ...,      0,      0,      0],
        [     2,    229,    629,  ...,      0,      0,      0]])
-----
premise:  tensor([[     2,      6,      8,  ...,      0,      0,      0],
        [  2517,     32,   1029,  ...,      0,      0,      0],
        [     2,     77,     29,  ...,      

premise:  tensor([[     2,     41,   1023,  ...,      0,      0,      0],
        [     4,   2299,      2,  ...,      0,      0,      0],
        [    49,      9,     13,  ...,      0,      0,      0],
        ...,
        [     2,     59,     26,  ...,      0,      0,      0],
        [    16,      2,     59,  ...,      0,      0,      0],
        [     2,    450,      6,  ...,      0,      0,      0]])
hypo: tensor([[     2,     29,   1023,  ...,      0,      0,      0],
        [     3,      6,      5,  ...,      0,      0,      0],
        [    13,     22,    384,  ...,      0,      0,      0],
        ...,
        [     3,     59,     26,  ...,      0,      0,      0],
        [     2,    111,      5,  ...,      0,      0,      0],
        [   181,      5,    210,  ...,      0,      0,      0]])
-----
premise:  tensor([[    14,    415,     82,  ...,      0,      0,      0],
        [     3,   8945,      4,  ...,      0,      0,      0],
        [    13,    102,      9,  ...,      

hypo: tensor([[     2,     21,      6,  ...,      0,      0,      0],
        [     3,   2604,      9,  ...,      0,      0,      0],
        [     2,    148,      5,  ...,      0,      0,      0],
        ...,
        [     2,      6,      5,  ...,      0,      0,      0],
        [     3,    109,      5,  ...,      0,      0,      0],
        [     2,     45,      8,  ...,      0,      0,      0]])
-----
premise:  tensor([[     2,    505,    206,  ...,      0,      0,      0],
        [     2,     35,     10,  ...,      0,      0,      0],
        [   803,      9,    541,  ...,      0,      0,      0],
        ...,
        [    13,     43,    380,  ...,      0,      0,      0],
        [     2,    692,      1,  ...,      0,      0,      0],
        [     2,      6,      4,  ...,      0,      0,      0]])
hypo: tensor([[   505,    613,     66,  ...,      0,      0,      0],
        [     3,     12,   3518,  ...,      0,      0,      0],
        [    49,      9,    803,  ...,      0,  

In [37]:
class GRU(nn.Module):
    """GRU sentence encoder over frozen pretrained embeddings.

    Encodes a batch of padded index sequences into one fixed-size vector per
    sequence: the final hidden state of the last GRU layer (forward and
    backward states concatenated when bidirectional).

    Args:
        lookup_table: float tensor (vocab_size, embed_size) of pretrained
            embeddings, loaded with ``nn.Embedding.from_pretrained``.
        embed_size: embedding dimension E (input size of the GRU).
        hidden_size: GRU hidden dimension H.
        num_layers: number of stacked GRU layers L.
        bidirectional: whether to run the GRU in both directions.
    """

    def __init__(self, lookup_table, embed_size, hidden_size, num_layers, bidirectional):
        super().__init__()
        self.num_directions = 2 if bidirectional else 1
        self.hidden_size = hidden_size
        # init_state() needs this; it was previously never stored, so every
        # forward pass raised AttributeError.
        self.num_layers = num_layers
        self.embed = nn.Embedding.from_pretrained(lookup_table)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)

    def init_state(self, batch_size):
        """Return the initial hidden state, shape (L * D, batch_size, H).

        Zeros (rather than random noise) so that encoding the same input
        twice gives the same result; allocated with the dtype and device of
        the GRU weights.
        """
        weight = next(self.rnn.parameters())
        return weight.new_zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)

    def forward(self, X, X_len):
        """Encode a padded batch.

        Args:
            X: LongTensor (B, T) of token indices, padded on the right.
            X_len: true length of each sequence (tensor or sequence of ints,
                length B, every entry >= 1).

        Returns:
            Tensor (B, D * H) with the last layer's final hidden state of
            each sequence.
        """
        batch_size = X.size(0)
        embeddings = self.embed(X)  # (B, T, E)
        h_0 = self.init_state(batch_size)

        # Pack so the GRU stops at each sequence's true length. This matters
        # for the backward direction, which would otherwise consume the
        # padding first. pack_padded_sequence requires CPU int64 lengths.
        lengths = torch.as_tensor(X_len, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False
        )
        _, h_n = self.rnn(packed, h_0)  # h_n: (L * D, B, H)

        # Keep only the last layer: (L, D, B, H) -> (D, B, H) -> (B, D * H),
        # forward state first, then backward.
        last_layer = h_n.reshape(
            self.num_layers, self.num_directions, batch_size, self.hidden_size
        )[-1]
        return last_layer.transpose(0, 1).reshape(
            batch_size, self.num_directions * self.hidden_size
        )

In [58]:
# Reload the embedding matrix saved by the vocabulary cell and build the encoder.
ind2vec = utils.load_pkl_data('ind2vec.p', data_dir='vocab')
print(ind2vec)
embed_size = 300  # NOTE(review): duplicates emb_dim from the PARAMS cell — confirm they must stay equal
hidden_size = 800
num_layers = 1
bidirectional = False

rnn = GRU(ind2vec, embed_size, hidden_size, num_layers, bidirectional)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3023,  0.3331,  0.9758,  ...,  0.0808,  0.7496, -0.8975],
        [ 0.0047,  0.0223, -0.0087,  ...,  0.1479,  0.1324, -0.0318],
        ...,
        [ 0.0327, -0.0134, -0.0800,  ...,  0.0548,  0.1061,  0.0346],
        [-0.0277, -0.0269,  0.1038,  ...,  0.1449,  0.1653,  0.1201],
        [ 0.1161, -0.0482,  0.1281,  ...,  0.1275, -0.1913,  0.0195]])


In [None]:
for i, ()