# Training on SNLI

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
from sklearn.model_selection import train_test_split
import pandas as pd
random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

In [15]:
def label2int(labels):
    list = []
    for word in labels:
        if word == 'entailment':
            list.append(0)
        elif word == 'contradiction':
            list.append(1)
        else:
            list.append(2)
    return list
  
def tokenize(strings):
    new = []
    for line in strings:
        split = line.split()
        split[0] = split[0].lower()
        new.append(split)
    return new

data = pd.read_table("snli_train.tsv")
val_data = pd.read_table("snli_val.tsv")

# data = data.iloc[:100,:]
# val_data = val_data.iloc[:20,:]

X = data.drop(['label'],axis=1)
y = label2int(data['label'])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train = data.drop(['label'],axis=1)
y_train = label2int(data['label'])

stc1_train = tokenize(X_train['sentence1'])
stc2_train = tokenize(X_train['sentence2'])
# stc1_test = tokenize(X_test['sentence1'])
# stc2_test = tokenize(X_test['sentence2'])
stc1_val = tokenize(val_data['sentence1'])
stc2_val = tokenize(val_data['sentence2'])
y_val = label2int(val_data['label'])

Build Vocab

In [4]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    cnt = 0
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
        cnt = cnt + 1
        if cnt > 50000:
            break      
    return data

embedding_vectors = load_vectors('wiki-news-300d-1M.vec')

def build_vocab(all_vectors):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    max_len = max([len(key) for key in all_vectors])
    vocab = all_vectors.keys()
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token, max_len

token2id, id2token, max_len = build_vocab(embedding_vectors)

In [5]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

stc1_train_indices = token2index_dataset(stc1_train)
stc2_train_indices = token2index_dataset(stc2_train)
stc1_val_indices = token2index_dataset(stc1_val)
stc2_val_indices = token2index_dataset(stc2_val)
stc1_test_indices = token2index_dataset(stc1_test)
stc2_test_indices = token2index_dataset(stc2_test)

# double checking
print ("Train dataset size is {}".format(len(stc1_train_indices)))
print ("Val dataset size is {}".format(len(stc1_val_indices)))
# print ("Test dataset size is {}".format(len(stc1_test_indices)))

Train dataset size is 100000
Val dataset size is 1000


In [6]:
MAX_SENTENCE_LENGTH = max([len(sentence) for sentence in stc1_train])

import numpy as np
import torch
from torch.utils.data import Dataset

class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    def __init__(self, stc1_data_list, stc2_data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.stc1_data_list = stc1_data_list
        self.stc2_data_list = stc2_data_list
        self.target_list = target_list

        assert (len(self.stc1_data_list) == len(self.target_list))
        assert (len(self.stc1_data_list) == len(self.stc2_data_list))

    def __len__(self):
        return len(self.stc1_data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        stc1_token_idx = self.stc1_data_list[key][:MAX_SENTENCE_LENGTH]
        stc2_token_idx = self.stc2_data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
#         print(len(stc1_token_idx),len(stc2_token_idx))
        return [stc1_token_idx, stc2_token_idx, len(stc1_token_idx), len(stc2_token_idx), label]

def SNLI_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list = []

    for datum in batch:
        label_list.append(datum[4])
        length_list.append(datum[2])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        data_list1.append(padded_vec1)
        padded_vec2 = np.pad(np.array(datum[1]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])),
                                mode="constant", constant_values=0)
        data_list2.append(padded_vec2)    
        
#     sorting decreasing order
    ind_dec_order = np.argsort(length_list)[::-1]
    data_list1 = np.array(data_list1)[ind_dec_order]
    data_list2 = np.array(data_list2)[ind_dec_order]
    length_list = np.array(length_list)[ind_dec_order]
    label_list = np.array(label_list)[ind_dec_order]  
    return [torch.from_numpy(np.array(data_list1)), torch.from_numpy(np.array(data_list2)),torch.LongTensor(length_list),torch.LongTensor(label_list)]


BATCH_SIZE = 32
train_dataset = SNLIDataset(stc1_train_indices, stc2_train_indices, y_train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)

val_dataset = SNLIDataset(stc1_val_indices,stc2_val_indices, y_val)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)

test_dataset = SNLIDataset(stc1_test_indices, stc2_test_indices, y_test)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=False)

## RNN class

In [13]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        #      expected_hidden_size = (self.num_layers * num_directions, mini_batch, self.hidden_size)
        # output (seq_len, batch, hidden_size * num_directions)
        # h_n (num_layers * num_directions, batch, hidden_size)
        super(RNN, self).__init__()
        self.emb_size = emb_size
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        # what is batch dimension
        # use following line for bidirectional GRU with num_layers = 1 dropout =1  
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2*hidden_size, num_classes)
        self.dropout = nn.Dropout(p=0.2)

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers*num_directions, batch_size, hidden_size)
        hidden = torch.randn(2*self.num_layers, batch_size, self.hidden_size)

        return hidden

    def forward(self, x, lengths):
        # reset hidden state

        batch_size, seq_len = x.size()
# implement the rnn from tensor 
        self.hidden = self.init_hidden(batch_size)

        # get embedding of characters
        embed = self.embedding(x)
        m = (x == 1).type(torch.FloatTensor)
        m = m.unsqueeze(2).repeat(1, 1, self.emb_size)
        embed = m * embed + (1-m) * embed.clone().detach()
        # pack padded sequence
        # sequence length in descending order
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.numpy(), batch_first=True)
        # fprop though RNN
        # rnn_out: batch size*sequence length*hidden dim
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        # undo packing
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
# ends
        self.hidden = self.dropout(self.hidden)
        # the last hidden state 
        logits = self.hidden[0]
        return logits

## Training on RNN

In [14]:
# accuracy chart
chart = pd.DataFrame()
model = RNN(emb_size=100, hidden_size=100, num_layers=1, num_classes=3, vocab_size=len(id2token))
# suppose in_features = 66
rfc_model = nn.Sequential(nn.Linear(200, 200), nn.ReLU(inplace=True), nn.Linear(200, 3))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
# CrossEntropyLoss() combines nn.LogSoftmax() and nn.NLLLoss() in one single class
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, data2, lengths, labels in loader:
        data_batch1, data_batch2, lengths_batch, label_batch = data1, data2, lengths, labels
        output1 = model(data_batch1, lengths_batch)
        output2 = model(data_batch2, lengths_batch)
        outputs = torch.cat([output1, output2], dim=1)
        outputs = rfc_model(outputs)
        loss = criterion(outputs, labels)
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total), loss

for epoch in range(num_epochs):
    total = 0
    correct = 0
    for i, (data1,data2,lengths,labels) in enumerate(train_loader):
        outputs = 0
        model.train()
        optimizer.zero_grad()
        # Forward pass
        # [32, 400]
        output1 = model(data1,lengths)
        output2 = model(data2,lengths)     
        outputs = torch.cat([output1, output2], dim=1)
#         torch.Size([32, 400])
        outputs = rfc_model(outputs)
#     torch.Size([32, 3])
        loss = criterion(outputs, labels)
     
        # Backward and optimize
        loss.backward()
        optimizer.step()
        
#         # validate every 100 iterations
#         if i > 0 and i % 100 == 0:
    # validate
    val_acc, val_loss = test_model(val_loader, model)    
    predicted = outputs.max(1, keepdim=True)[1]
    total += labels.size(0)
    correct += predicted.eq(labels.view_as(predicted)).sum().item()
    train_acc = 100 * correct / total
    chart = chart.append(pd.Series([loss.item(), val_loss.item(), train_acc,val_acc]),ignore_index=True)    
    print('Epoch: [{}/{}], Step: [{}/{}],train acc: {}, Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), train_acc,val_acc))  
    
    
chart.columns = ['train loss','val loss', 'train acc', 'val acc']


Epoch: [1/10], Step: [3125/3125],train acc: 46.875, Validation Acc: 52.0
Epoch: [2/10], Step: [3125/3125],train acc: 50.0, Validation Acc: 55.6
Epoch: [3/10], Step: [3125/3125],train acc: 56.25, Validation Acc: 57.6
Epoch: [4/10], Step: [3125/3125],train acc: 68.75, Validation Acc: 59.4
Epoch: [5/10], Step: [3125/3125],train acc: 53.125, Validation Acc: 59.4
Epoch: [6/10], Step: [3125/3125],train acc: 65.625, Validation Acc: 60.9
Epoch: [7/10], Step: [3125/3125],train acc: 62.5, Validation Acc: 60.2
Epoch: [8/10], Step: [3125/3125],train acc: 65.625, Validation Acc: 61.5
Epoch: [9/10], Step: [3125/3125],train acc: 59.375, Validation Acc: 59.3
Epoch: [10/10], Step: [3125/3125],train acc: 65.625, Validation Acc: 60.7


In [28]:
# rnn, hidden_size = 100, no dropout
chart1

Unnamed: 0,train loss,val loss,train acc,val acc
0,0.899823,0.994854,71.875,52.3
1,0.919549,0.893461,62.5,57.1
2,0.848188,0.768986,62.5,58.4
3,0.783004,0.704093,62.5,59.9
4,0.780601,0.783618,68.75,59.4
5,0.682274,1.044791,75.0,60.2
6,0.826559,0.771843,68.75,60.7
7,0.832078,1.279301,59.375,59.9
8,0.85919,1.118668,68.75,59.6
9,0.95702,1.126307,50.0,58.6


In [17]:
# rnn, hidden_size = 100, has dropout
chart2 = chart
chart2

Unnamed: 0,train loss,val loss,train acc,val acc
0,0.962078,1.161669,46.875,52.0
1,1.042803,1.023585,50.0,55.6
2,0.959237,0.947626,56.25,57.6
3,0.818148,1.052669,68.75,59.4
4,0.968561,0.779647,53.125,59.4
5,0.81541,0.431272,65.625,60.9
6,0.936348,1.098801,62.5,60.2
7,0.826626,0.934144,65.625,61.5
8,0.80618,1.267943,59.375,59.3
9,0.852467,0.839534,65.625,60.7


## CNN class

In [25]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=2, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=2, padding=1)
        self.dropout = nn.Dropout(p=0.2)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        batch_size, seq_len = x.size()

        embed = self.embedding(x)
#         the conv1 layers requires input in batch_size*hidden_size*seq_len
        hidden = self.conv1(embed.transpose(1,2)).transpose(1,2)
#         transfer 3d tensor to 2d tensor by merging 0
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))
        hidden = self.dropout(hidden)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = self.dropout(hidden)
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))
        hidden = F.relu(hidden)   
#       doing max pool instead
        hidden = hidden.max(dim=1)
        a = torch.FloatTensor(hidden[0])
        return a

## Training on CNN

In [None]:
chart = pd.DataFrame()
cmodel = CNN(emb_size=100, hidden_size=100, num_layers=2, num_classes=3, vocab_size=len(id2token))
# suppose in_features = 400
fc_model = nn.Sequential(nn.Linear(200, 200), nn.ReLU(inplace=True), nn.Linear(200, 3))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
# CrossEntropyLoss() combines nn.LogSoftmax() and nn.NLLLoss() in one single class
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cmodel.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

def test_cmodel(loader, cmodel):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    for data1, data2, lengths, labels in loader:
        total = 0
        correct = 0
        data_batch1, data_batch2, lengths_batch, label_batch = data1, data2, lengths, labels
        output1 = cmodel(data_batch1, lengths_batch)
        output2 = cmodel(data_batch2, lengths_batch)
        outputs = torch.cat([output1, output2], dim=1)
#         print(outputs.size())
        outputs = fc_model(outputs)
        loss = criterion(outputs, labels)
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()          
    return (100 * correct / total), loss


for epoch in range(num_epochs):
    total = 0
    correct = 0
    for i, (data1,data2,lengths,labels) in enumerate(train_loader):
        outputs = 0
        cmodel.train()
        optimizer.zero_grad()
        # Forward pass
        output1 = cmodel(data1,lengths)
        output2 = cmodel(data2,lengths)
        outputs = torch.cat([output1, output2], dim=1)
#       output1: torch.Size([32, 200]), outputs: torch.Size([32, 3])
        outputs = fc_model(outputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
    # validate
    val_acc, val_loss = test_cmodel(val_loader, cmodel)    
    predicted = outputs.max(1, keepdim=True)[1]
    total += labels.size(0)
    correct += predicted.eq(labels.view_as(predicted)).sum().item()
    train_acc = 100 * correct / total
    chart = chart.append(pd.Series([loss.item(), val_loss.item(), train_acc,val_acc]),ignore_index=True)    
    print('Epoch: [{}/{}], Step: [{}/{}],train acc: {}, Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), train_acc,val_acc))  
     
chart.columns = ['train loss','val loss', 'train acc', 'val acc']


In [53]:
stc1_val = tokenize(val_data['sentence1'])
stc2_val = tokenize(val_data['sentence2'])
y_val = label2int(val_data['label'])
print(val_data.iloc[97,:])

sentence1    A black dog running through the forest .
sentence2                     A dog playing outside .
label                                      entailment
Name: 97, dtype: object


In [None]:
#pull out incorrect & correct samples
val2_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=1,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)
cnt = 0
for data1, data2, lengths, labels in val2_loader:
        total = 0
        correct = 0
        data_batch1, data_batch2, lengths_batch, label_batch = data1, data2, lengths, labels
        output1 = cmodel(data_batch1, lengths_batch)
        output2 = cmodel(data_batch2, lengths_batch)
        outputs = torch.cat([output1, output2], dim=1)
        outputs = fc_model(outputs)
        loss = criterion(outputs, labels)
        predicted = outputs.max(1, keepdim=True)[1]
        print(cnt, predicted, labels)
        total += labels.size(0)
        cnt = cnt +1
        if cnt >100:
            break

In [12]:
# mode 2
chart3

Unnamed: 0,train loss,val loss,train acc,val acc
0,0.944406,0.982141,58.333333,57.5
1,0.82774,0.909768,62.5,60.5
2,0.874666,0.917046,54.166667,60.3
3,0.634986,0.857405,75.0,62.0
4,0.650674,0.515622,70.833333,62.5
5,0.96141,0.519126,66.666667,62.6
6,0.658704,0.755453,75.0,63.1
7,0.755238,0.738587,62.5,63.7
8,0.317213,0.866739,87.5,62.3
9,0.670436,0.942639,75.0,63.1


In [8]:
# mode 3
chart4

Unnamed: 0,train loss,val loss,train acc,val acc
0,0.830431,0.815015,54.166667,57.8
1,0.76654,0.666479,62.5,60.1
2,0.966648,0.63054,54.166667,61.4
3,0.831226,0.846666,58.333333,63.0
4,0.666718,1.890501,75.0,63.2
5,0.595547,1.344122,83.333333,63.0
6,0.651425,1.344152,70.833333,63.2
7,0.482722,0.713448,75.0,63.2
8,0.479897,1.153279,83.333333,63.7
9,0.603719,0.892604,79.166667,62.7


In [11]:
# mode 4
chart5 = chart
chart5

Unnamed: 0,train loss,val loss,train acc,val acc
0,0.857222,0.765864,65.625,58.0
1,0.877461,0.903923,50.0,61.2
2,0.874572,1.027983,59.375,63.5
3,0.856565,0.751945,62.5,62.7
4,0.545737,1.270057,78.125,64.0
5,0.877181,0.743008,56.25,63.2
6,0.714999,0.759234,62.5,63.5
7,0.715825,0.628397,68.75,64.1
8,0.701119,0.507828,68.75,63.7
9,0.96713,0.698065,62.5,64.4


# Evaluating on MultiNLI

In [20]:
def read(data):
    mstc1_val = tokenize(data['sentence1'])
    mstc2_val = tokenize(data['sentence2'])
    my_val = label2int(data['label'])
    # convert token to ID
    mstc1_val_indices = token2index_dataset(mstc1_val)
    mstc2_val_indices = token2index_dataset(mstc2_val)
    # double checking
    print ("Val dataset size is {}".format(len(mstc1_val_indices)))
    
    mval_dataset = SNLIDataset(mstc1_val_indices,mstc2_val_indices, my_val)
    mval_loader = torch.utils.data.DataLoader(dataset=mval_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)
    return mval_loader


def evaluate(loader):
    #for RNN
    r_acc, r_loss = test_model(loader, model)
    #for CNN
    c_acc, c_loss = test_cmodel(loader, cmodel)
    return pd.Series([r_loss.item(),c_loss.item(),r_acc,c_acc])

In [21]:
mval_data = pd.read_table('mnli_val.tsv')
genres = set(mval_data['genre'])
metric = pd.DataFrame()

for genre in genres:
    group = mval_data[mval_data['genre'] == genre].drop(['genre'],axis = 1)
    loader = read(group)
    scores = evaluate(loader)
    metric =  metric.append(scores,ignore_index=True)

Val dataset size is 1016
Val dataset size is 1002
Val dataset size is 995
Val dataset size is 982
Val dataset size is 1005


In [22]:
metric.index = genres
metric.columns = ['RNN loss', 'CNN loss', 'RNN acc','CNN acc']
metric

Unnamed: 0,RNN loss,CNN loss,RNN acc,CNN acc
government,1.194837,1.447474,37.598425,41.929134
slate,1.267342,1.219725,38.622754,40.818363
fiction,1.995804,0.991552,35.879397,40.301508
travel,1.391345,0.904735,38.289206,41.955193
telephone,1.428192,1.381033,38.109453,42.686567
