In [1]:
import spacy
import string
import pickle as pkl
import pandas as pd
from collections import Counter
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Frozen_NN(nn.Module):
    """
    Bag-of-words sentence-pair classifier built on pretrained word embeddings.

    Each sentence is encoded as the sum of its (dropout-regularised) word
    embeddings divided by its true length. The two sentence encodings are
    concatenated and passed through two ReLU hidden layers and a final 3-way
    linear layer that returns logits.

    The embedding table is created with ``freeze=False``, but ``forward``
    detaches every position whose token index is not ``UNK_IDX``, so only the
    row at ``UNK_IDX`` can receive gradients.
    """

    # Token index whose embedding row stays trainable (see _encode).
    UNK_IDX = 1

    def __init__(self, vocab_size, drop, pretrained_weights=None):
        """
        @param vocab_size: not used by the model (the embedding table size comes
            from the pretrained matrix); kept so existing callers keep working.
        @param drop: dropout probability applied to the word embeddings.
        @param pretrained_weights: 2-D matrix (num_embeddings, emb_dim) of
            pretrained embeddings. When omitted, the notebook-level global
            ``weight_np`` is used, which is what the original code always did.
        """
        super(Frozen_NN, self).__init__()
        l1_dim = 100
        l2_dim = 100
        if pretrained_weights is None:
            # Backward-compatible fallback; raises NameError if the notebook
            # never defined `weight_np`.
            pretrained_weights = weight_np
        # padding_idx=0 keeps row 0 out of the gradient.
        # NOTE(review): from_pretrained does not appear to zero that row, so the
        # padded positions are presumably expected to hit an all-zero row 0 of
        # the supplied matrix; otherwise padding leaks into the sum. Confirm.
        self.embed = nn.Embedding.from_pretrained(
            torch.as_tensor(pretrained_weights), freeze=False, padding_idx=0
        )
        # Derive the width from the matrix instead of hard-coding 300.
        emb_dim = self.embed.embedding_dim

        self.hidden1 = nn.Linear(emb_dim * 2, l1_dim)
        self.dropout = nn.Dropout(p=drop)
        self.hidden2 = nn.Linear(l1_dim, l2_dim)
        self.linear = nn.Linear(l2_dim, 3)

    def _encode(self, data, length):
        """Return the length-normalised sum of embeddings for one padded batch."""
        emb = self.embed(data)
        # Values are unchanged; only the gradient path differs: positions that
        # are not UNK_IDX use a detached copy and therefore get no gradient.
        trainable = (data == self.UNK_IDX).unsqueeze(2).expand_as(emb)
        emb = torch.where(trainable, emb, emb.detach())
        emb = self.dropout(emb)
        summed = torch.sum(emb, dim=1)
        # clamp(min=1) avoids a 0/0 -> NaN for empty sentences; the cast follows
        # the embedding's device/dtype instead of forcing a CPU FloatTensor.
        denom = length.view(-1, 1).clamp(min=1).to(device=summed.device, dtype=summed.dtype)
        return summed / denom

    def forward(self, data1, data2, length_1, length_2):
        """
        @param data1: matrix of size (batch_size, max_sentence_length) holding the
            token indices of sentence 1, padded to a common length.
        @param data2: matrix of size (batch_size, max_sentence_length) holding the
            token indices of sentence 2, padded to a common length.
        @param length_1: int tensor of size (batch_size) with the non-padded
            length of each sentence in data1.
        @param length_2: int tensor of size (batch_size) with the non-padded
            length of each sentence in data2.
        @return: tensor of size (batch_size, 3) with unnormalised class logits.
        """
        out_1 = self._encode(data1, length_1)
        out_2 = self._encode(data2, length_2)

        out = torch.cat((out_1, out_2), dim=1)

        out = F.relu(self.hidden1(out.float()))
        out = F.relu(self.hidden2(out.float()))
        # return logits
        return self.linear(out.float())

In [3]:
_PAD_IDX = 0  # index reserved for the padding token
_UNK_IDX = 1  # index reserved for out-of-vocabulary tokens
_LABEL_TO_ID = {'neutral': 0, 'entailment': 1, 'contradiction': 2}


def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def _build_vocab(all_tokens, max_vocab_size):
    """Build (token2id, id2token) from the `max_vocab_size` most frequent tokens.

    Ids 0 and 1 are reserved for '<pad>' and '<unk>'; real tokens start at 2.
    An empty token list yields a vocabulary with only the two special tokens.
    """
    vocab = [token for token, _ in Counter(all_tokens).most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = _PAD_IDX
    token2id['<unk>'] = _UNK_IDX
    return token2id, id2token


def _tokens_to_indices(tokens_data, token2id):
    """Map every token of every sentence to its id, using `_UNK_IDX` for unknowns."""
    return [[token2id.get(token, _UNK_IDX) for token in tokens] for tokens in tokens_data]


class _SentencePairDataset(Dataset):
    """Dataset of (sentence-1 ids, sentence-2 ids, label), truncated to a max length."""

    def __init__(self, data_list1, data_list2, target_list, max_sentence_length):
        if not (len(data_list1) == len(data_list2) == len(target_list)):
            raise ValueError(
                "sentence lists and labels differ in length: {} / {} / {}".format(
                    len(data_list1), len(data_list2), len(target_list)))
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.target_list = target_list
        self.max_sentence_length = max_sentence_length

    def __len__(self):
        return len(self.data_list1)

    def __getitem__(self, key):
        """Return [ids1, len(ids1), ids2, len(ids2), label] for example `key`."""
        token_idx1 = self.data_list1[key][:self.max_sentence_length]
        token_idx2 = self.data_list2[key][:self.max_sentence_length]
        return [token_idx1, len(token_idx1), token_idx2, len(token_idx2), self.target_list[key]]


def _make_collate_fn(max_sentence_length):
    """Create a DataLoader collate function that pads every sentence to `max_sentence_length`."""

    def collate(batch):
        # Pre-filled int64 buffers keep the index dtype stable even when a
        # sentence is empty (np.array([]) would otherwise be float64).
        data1 = np.full((len(batch), max_sentence_length), _PAD_IDX, dtype=np.int64)
        data2 = np.full((len(batch), max_sentence_length), _PAD_IDX, dtype=np.int64)
        lengths1, lengths2, labels = [], [], []
        for row, (idx1, len1, idx2, len2, label) in enumerate(batch):
            if len1:
                data1[row, :len1] = idx1
            if len2:
                data2[row, :len2] = idx2
            lengths1.append(len1)
            lengths2.append(len2)
            labels.append(label)
        return [torch.from_numpy(data1), torch.from_numpy(data2),
                torch.LongTensor(lengths1), torch.LongTensor(lengths2),
                torch.LongTensor(labels)]

    return collate


def _accuracy(loader, model):
    """Return the percentage of examples in `loader` that `model` classifies correctly."""
    correct = 0
    total = 0
    model.eval()
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for data1, data2, len1, len2, labels in loader:
            logits = model(data1, data2, len1, len2)
            # softmax is monotonic, so the arg-max of the logits is the prediction
            predicted = logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # No examples: accuracy is undefined rather than a ZeroDivisionError.
        return float("nan")
    return 100 * correct / total


def mnli_pipeline(model_file, genre, data_dir="data", max_vocab_size=10000,
                  max_sentence_length=100, batch_size=32, token2id=None):
    """
    Evaluate a pickled model on the validation split of one MNLI genre.

    Reads `<data_dir>/mnli_val.tsv` (columns `genre` and `label` are used) and the
    pre-tokenised pickles `mnli_val_<genre>_tokens_1.p` / `_tokens_2.p`, converts
    tokens to ids, and runs the model over the padded batches.

    @param model_file: path of the pickled model. Its class must be importable or
        already defined in the session when this is called.
    @param genre: value of the `genre` column to evaluate on; also part of the
        pickle file names.
    @param data_dir: directory holding the TSV and pickle files.
    @param max_vocab_size: number of most frequent training tokens kept when the
        vocabulary is built here.
    @param max_sentence_length: sentences are truncated/padded to this length.
    @param batch_size: evaluation batch size.
    @param token2id: optional token -> id mapping. When omitted, the vocabulary is
        built from `mnli_train_<genre>_concat_tokens.p` as before.
    @return: tuple (genre, validation accuracy in percent).

    NOTE(review): the vocabulary built here comes from this genre's training
    tokens. The pickled model's embedding rows were presumably indexed by the
    vocabulary used when it was trained; if the two differ, the ids fed to the
    model are meaningless and accuracy falls to chance. Confirm, and pass the
    training-time `token2id` if so.
    """
    import os

    val_df = pd.read_csv(os.path.join(data_dir, "mnli_val.tsv"), sep='\t')
    # Only the label column is needed; unknown labels are passed through as-is.
    val_labels = val_df.loc[val_df['genre'] == genre, 'label']
    val_target = [_LABEL_TO_ID.get(label, label) for label in val_labels]

    if token2id is None:
        train_all_tokens = _load_pickle(
            os.path.join(data_dir, "mnli_train_{}_concat_tokens.p".format(genre)))
        token2id, _ = _build_vocab(train_all_tokens, max_vocab_size)

    val_tokens_1 = _load_pickle(os.path.join(data_dir, "mnli_val_{}_tokens_1.p".format(genre)))
    val_tokens_2 = _load_pickle(os.path.join(data_dir, "mnli_val_{}_tokens_2.p".format(genre)))

    val_dataset = _SentencePairDataset(
        _tokens_to_indices(val_tokens_1, token2id),
        _tokens_to_indices(val_tokens_2, token2id),
        val_target,
        max_sentence_length,
    )
    # Accuracy does not depend on order, so evaluation is not shuffled.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             collate_fn=_make_collate_fn(max_sentence_length),
                                             shuffle=False)

    model = _load_pickle(model_file)
    val_acc = _accuracy(val_loader, model)
    return genre, val_acc

In [4]:
# Path of the pickled model to evaluate (a file name, not a model object).
model = "finalized_frozen_embed_model.sav"
# MNLI genres to evaluate, one pipeline run per genre.
genre_list = ['travel', 'fiction', 'government', 'slate', 'telephone']
for i in genre_list:
    # Each run yields a (genre, validation accuracy in percent) tuple.
    genre_result = mnli_pipeline(model, i)
    print(genre_result)

('travel', 31.16089613034623)
('fiction', 34.57286432160804)
('government', 31.594488188976378)
('slate', 33.63273453093812)
('telephone', 32.33830845771144)
