In [1]:
import numpy as np
import pandas as pd
import spacy
import string
import pickle as pkl
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [2]:
# Load the MNLI train / validation splits (tab-separated files).
# The separator is passed by keyword: recent pandas releases only accept the
# path positionally, so `pd.read_csv(path, '\t')` raises a TypeError there.
mnli_train_df = pd.read_csv('data/mnli_train.tsv', sep='\t')
mnli_val_df = pd.read_csv('data/mnli_val.tsv', sep='\t')

In [6]:
# tokenizer = spacy.load('en_core_web_sm')
# punctuations = string.punctuation

# # lowercase and remove punctuation
# def tokenize(sent):
#     tokens = tokenizer(sent)
#     return [token.text.lower() for token in tokens if (token.text not in punctuations)]

# # tokenize datasets
# # dataset1 contains sentence1, dataset2 contains sentence2
# def tokenize_dataset(dataset1, dataset2):
#     token_dataset1 = []
#     token_dataset2 = []
#     # we are keeping track of all tokens in dataset 
#     # in order to create vocabulary later
#     all_tokens = []
    
#     for sample in dataset1:
#         tokens = tokenize(sample)
#         token_dataset1.append(tokens)
#         all_tokens += tokens
        
#     for sample in dataset2:
#         tokens = tokenize(sample)
#         token_dataset2.append(tokens)
#         all_tokens += tokens
#     return token_dataset1, token_dataset2, all_tokens


# train_sen1_tokens = mnli_train_travel['sentence1'].tolist()
# train_sen2_tokens = mnli_train_travel['sentence2'].tolist()

# val_sen1_tokens = mnli_val_travel['sentence1'].tolist()
# val_sen2_tokens = mnli_val_travel['sentence2'].tolist()

# train_target = mnli_train_travel['label'].tolist()
# val_target = mnli_val_travel['label'].tolist()

# # train set tokens
# print("Tokenizing train data separately (sentence 1 and sentence 2)")
# train_data_tokens_1, train_data_tokens_2, all_train_tokens = tokenize_dataset(train_sen1_tokens, train_sen2_tokens)
# pkl.dump(train_data_tokens_1, open("data/mnli_train_travel_tokens_1.p", "wb"))
# pkl.dump(train_data_tokens_2, open("data/mnli_train_travel_tokens_2.p", "wb"))
# pkl.dump(all_train_tokens, open("data/mnli_train_travel_concat_tokens.p", "wb"))

# # val set tokens
# print("Tokenizing val data")
# val_data_tokens_1, val_data_tokens_2, _ = tokenize_dataset(val_sen1_tokens, val_sen2_tokens)
# pkl.dump(val_data_tokens_1, open("data/mnli_val_travel_tokens_1.p", "wb"))
# pkl.dump(val_data_tokens_2, open("data/mnli_val_travel_tokens_2.p", "wb"))

# print("finish tokenizing all data.")

In [5]:
class LogisticRegressionPyTorch(nn.Module):
    """
    Bag-of-embeddings classifier for sentence pairs.

    Each sentence is encoded as the mean of its token embeddings. The two
    sentence vectors are combined (concatenation or element-wise product)
    and passed through one linear layer followed by log-softmax.
    """

    # Supported ways of combining the two sentence encodings.
    SUPPORTED_INTERACTIONS = ('cat', 'mul')

    def __init__(self, vocab_size, emb_dim, n_out, inter):
        """
        vocab_size: Number of rows in the embedding table (index 0 is padding)
        emb_dim: Dimension of each token embedding
        n_out: Number of output classes
        inter: How the two sentence vectors are combined:
               'cat' concatenates them, 'mul' multiplies them element-wise
        ---
        Raises: ValueError if `inter` is not 'cat' or 'mul'
        """
        # Initialize the parent class - this is a Python requirement
        super().__init__()

        # Fail early with a clear message. Without this check an unknown
        # `inter` leaves self.linear undefined and init_weights() fails with
        # an unrelated-looking AttributeError.
        if inter not in self.SUPPORTED_INTERACTIONS:
            raise ValueError(
                "inter must be one of {}, got {!r}".format(
                    self.SUPPORTED_INTERACTIONS, inter))

        # padding_idx=0 keeps the embedding of index 0 at zero, so padded
        # positions contribute nothing to the per-sentence sum in forward().
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.inter = inter

        # Concatenation doubles the feature size seen by the linear layer.
        in_features = emb_dim * 2 if inter == 'cat' else emb_dim
        self.linear = nn.Linear(in_features, n_out)

        # Explicitly initialize the weights with the initialization
        #   scheme we want.
        self.init_weights()

    def _mean_embedding(self, data, length):
        """
        data: Padded token indices [N, T]
        length: Number of real (non-padding) tokens per row [N]
        ---
        Returns: mean token embedding per row [N, emb_dim]
        """
        summed = torch.sum(self.embed(data), dim=1)
        # Clamp so an empty sentence (length 0) yields a zero vector instead
        # of 0/0 = NaN, which would propagate through the whole output row.
        denom = length.clamp(min=1).view(-1, 1).float()
        return summed / denom

    def forward(self, data_1, data_2, length_1, length_2):
        """
        data_1: Padded token indices of the first sentences [N, T]
        data_2: Padded token indices of the second sentences [N, T]
        length_1: Unpadded lengths of the first sentences [N]
        length_2: Unpadded lengths of the second sentences [N]
        ---
        Returns: log probabilities of each class [N, c]
        """
        out_1 = self._mean_embedding(data_1, length_1)
        out_2 = self._mean_embedding(data_2, length_2)

        # self.inter was validated in __init__, so these two cases are exhaustive.
        if self.inter == 'cat':
            out = torch.cat((out_1, out_2), dim=1)
        else:
            out = torch.mul(out_1, out_2)

        # Apply the linear function to get our logit (real numbers)
        out = self.linear(out)

        # Apply log_softmax to get logs of normalized probabilities
        return F.log_softmax(out, dim=1)

    def init_weights(self):
        """Re-initialize the linear layer: Xavier-normal weights, uniform bias."""
        nn.init.xavier_normal_(self.linear.weight)
        nn.init.uniform_(self.linear.bias)

In [8]:
def mnli_pipeline(model_file, genre, emb_dim=100, inter='mul'):
    """
    Evaluate a saved LogisticRegressionPyTorch checkpoint on one MNLI genre.

    The vocabulary is rebuilt from the genre's pickled training tokens, the
    genre's pickled validation sentences are converted to padded index
    tensors, and the checkpoint is scored on them.

    model_file: Path to a state_dict written with torch.save
    genre: Value of the 'genre' column to evaluate; also selects the
           data/mnli_{train,val}_{genre}_*.p token files
    emb_dim: Embedding dimension the checkpoint was trained with. This used
             to be read from a notebook-level global; it is now an explicit
             argument so the function does not depend on hidden state.
    inter: Sentence interaction the checkpoint was trained with
           ('cat' or 'mul')
    ---
    Returns: validation accuracy in percent (float)
    Raises: ValueError if the validation loader yields no examples

    NOTE(review): the vocabulary is derived from this genre's training
    tokens; presumably the checkpoint was trained with that same
    vocabulary -- confirm, otherwise token ids will not line up with the
    rows of the loaded embedding table.
    """
    label_to_id = {'neutral': 0, 'entailment': 1, 'contradiction': 2}

    max_vocab_size = 8000
    # index 0 is reserved for padding and index 1 for unknown tokens
    PAD_IDX = 0
    UNK_IDX = 1

    #param to be tuned
    MAX_SENTENCE_LENGTH = 100
    BATCH_SIZE = 32

    def load_pickle(path):
        # NOTE: pickle can execute arbitrary code on load; only use this on
        # token files produced by this project.
        # The context manager closes the file handle deterministically.
        with open(path, "rb") as handle:
            return pkl.load(handle)

    # Only the label column is mapped. Values that are not one of the three
    # label strings are passed through unchanged.
    mnli_val = mnli_val_df[mnli_val_df['genre'] == genre]
    val_target = [label_to_id.get(label, label) for label in mnli_val['label']]

    mnli_train_all_tokens = load_pickle("data/mnli_train_{}_concat_tokens.p".format(genre))
    mnli_val_tokens_1 = load_pickle("data/mnli_val_{}_tokens_1.p".format(genre))
    mnli_val_tokens_2 = load_pickle("data/mnli_val_{}_tokens_2.p".format(genre))

    def build_vocab(all_tokens):
        # Returns:
        # token2id: dictionary where keys represent tokens and corresponding values represent indices
        # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        # Iterating over most_common() directly (instead of zip(*...)) also
        # works when all_tokens is empty.
        vocab = [token for token, _ in Counter(all_tokens).most_common(max_vocab_size)]
        token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
        token2id['<pad>'] = PAD_IDX
        token2id['<unk>'] = UNK_IDX
        id2token = ['<pad>', '<unk>'] + vocab
        return token2id, id2token

    token2id, id2token = build_vocab(mnli_train_all_tokens)

    # convert token to id in the dataset; out-of-vocabulary tokens map to UNK_IDX
    def token2index_dataset(tokens_data):
        return [[token2id.get(token, UNK_IDX) for token in tokens]
                for tokens in tokens_data]

    val_data_indices1 = token2index_dataset(mnli_val_tokens_1)
    val_data_indices2 = token2index_dataset(mnli_val_tokens_2)

    class BuildDataset(Dataset):
        """Pairs of index-encoded sentences with their labels."""

        def __init__(self, data_list1, data_list2, target_list):
            self.data_list1 = data_list1
            self.data_list2 = data_list2
            self.target_list = target_list
            # Both sentence lists must line up with the labels.
            assert (len(self.data_list1) == len(self.target_list))
            assert (len(self.data_list2) == len(self.target_list))

        def __len__(self):
            return len(self.data_list1)

        def __getitem__(self, key):
            """
            Triggered when you call dataset[i]
            Returns [ids1, len(ids1), ids2, len(ids2), label], with both id
            lists truncated to MAX_SENTENCE_LENGTH.
            """
            token_idx1 = self.data_list1[key][:MAX_SENTENCE_LENGTH]
            token_idx2 = self.data_list2[key][:MAX_SENTENCE_LENGTH]
            label = self.target_list[key]
            return [token_idx1, len(token_idx1), token_idx2, len(token_idx2), label]

    def pad_to_max(token_ids):
        # Explicit int64 so an empty sentence cannot turn the whole batch
        # into a float array (np.array([]) defaults to float64).
        padded = np.full(MAX_SENTENCE_LENGTH, PAD_IDX, dtype=np.int64)
        padded[:len(token_ids)] = np.asarray(token_ids, dtype=np.int64)
        return padded

    def collate_func(batch):
        """
        Customized function for DataLoader that pads every sentence in the
        batch to MAX_SENTENCE_LENGTH so that all data have the same length
        """
        data_list1 = []
        data_list2 = []
        label_list = []
        length_list1 = []
        length_list2 = []

        for token_idx1, length1, token_idx2, length2, label in batch:
            data_list1.append(pad_to_max(token_idx1))
            data_list2.append(pad_to_max(token_idx2))
            length_list1.append(length1)
            length_list2.append(length2)
            label_list.append(label)

        return [torch.from_numpy(np.array(data_list1)), torch.from_numpy(np.array(data_list2)),
                torch.LongTensor(length_list1), torch.LongTensor(length_list2),
                torch.LongTensor(label_list)]

    val_dataset = BuildDataset(val_data_indices1, val_data_indices2, val_target)
    # Accuracy does not depend on batch order, so evaluation is not shuffled.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             collate_fn=collate_func,
                                             shuffle=False)

    def test_model(loader, model):
        """
        Help function that tests the model's performance on a dataset
        @param: loader - data loader for the dataset to test against
        @param: model - model whose predictions are scored
        Returns the accuracy in percent.
        """
        correct = 0
        total = 0
        model.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for data1, data2, len1, len2, labels in loader:
                # The model already returns log-probabilities; their argmax
                # is the predicted class, no extra softmax required.
                predicted = model(data1, data2, len1, len2).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        if total == 0:
            raise ValueError("no validation examples found for genre {!r}".format(genre))
        return (100 * correct / total)

    model = LogisticRegressionPyTorch(len(id2token), emb_dim, len(label_to_id), inter)
    # The model lives on the CPU, so map the checkpoint there even if it was
    # saved from a GPU.
    model.load_state_dict(torch.load(model_file, map_location='cpu'))
    val_acc = test_model(val_loader, model)
    return val_acc


In [13]:
# Evaluation settings for the saved checkpoint.
emb_dim = 100          # embedding dimension the checkpoint was trained with
interact = 'mul'       # sentence interaction the checkpoint was trained with
model_file = 'lr_9_iter_tune.ckpt'
genre = 'travel'

# Pass `genre` itself rather than repeating the literal, so that editing the
# variable above actually changes which genre is evaluated.
mnli_pipeline(model_file, genre)

36.25254582484725

In [14]:

# fiction = mnli_pipeline('model/lr_9_iter_tune.ckpt','fiction')
# slate = mnli_pipeline('model/lr_9_iter_tune.ckpt','slate')
# government = mnli_pipeline('model/lr_9_iter_tune.ckpt','government')
# telephone = mnli_pipeline('model/lr_9_iter_tune.ckpt','telephone')
# travel = mnli_pipeline('model/lr_9_iter_tune.ckpt','travel')
# l = [fiction,slate,government,telephone,travel]
# l