### 3.2 Evaluating on MultiNLI (Best Logistic Regression Model)
**ONLY VALIDATION DATA ARE USED**

In [1]:
import torch
import numpy as np
from collections import Counter
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import pickle as pkl
import os

In [2]:
class NewsGroupDataset(Dataset):
    """
    PyTorch-readable dataset of (hypothesis, premise, label) examples.
    Inherits torch.utils.data.Dataset so it can be wrapped in a DataLoader.
    """

    def __init__(self, hypo_list, prem_list, target_list, max_sentence_length):
        """
        @param hypo_list: list of hypothesis sequences, one per example
        @param prem_list: list of premise sequences, one per example
        @param target_list: list of labels, one per example
        @param max_sentence_length: maximum number of items kept per sentence
        """
        # All three lists must describe the same number of examples.
        n_examples = len(target_list)
        assert len(hypo_list) == n_examples
        assert len(prem_list) == n_examples

        self.hypo_list, self.prem_list = hypo_list, prem_list
        self.target_list = target_list
        self.max_sentence_length = max_sentence_length

    def __len__(self):
        return len(self.hypo_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        Returns [hypo, len(hypo), prem, len(prem), label], where both
        sentences are truncated to max_sentence_length.
        """
        limit = self.max_sentence_length
        hypo = self.hypo_list[key][:limit]
        prem = self.prem_list[key][:limit]
        return [hypo, len(hypo), prem, len(prem), self.target_list[key]]


# Model definitions used for evaluation
class NeuralNetworkPytorch(nn.Module):
    """
    NeuralNetwork classification model
    Model would change according to interaction_type

    1st hidden layer: 90 neurons
    2nd hidden layer: 90 neurons
    """
    def __init__(self, vocab_size, emb_dim, n_out, interaction_type):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param n_out: size of the class.
        @param interaction_type: one of 'sum', 'hadamard', 'concat'; 'concat'
            doubles the input width of the first hidden layer.
        """
        super(NeuralNetworkPytorch, self).__init__()

        # 1. Embedding (row 0 is the padding row and stays at zero)
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # 2. an affine operation: y=Wx+b
        # double embedding dimension if we concat hypo's and prem's embedding
        if interaction_type == 'concat':
            emb_dim = 2 * emb_dim
        self.hidden_1 = nn.Linear(emb_dim, 90)
        self.hidden_2 = nn.Linear(90, 90)
        self.output = nn.Linear(90, n_out)

    @staticmethod
    def _mean_embedding(embedded, lengths):
        """Sum token embeddings over dim 1 and divide by each sentence length."""
        summed = torch.sum(embedded, dim=1)
        # Clamp to 1 so an empty sentence yields a zero vector instead of NaN.
        denom = lengths.view(-1, 1).clamp(min=1).float()
        return summed / denom

    def forward(self, data_hypo, length_hypo, data_prem, length_prem, interaction_type):
        """
        @param data_hypo: matrix of size (batch_size, max_sentence_length) holding
            token indices, padded to the same length.
        @param length_hypo: an int tensor of size (batch_size), which represents the
            non-trivial (excludes padding) length of each sentence in data_hypo.
        @param data_prem: matrix of size (batch_size, max_sentence_length).
        @param length_prem: an int tensor of size (batch_size), which represents the
            non-trivial (excludes padding) length of each sentence in data_prem.
        @param interaction_type: one of 'sum', 'hadamard', 'concat'. The historical
            misspelling 'hadamart' is still accepted.
        @return: logits of size (batch_size, n_out)
        @raise ValueError: if interaction_type is not recognised.
        """
        # word embedding, then combine tokens into one vector per sentence
        out_hypo = self._mean_embedding(self.embed(data_hypo), length_hypo)
        out_prem = self._mean_embedding(self.embed(data_prem), length_prem)

        # interaction
        # 1. sum
        # 2. Hadamard product
        # 3. concat (This will change embedding dimension, 2 times as many as before)
        if interaction_type == 'concat':
            out = torch.cat((out_hypo, out_prem), 1)
        elif interaction_type == 'sum':
            out = torch.add(out_hypo, out_prem)
        elif interaction_type in ('hadamard', 'hadamart'):
            out = out_hypo * out_prem
        else:
            raise ValueError(
                "Unknown interaction_type {!r}; expected 'sum', 'hadamard' or 'concat'".format(
                    interaction_type))

        out = F.relu(self.hidden_1(out.float()))
        out = F.relu(self.hidden_2(out))
        return self.output(out)

class LogisticRegressionPyTorch(nn.Module):
    """
    Logistic regression classification model
    Model would change according to interaction_type
    """
    def __init__(self, vocab_size, emb_dim, n_out, interaction_type):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding.
        @param n_out: size of the class.
        @param interaction_type: one of 'sum', 'hadamard', 'concat'; 'concat'
            doubles the input width of the linear layer.
        """
        super(LogisticRegressionPyTorch, self).__init__()

        # 1. Embedding (row 0 is the padding row and stays at zero)
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # 2. Logistic Regression
        # double embedding dimension if we concat hypo's and prem's embedding
        if interaction_type == 'concat':
            emb_dim *= 2
        self.linear = nn.Linear(emb_dim, n_out)

    @staticmethod
    def _mean_embedding(embedded, lengths):
        """Sum token embeddings over dim 1 and divide by each sentence length."""
        summed = torch.sum(embedded, dim=1)
        # Clamp to 1 so an empty sentence yields a zero vector instead of NaN.
        denom = lengths.view(-1, 1).clamp(min=1).float()
        return summed / denom

    def forward(self, data_hypo, length_hypo, data_prem, length_prem, interaction_type):
        """
        @param data_hypo: matrix of size (batch_size, max_sentence_length).
        @param length_hypo: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data_hypo.
        @param data_prem: matrix of size (batch_size, max_sentence_length).
        @param length_prem: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data_prem.
        @param interaction_type: one of 'sum', 'hadamard', 'concat'. The misspelling
            'hadamart' is accepted as an alias of 'hadamard'.
        @return: logits of size (batch_size, n_out)
        @raise ValueError: if interaction_type is not recognised.
        """
        out_hypo = self._mean_embedding(self.embed(data_hypo), length_hypo)
        out_prem = self._mean_embedding(self.embed(data_prem), length_prem)

        # interaction
        # 1. sum
        # 2. Hadamard product
        # 3. concat (This will change embedding dimension, 2 times as many as before)
        if interaction_type == 'concat':
            out = torch.cat((out_hypo, out_prem), 1)
        elif interaction_type == 'sum':
            out = torch.add(out_hypo, out_prem)
        elif interaction_type in ('hadamard', 'hadamart'):
            out = out_hypo * out_prem
        else:
            raise ValueError(
                "Unknown interaction_type {!r}; expected 'sum', 'hadamard' or 'concat'".format(
                    interaction_type))

        # return logits
        return self.linear(out.float())

def _pad_sequences(sequences, max_len):
    """Right-pad (and truncate) index sequences with 0 into an int64 matrix plus their lengths."""
    padded = np.zeros((len(sequences), max_len), dtype=np.int64)
    lengths = []
    for row, sequence in enumerate(sequences):
        kept = sequence[:max_len]
        n_kept = len(kept)
        if n_kept:
            padded[row, :n_kept] = kept
        lengths.append(n_kept)
    return padded, lengths


def newsgroup_collate_func(batch, max_len=None):
    """
    Customized function for DataLoader that pads every sentence in the batch
    with 0 so that all rows have the same length.

    @param batch: list of [hypo_indices, hypo_length, prem_indices, prem_length, label]
        items, as produced by NewsGroupDataset.__getitem__.
    @param max_len: width of the padded matrices. Defaults to the module-level
        max_sentence_length, which keeps the one-argument call made by
        DataLoader working unchanged.
    @return: [hypo (LongTensor, batch x max_len), hypo lengths (LongTensor),
        prem (LongTensor, batch x max_len), prem lengths (LongTensor),
        labels (LongTensor)]
    """
    if max_len is None:
        max_len = max_sentence_length

    # The dtype is fixed to int64 so that an empty sentence cannot turn the
    # whole batch into a float array, which nn.Embedding would reject.
    # Lengths are recomputed from the (possibly truncated) sequences so they
    # always agree with the padded rows.
    hypo_matrix, hypo_lengths = _pad_sequences([datum[0] for datum in batch], max_len)
    prem_matrix, prem_lengths = _pad_sequences([datum[2] for datum in batch], max_len)
    labels = [datum[4] for datum in batch]

    return [
        torch.from_numpy(hypo_matrix),
        torch.LongTensor(hypo_lengths),
        torch.from_numpy(prem_matrix),
        torch.LongTensor(prem_lengths),
        torch.LongTensor(labels),
    ]

def test_model(data_loader, model, interaction_type):
    """
    Help function that tests the model's performance on a dataset

    @param data_loader: data loader for the dataset to test against; each batch
        is (data_hypo, lengths_hypo, data_prem, lengths_prem, labels)
    @param model: model to evaluate; it is switched to eval mode
    @param interaction_type: interaction type forwarded to the model

    return:
    accuracy (in percent), loss (average over all examples, computed with the
    module-level `criterion` and assuming it reduces each batch by its mean)

    @raise ValueError: if the data loader yields no examples
    """
    correct = 0
    total = 0
    loss_sum = 0.0
    model.eval()

    # Evaluation needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for data_hypo, lengths_hypo, data_prem, lengths_prem, labels in data_loader:
            outputs = model(data_hypo, lengths_hypo, data_prem, lengths_prem, interaction_type)
            batch_size = labels.size(0)
            # Weight each batch's mean loss by its size so the final value is
            # the per-example average rather than just the last batch's loss.
            loss_sum += criterion(outputs, labels).item() * batch_size
            # argmax over logits equals argmax over softmax probabilities.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels.view_as(predicted)).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("data_loader yielded no examples to evaluate")
    return (100 * correct / total), loss_sum / total


# convert token to id in the dataset
def token2index(tokens_data):
    """
    Convert every token sequence in tokens_data to a list of vocabulary ids.

    Tokens are looked up in the module-level token2id mapping; tokens that
    are not in it are mapped to UNK_IDX.
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sentence]
        for sentence in tokens_data
    ]

def build_vocab(hypo_tokens, prem_tokens, max_vocab_size):
    """
    Build the vocabulary from the most frequent hypothesis and premise tokens.

    @param hypo_tokens: iterable of hypothesis tokens to count
    @param prem_tokens: iterable of premise tokens to count
    @param max_vocab_size: maximum number of distinct tokens kept (the two
        special tokens are added on top of this)

    Returns:
    token2id: dictionary where keys represent tokens and corresponding values represent indices
    id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    """
    all_tokens_counter = Counter(hypo_tokens) + Counter(prem_tokens)

    # A comprehension (instead of zip(*...)) also works for an empty corpus.
    vocab = [token for token, _ in all_tokens_counter.most_common(max_vocab_size)]

    # Ids 0 and 1 are reserved for the special tokens; real tokens start at 2.
    token2id = {token: index for index, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX

    # Order the special tokens by their configured ids so that id2token is
    # the inverse of token2id (assumes PAD_IDX and UNK_IDX are 0 and 1 in
    # either order).
    specials = sorted([(PAD_IDX, '<pad>'), (UNK_IDX, '<unk>')])
    id2token = [token for _, token in specials] + vocab
    return token2id, id2token

In [3]:
# Reserved vocabulary ids for the special tokens.
# NOTE(review): the collate function pads with 0 and both models create
# nn.Embedding(..., padding_idx=0), while here PAD_IDX is 1 and UNK_IDX is 0,
# so unknown tokens share the padding row. Confirm these values match the ones
# used when the checkpoint was trained before changing them.
UNK_IDX = 0
PAD_IDX = 1
# Sentences are truncated/padded to this many tokens.
max_sentence_length = 20
# these are the parameters of the best model being evaluated
max_vocab_size = 10000
BATCH_SIZE = 1024
emb_dim = 50
interaction_type = 'hadamard'

In [4]:
def _load_pickle(path):
    """Load one pickled object and close the file handle afterwards."""
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pkl.load(handle)


genres = ['fiction', 'travel', 'government', 'slate', 'telephone']
folder = os.getcwd() + '/../all_data_pickle/'

# The vocabulary is rebuilt from the training tokens so that token ids match
# the ones the checkpoint was trained with.
all_hypo_data_tokens_train = _load_pickle(folder + "all_hypo_data_tokens_train.p")
all_prem_data_tokens_train = _load_pickle(folder + "all_prem_data_tokens_train.p")
token2id, id2token = build_vocab(all_hypo_data_tokens_train, all_prem_data_tokens_train, max_vocab_size)
criterion = torch.nn.CrossEntropyLoss()

# The same checkpoint is used for every genre, so read it from disk once.
# NOTE(review): this is a machine-specific absolute path.
best_model_save = '/Users/ludi/Desktop/tars/best_checkpoint5_log.tar'
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
checkpoint = torch.load(best_model_save, map_location='cpu')

mult_genre_val_acc = {}

for genre in genres:
    hypo_data_tokens_val = _load_pickle(folder + "hypo_data_tokens_val_{}.p".format(genre))
    prem_data_tokens_val = _load_pickle(folder + "prem_data_tokens_val_{}.p".format(genre))
    label_index_val = _load_pickle(folder + "label_index_val_{}.p".format(genre))
    hypo_data_indices_val = token2index(hypo_data_tokens_val)
    prem_data_indices_val = token2index(prem_data_tokens_val)
    val_dataset = NewsGroupDataset(hypo_data_indices_val, prem_data_indices_val, label_index_val, max_sentence_length)
    # Shuffling is unnecessary for evaluation; a fixed order keeps runs reproducible.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                                             collate_fn=newsgroup_collate_func, shuffle=False)
    # initialize model and load the pretrained weights
    model = LogisticRegressionPyTorch(len(id2token), emb_dim, len(set(label_index_val)), interaction_type)
    model.load_state_dict(checkpoint['state_dict'])
    print('Validating on {} genre...'.format(genre))
    val_acc, val_loss = test_model(val_loader, model, interaction_type)
    mult_genre_val_acc[genre] = val_acc
    print("The best logistic model's validation accuracy on {} genre is {}.".format(genre, np.around(val_acc, 2)))

# Persist the per-genre accuracies for later comparison.
with open("3_2_Best_Logistic_Model_Evaluation_On_MNLI.p", "wb") as out_file:
    pkl.dump(mult_genre_val_acc, out_file)

Validating on fiction genre...
The best logistic model's validation accuracy on fiction genre is 41.91.
Validating on travel genre...
The best logistic model's validation accuracy on travel genre is 37.88.
Validating on government genre...
The best logistic model's validation accuracy on government genre is 36.12.
Validating on slate genre...
The best logistic model's validation accuracy on slate genre is 37.62.
Validating on telephone genre...
The best logistic model's validation accuracy on telephone genre is 40.3.
