In [2]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook
import numpy as np
import string
import time
from typing import List, Optional, Dict
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook 
from collections import defaultdict

#torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

#torchtext.  Dataset is a custom file for handling QuizBowl data
#from dataset import QuizBowl
from torchtext.data.field import Field
from torchtext.data.iterator import Iterator

import dataset_confidence

In [3]:
class DAN(nn.Module):
    """Deep averaging network over frozen, pre-trained word vectors.

    The embedding table is filled from ``text_field.vocab.vectors`` (GloVe,
    according to the original notebook comments), moved to the GPU and
    frozen.  A question is represented by the length-normalised sum of its
    word vectors, which then passes through
    dropout -> linear -> batch norm -> dropout -> sigmoid -> classifier.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super().__init__()

        # Vocabulary size and padding index come from the field handed in by
        # the caller.
        vocab = text_field.vocab
        self.text_vocab_size = len(vocab)
        pad_index = vocab.stoi[text_field.pad_token]

        self.text_embeddings = nn.Embedding(
            self.text_vocab_size, embedding_dim, padding_idx=pad_index
        )
        self.text_field = text_field

        # The unknown token takes the mean of all vectors.  Note that this
        # writes into vocab.vectors in place before the table is copied over.
        unk_index = vocab.stoi[text_field.unk_token]
        vocab.vectors[unk_index] = vocab.vectors.mean(0)
        embedding_weight = self.text_embeddings.weight
        embedding_weight.data = vocab.vectors.cuda()

        # Embeddings stay fixed during training.
        embedding_weight.requires_grad = False

        # Building blocks of the feed-forward part.
        self.large_dropout = nn.Dropout(p=0.265)
        self.small_dropout = nn.Dropout(p=0.15)
        self.nonlinear = nn.Sigmoid()
        self.hidden = nn.Linear(embedding_dim, h1_dim)
        self.batch_norm = nn.BatchNorm1d(h1_dim)

        # Maps the hidden representation to one score per answer, followed by
        # batch norm and dropout.
        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15)
        )

    def _pool(self, embed, lengths, batch_size):
        """Sum ``embed`` over dim 1 and divide by ``lengths`` viewed as (batch_size, -1)."""
        totals = embed.sum(1)
        return totals / lengths.view(batch_size, -1)

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums):
        """Score every answer for the batch in ``input_['text']``.

        ``lengths`` is updated in place: entries that are not yet Variables
        are wrapped as float Variables.  ``qnums`` is accepted but not used.
        Returns None when the model has no text field.
        """
        for name in lengths:
            current = lengths[name]
            if isinstance(current, Variable):
                continue
            lengths[name] = Variable(current.float(), volatile=not self.training)

        if self.text_field is None:
            return None

        tokens = input_['text']
        word_vectors = self.text_embeddings(tokens)
        pooled = self._pool(word_vectors, lengths['text'].float(), tokens.size(0))
        projected = self.hidden(self.small_dropout(pooled))
        activated = self.nonlinear(self.large_dropout(self.batch_norm(projected)))
        return self.classifier(activated)

In [None]:
class ConfidenceLearner(nn.Module):
    """Learns one weight in (0, 1) per position from an embedding and its confidence.

    The two inputs are concatenated on the last axis, projected to a single
    value by a linear layer and squashed with a sigmoid.
    """

    def __init__(self, embeddings_dim, confidences_dim):
        super().__init__()
        input_width = embeddings_dim + confidences_dim
        self.transform = nn.Linear(input_width, 1)

    def forward(self, embeds, confs):
        """Return ``sigmoid(linear(cat(embeds, confs)))`` with a trailing axis of size 1."""
        joined = torch.cat((embeds, confs), -1)
        projected = self.transform(joined)
        return torch.sigmoid(projected)
    
    
# Training plan: train with SimpleConfidenceLearner first, then freeze the embeddings and train ConfidenceLearner.
# To carry the weights over, loop through each parameter and set its weights equal to the trained value.

class SimpleConfidenceLearner(nn.Module):
    """Parameter-free confidence module: passes the given confidences straight through.

    It has the same constructor and call signature as ConfidenceLearner but
    learns nothing; the only work done is adding an axis at position 2 so the
    result can be multiplied with an embedding tensor.
    """

    def __init__(self, embeddings_dim, confidences_dim):
        # Both arguments are accepted and ignored; there is nothing to build.
        super().__init__()

    def forward(self, embeds, confs):
        """Return ``confs`` with a new axis inserted at dim 2; ``embeds`` is ignored."""
        return torch.unsqueeze(confs, 2)
    
class DAN_Confidences(nn.Module):
    """Deep averaging network whose word vectors are scaled by per-word confidences.

    Each embedding is multiplied by the output of ``self.confidences`` (a
    SimpleConfidenceLearner here) before the length-normalised sum.  Unlike
    DAN, the embedding table is left trainable.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super().__init__()
        vocab = text_field.vocab
        self.text_vocab_size = len(vocab)
        pad_index = vocab.stoi[text_field.pad_token]
        self.text_embeddings = nn.Embedding(
            self.text_vocab_size, embedding_dim, padding_idx=pad_index
        )
        self.text_field = text_field

        # Unknown token := mean vector (written into vocab.vectors in place),
        # then the whole table is copied to the GPU.
        unk_index = vocab.stoi[text_field.unk_token]
        vocab.vectors[unk_index] = vocab.vectors.mean(0)
        self.text_embeddings.weight.data = vocab.vectors.cuda()

        # Embeddings are deliberately not frozen in this variant
        # (weight.requires_grad is left at its default).

        # Produces the per-word weight from the embedding and its confidence.
        self.confidences = SimpleConfidenceLearner(embedding_dim, 1)

        # Building blocks of the feed-forward part.
        self.large_dropout = nn.Dropout(p=0.265)
        self.small_dropout = nn.Dropout(p=0.15)

        self.nonlinear = nn.Sigmoid()
        self.hidden = nn.Linear(embedding_dim, h1_dim)
        self.batch_norm = nn.BatchNorm1d(h1_dim)

        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),  # becomes h1_dim+1 if a confidence feature is appended
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15)
        )

    def _pool(self, embed, lengths, batch_size):
        """Sum ``embed`` over dim 1 and divide by ``lengths`` viewed as (batch_size, -1)."""
        totals = embed.sum(1)
        return totals / lengths.view(batch_size, -1)

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums, confidences):
        """Score every answer for the batch, weighting each word by its confidence.

        ``lengths`` is updated in place (non-Variable entries are wrapped as
        float Variables).  ``confidences`` is moved to the GPU here.
        ``qnums`` is accepted but not used.  Returns None when the model has
        no text field.
        """
        for name in lengths:
            current = lengths[name]
            if isinstance(current, Variable):
                continue
            lengths[name] = Variable(current.float(), volatile=not self.training)

        if self.text_field is None:
            return None

        tokens = input_['text']
        word_vectors = self.text_embeddings(tokens)
        word_weights = self.confidences(word_vectors, Variable(confidences).cuda())
        weighted_vectors = word_vectors * word_weights
        pooled = self._pool(weighted_vectors, lengths['text'].float(), tokens.size(0))
        projected = self.hidden(self.small_dropout(pooled))
        activated = self.nonlinear(self.large_dropout(self.batch_norm(projected)))
        return self.classifier(activated)

        # Notes for loading pre-trained weights into a widened layer:
        #layer.weights.data[:, -1] = pretrainedweights
        #layer.bias.data[: -1]

In [None]:
class DAN_Confidences_Softmax(nn.Module):
    """Deep averaging network with the mean word confidence appended as a feature.

    The text path matches the plain averaging network (trainable embeddings);
    just before the classifier the per-example mean of ``confidences`` is
    concatenated to the hidden representation, so the classifier's linear
    layer takes ``h1_dim + 1`` inputs.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super().__init__()
        vocab = text_field.vocab
        self.text_vocab_size = len(vocab)
        pad_index = vocab.stoi[text_field.pad_token]
        self.text_embeddings = nn.Embedding(
            self.text_vocab_size, embedding_dim, padding_idx=pad_index
        )
        self.text_field = text_field

        # Unknown token := mean vector (written into vocab.vectors in place),
        # then the whole table is copied to the GPU.
        unk_index = vocab.stoi[text_field.unk_token]
        vocab.vectors[unk_index] = vocab.vectors.mean(0)
        self.text_embeddings.weight.data = vocab.vectors.cuda()

        # Embeddings are deliberately not frozen in this variant
        # (weight.requires_grad is left at its default).

        # Building blocks of the feed-forward part.
        self.large_dropout = nn.Dropout(p=0.265)
        self.small_dropout = nn.Dropout(p=0.15)

        self.nonlinear = nn.Sigmoid()
        self.hidden = nn.Linear(embedding_dim, h1_dim)
        self.batch_norm = nn.BatchNorm1d(h1_dim)

        self.classifier = nn.Sequential(
            nn.Linear(h1_dim+1, answer_size),  # +1 for the appended mean confidence
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15)
        )

    def _pool(self, embed, lengths, batch_size):
        """Sum ``embed`` over dim 1 and divide by ``lengths`` viewed as (batch_size, -1)."""
        totals = embed.sum(1)
        return totals / lengths.view(batch_size, -1)

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums, confidences):
        """Score every answer from the pooled text plus the mean confidence.

        ``lengths`` is updated in place (non-Variable entries are wrapped as
        float Variables).  ``confidences`` is moved to the GPU here.
        ``qnums`` is accepted but not used.  Returns None when the model has
        no text field.
        """
        for name in lengths:
            current = lengths[name]
            if isinstance(current, Variable):
                continue
            lengths[name] = Variable(current.float(), volatile=not self.training)

        if self.text_field is None:
            return None

        tokens = input_['text']
        word_vectors = self.text_embeddings(tokens)
        # The original author noted this transfer should happen elsewhere.
        confidences = Variable(confidences).cuda()
        pooled = self._pool(word_vectors, lengths['text'].float(), tokens.size(0))
        projected = self.hidden(self.small_dropout(pooled))
        activated = self.nonlinear(self.large_dropout(self.batch_norm(projected)))
        mean_confidence = confidences.mean(dim=1).unsqueeze(-1)
        return self.classifier(torch.cat((activated, mean_confidence), 1))

In [None]:
class DAN_WeightedMean(nn.Module):
    """Deep averaging network that pools with a confidence-weighted mean.

    Instead of dividing the summed embeddings by the sequence length, each
    word vector is weighted by its confidence and the sum is divided by the
    total confidence of the example.  Embeddings are left trainable.
    """

    # Lower bound on the per-example confidence total, so an example whose
    # confidences are all zero pools to zeros instead of NaN (0 / 0).
    _MIN_CONFIDENCE_TOTAL = 1e-8

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super(DAN_WeightedMean, self).__init__()
        text_vocab = text_field.vocab
        self.text_vocab_size = len(text_vocab)
        text_pad_idx = text_vocab.stoi[text_field.pad_token]
        self.text_embeddings = nn.Embedding(self.text_vocab_size, embedding_dim, padding_idx=text_pad_idx)
        self.text_field = text_field

        # Unknown token := mean vector (written into vocab.vectors in place),
        # then the whole table is copied to the GPU.
        mean_emb = text_vocab.vectors.mean(0)
        text_vocab.vectors[text_vocab.stoi[text_field.unk_token]] = mean_emb
        self.text_embeddings.weight.data = text_vocab.vectors.cuda()

        # Embeddings are deliberately not frozen in this variant.
        #self.text_embeddings.weight.requires_grad = False

        #layers used in the network
        self.large_dropout = nn.Dropout(p=0.265)
        self.small_dropout = nn.Dropout(p=0.15)

        self.nonlinear = nn.Sigmoid()
        self.hidden = nn.Linear(embedding_dim, h1_dim)
        self.batch_norm = nn.BatchNorm1d(h1_dim)

        # Learned-confidence variation (disabled):
        #self.confidences = ConfidenceLearner(embedding_dim, 1)

        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15)
        )

    def _pool(self,  embed, lengths, confidences):
        """Return the confidence-weighted mean of ``embed`` over dim 1.

        ``lengths`` is accepted for signature parity with the other models'
        ``_pool`` and is not used.  The confidence total is clamped from
        below so an all-zero row yields zeros rather than NaN.
        """
        weights = confidences.unsqueeze(2).expand_as(embed)
        weighted_sum = (embed * weights).sum(1)
        totals = confidences.sum(dim = 1).clamp(min=self._MIN_CONFIDENCE_TOTAL if self is not None
                                                 else DAN_WeightedMean._MIN_CONFIDENCE_TOTAL)
        return weighted_sum / totals.unsqueeze(1).expand_as(weighted_sum)

        # For the learned variation, feed the ConfidenceLearner output into
        # this method; the dimensions are the same, only the confidence
        # values are learned.

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums, confidences):
        """Score every answer for the batch using confidence-weighted pooling.

        ``lengths`` is updated in place (non-Variable entries are wrapped as
        float Variables).  ``confidences`` is moved to the GPU here.
        ``qnums`` is accepted but not used.  Returns None when the model has
        no text field.
        """
        for key in lengths:
            if not isinstance(lengths[key], Variable):
                lengths[key] = Variable(lengths[key].float(), volatile=not self.training)

        if self.text_field is not None:
            text_input = input_['text']
            embed = self.text_embeddings(text_input)
            confidences = Variable(confidences).cuda()
            # Learned-confidence variation (disabled):
            #confidences = self.confidences(embed, confidences).squeeze()
            averaged = self._pool(embed, lengths['text'].float(), confidences)
            averaged_dropped = self.small_dropout(averaged)
            hidden_layer = self.hidden(averaged_dropped)
            batchnorm_dropped = self.large_dropout(self.batch_norm(hidden_layer))
            nonlinear = self.nonlinear(batchnorm_dropped)
            # Fix: the classifier previously received batchnorm_dropped, which
            # silently skipped the sigmoid that every sibling model applies.
            return self.classifier(nonlinear)

In [None]:
class RNN(nn.Module):
    """Single-layer GRU encoder over pre-trained word vectors, followed by a classifier.

    The final GRU hidden state is batch-normalised, passed through dropout
    and mapped to one score per answer.  Embeddings are left trainable.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super(RNN, self).__init__()
        #initialize the vocab from the text field (passed in from train_iter) and pad
        text_vocab = text_field.vocab
        self.text_vocab_size = len(text_vocab)
        text_pad_idx = text_vocab.stoi[text_field.pad_token]

        #fill the embedding table from the vocabulary's pre-trained vectors
        self.text_embeddings = nn.Embedding(self.text_vocab_size, embedding_dim, padding_idx=text_pad_idx)
        self.text_field = text_field

        #set the unknown items to the mean embedding and make them cuda()
        mean_emb = text_vocab.vectors.mean(0)
        text_vocab.vectors[text_vocab.stoi[text_field.unk_token]] = mean_emb
        self.text_embeddings.weight.data = text_vocab.vectors.cuda()

        #embeddings are deliberately not frozen in this variant
        #self.text_embeddings.weight.requires_grad = False 

        #initiate hidden dimensions and allow them to be referenced by init_hidden
        self.hidden_dim = h1_dim
        self.hidden = self.init_hidden()
        self.rnn = nn.GRU(embedding_dim, h1_dim)
        self.large_dropout = nn.Dropout(p=0.265)
        self.batch = nn.BatchNorm1d(h1_dim)
        # Defined for parity with the other models; forward() does not apply it.
        self.nonlinearity = nn.Sigmoid()

        #the classifier converts the hidden dimensions into the answers.  
        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15))

    def init_hidden(self, batch_size = 32):
        """Return a pair of zero tensors of shape (1, batch_size, hidden_dim) on the GPU.

        Only used to give ``self.hidden`` an initial value: forward() calls
        the GRU without an explicit initial state and then overwrites
        ``self.hidden`` with the GRU's final state.
        """
        return (Variable(torch.zeros(1, batch_size, self.hidden_dim ).cuda()),
                Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()))

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums):
        """Score every answer for the batch in ``input_['text']``.

        ``lengths['text']`` is handed to ``pack_padded_sequence`` with its
        default settings, which expect the batch sorted by decreasing length.
        ``qnums`` is accepted but not used.
        """
        text_input = input_['text']
        embed = self.text_embeddings(text_input)

        # pack_padded_sequence wants plain integers held on the CPU.
        seq_lengths = [int(n) for n in lengths['text'].cpu()]
        packed = pack_padded_sequence(embed, seq_lengths, batch_first=True)
        out_packed, self.hidden = self.rnn(packed)

        # self.hidden is (num_layers, batch, hidden).  Index the last layer
        # instead of squeeze(): squeeze() also removed the batch axis for a
        # batch of one, which BatchNorm1d then rejected.
        final_state = self.hidden[-1]
        x = self.large_dropout(self.batch(final_state))
        x = self.classifier(x)
        return x

In [None]:
class RNN_Softmax(nn.Module):
    """Single-layer GRU encoder with the mean word confidence appended as a feature.

    The final GRU hidden state is batch-normalised and passed through
    dropout; the per-example mean of ``confidences`` is then concatenated to
    it, so the classifier's linear layer takes ``h1_dim + 1`` inputs.
    Embeddings are left trainable.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super(RNN_Softmax, self).__init__()
        #initialize the vocab from the text field (passed in from train_iter) and pad
        text_vocab = text_field.vocab
        self.text_vocab_size = len(text_vocab)
        text_pad_idx = text_vocab.stoi[text_field.pad_token]

        #fill the embedding table from the vocabulary's pre-trained vectors
        self.text_embeddings = nn.Embedding(self.text_vocab_size, embedding_dim, padding_idx=text_pad_idx)
        self.text_field = text_field

        #set the unknown items to the mean embedding and make them cuda()
        mean_emb = text_vocab.vectors.mean(0)
        text_vocab.vectors[text_vocab.stoi[text_field.unk_token]] = mean_emb
        self.text_embeddings.weight.data = text_vocab.vectors.cuda()

        #embeddings are deliberately not frozen in this variant
        #self.text_embeddings.weight.requires_grad = False 

        #initiate hidden dimensions and allow them to be referenced by init_hidden
        self.hidden_dim = h1_dim
        self.hidden = self.init_hidden()
        self.rnn = nn.GRU(embedding_dim, h1_dim)
        self.large_dropout = nn.Dropout(p=0.265)
        self.batch = nn.BatchNorm1d(h1_dim)
        # Defined for parity with the other models; forward() does not apply it.
        self.nonlinearity = nn.Sigmoid()

        #the classifier converts the hidden dimensions (+1 confidence feature) into the answers.  
        self.classifier = nn.Sequential(
            nn.Linear(h1_dim+1, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15))

    def init_hidden(self, batch_size = 32):
        """Return a pair of zero tensors of shape (1, batch_size, hidden_dim) on the GPU.

        Only used to give ``self.hidden`` an initial value: forward() calls
        the GRU without an explicit initial state and then overwrites
        ``self.hidden`` with the GRU's final state.
        """
        return (Variable(torch.zeros(1,batch_size, self.hidden_dim).cuda()),
                Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()))

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums, confidences):
        """Score every answer from the GRU state plus the mean confidence.

        ``lengths['text']`` is handed to ``pack_padded_sequence`` with its
        default settings, which expect the batch sorted by decreasing length.
        ``confidences`` is moved to the GPU here.  ``qnums`` is accepted but
        not used.
        """
        confidences = Variable(confidences).cuda()
        text_input = input_['text']
        embed = self.text_embeddings(text_input)

        # pack_padded_sequence wants plain integers held on the CPU.
        seq_lengths = [int(n) for n in lengths['text'].cpu()]
        packed = pack_padded_sequence(embed, seq_lengths, batch_first=True)
        out_packed, self.hidden = self.rnn(packed)

        # self.hidden is (num_layers, batch, hidden).  Index the last layer
        # instead of squeeze(): squeeze() also removed the batch axis for a
        # batch of one, breaking BatchNorm1d and the concatenation below.
        final_state = self.hidden[-1]
        x = self.large_dropout(self.batch(final_state))
        expanded = torch.cat((x, confidences.mean(dim=1).unsqueeze(-1)), 1)
        x = self.classifier(expanded)
        return x

In [None]:
class RNN_Confidences(nn.Module):
    """Single-layer LSTM encoder whose input word vectors are scaled by confidences.

    Each embedding is multiplied by the output of ``self.confidences`` (a
    SimpleConfidenceLearner here) before being fed to the LSTM.  The final
    hidden state is batch-normalised, passed through dropout and classified.
    Embeddings are left trainable.
    """

    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super(RNN_Confidences, self).__init__()
        #initialize the vocab from the text field (passed in from train_iter) and pad
        text_vocab = text_field.vocab
        self.text_vocab_size = len(text_vocab)
        text_pad_idx = text_vocab.stoi[text_field.pad_token]

        #fill the embedding table from the vocabulary's pre-trained vectors
        self.text_embeddings = nn.Embedding(self.text_vocab_size, embedding_dim, padding_idx=text_pad_idx)
        self.text_field = text_field

        #set the unknown items to the mean embedding and make them cuda()
        mean_emb = text_vocab.vectors.mean(0)
        text_vocab.vectors[text_vocab.stoi[text_field.unk_token]] = mean_emb
        self.text_embeddings.weight.data = text_vocab.vectors.cuda()

        #confidences are learned from word_embeddings and respective word_confidence
        self.confidences = SimpleConfidenceLearner(embedding_dim, 1)

        #embeddings are deliberately not frozen in this variant
        #self.text_embeddings.weight.requires_grad = False 

        #initiate hidden dimensions and allow them to be referenced by init_hidden
        self.hidden_dim = h1_dim
        self.hidden = self.init_hidden()
        self.rnn = nn.LSTM(embedding_dim, h1_dim)
        self.large_dropout = nn.Dropout(p=0.265)
        self.batch = nn.BatchNorm1d(h1_dim)
        # Defined for parity with the other models; forward() does not apply it.
        self.nonlinearity = nn.Sigmoid()

        #the classifier converts the hidden dimensions into the answers.  
        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15))

    def init_hidden(self, batch_size = 32):
        """Return zero (h_0, c_0) tensors of shape (1, batch_size, hidden_dim) on the GPU.

        Only used to give ``self.hidden`` an initial value: forward() calls
        the LSTM without an explicit initial state and then overwrites
        ``self.hidden`` with the LSTM's final (h_n, c_n).
        """
        return (Variable(torch.zeros(1,batch_size, self.hidden_dim).cuda()),
                Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()))

    def forward(self, input_: Dict[str, Variable], lengths: Dict, qnums, confidences):
        """Score every answer for the batch, weighting each word by its confidence.

        ``lengths['text']`` is handed to ``pack_padded_sequence`` with its
        default settings, which expect the batch sorted by decreasing length.
        ``confidences`` is moved to the GPU here.  ``qnums`` is accepted but
        not used.
        """
        text_input = input_['text']
        embed = self.text_embeddings(text_input)

        confidences = Variable(confidences).cuda()
        confidences = self.confidences(embed, confidences)
        multiplied = embed * confidences

        # pack_padded_sequence wants plain integers held on the CPU.
        seq_lengths = [int(n) for n in lengths['text'].cpu()]
        packed = pack_padded_sequence(multiplied, seq_lengths, batch_first=True)
        out_packed, self.hidden = self.rnn(packed)

        # self.hidden is (h_n, c_n), each (num_layers, batch, hidden).  Index
        # the last layer of h_n instead of squeeze(): squeeze() also removed
        # the batch axis for a batch of one, which BatchNorm1d then rejected.
        h_n = self.hidden[0]
        x = self.large_dropout(self.batch(h_n[-1]))
        x = self.classifier(x)
        return x

In [4]:
# Re-execute dataset_confidence so edits made to the module after its first
# import are picked up without restarting the kernel.
import importlib
importlib.reload(dataset_confidence)
# QuizBowl.iters returns three values, bound here as the train / validation /
# dev iterators.  NOTE(review): the "_asr" suffix presumably marks the
# ASR-transcribed data -- confirm against dataset_confidence.
# Arguments tagged "#irrelevant" were marked by the original author as having
# no effect on this configuration.
train_iter_asr, val_iter_asr, dev_iter_asr = dataset_confidence.QuizBowl.iters(
            batch_size=512,
            lower= True,
            use_wiki=False,  #irrelevant
            n_wiki_sentences=5, #irrelevant 
            replace_title_mentions='',
            combined_ngrams=True,
            unigrams=True, 
            bigrams=False, #irrelevant 
            trigrams=False, #irrelevant 
            combined_max_vocab_size=300000,
            unigram_max_vocab_size= None, 
            bigram_max_vocab_size=50000, #irrelevant 
            trigram_max_vocab_size=50000 #irrelevant 
        )

In [None]:
def run_epoch(iterator: Iterator, CONFIDENCE = False):
    """Run one pass over ``iterator`` and return ``(mean accuracy, mean loss, seconds)``.

    The function relies on notebook-level globals: ``model`` and
    ``loss_function`` always, and ``optimizer`` when ``iterator.train`` is
    true.  In that case each batch is back-propagated, gradients are clipped
    to a norm of 0.25 and the optimizer takes a step.  Switching the model
    between train and eval mode is left to the caller.

    Args:
        iterator: yields batches exposing ``page``, ``qnum``, optionally
            ``text`` as a ``(tokens, lengths)`` pair, and ``confidence``
            when ``CONFIDENCE`` is set.
        CONFIDENCE: when true, ``batch.confidence`` is passed to the model as
            a fourth argument.

    Returns:
        Mean per-batch accuracy, mean per-batch loss and the elapsed
        wall-clock time in seconds.
    """
    def _scalar(value):
        # torch >= 0.4 exposes .item(); indexing a 0-dim tensor with
        # .data[0] raises there, while older releases only support .data[0].
        return value.item() if hasattr(value, 'item') else value.data[0]

    # Prefer the in-place clip_grad_norm_; fall back to the older name on
    # releases that predate it.
    clip_gradients = (getattr(torch.nn.utils, 'clip_grad_norm_', None)
                      or torch.nn.utils.clip_grad_norm)

    is_train = iterator.train
    batch_accuracies = []
    batch_losses = []
    epoch_start = time.time()
    for batch in iterator:
        input_dict = {}
        lengths_dict = {}
        if hasattr(batch, 'text'):
            text, lengths = batch.text
            input_dict['text'] = text
            lengths_dict['text'] = lengths

        page = batch.page
        qnums = batch.qnum.cuda()

        if is_train:
            model.zero_grad()

        if CONFIDENCE:
            confidences = batch.confidence
            out = model(input_dict, lengths_dict, qnums, confidences)
        else:
            out = model(input_dict, lengths_dict, qnums)

        _, preds = torch.max(out, 1)

        accuracy = _scalar(torch.mean(torch.eq(preds, page).float()))
        batch_loss = loss_function(out, page)
        if is_train:
            batch_loss.backward()
            clip_gradients(model.parameters(), .25)
            optimizer.step()

        batch_accuracies.append(accuracy)
        batch_losses.append(_scalar(batch_loss))

    epoch_end = time.time()

    return np.mean(batch_accuracies), np.mean(batch_losses), epoch_end - epoch_start

In [None]:
#Set the dimensions and epochs for model
EMBEDDING_DIM = 300
HIDDEN_DIM = 1000
EPOCH = 50

# Optimiser settings.  NOTE(review): the original cell used `optimizer` and
# `scheduler` without defining them anywhere in the notebook, so it only ran
# on leftover kernel state.  The values below are placeholders that make the
# cell run on a fresh kernel -- confirm them against the intended experiment.
LEARNING_RATE = 1e-3
LR_STEP_SIZE = 10
LR_GAMMA = 1.0  # 1.0 leaves the learning rate unchanged; set below 1.0 to decay

#extract fields to determine vocabulary size of answers
#UPDATE THIS BETWEEN CLEAN AND ASR
fields: Dict[str, Field] = train_iter_asr.dataset.fields
page_field = fields['page']
ANSWER_SIZE = len(page_field.vocab.stoi)
 
model = DAN(EMBEDDING_DIM,
             HIDDEN_DIM,
             fields['text'], 
             ANSWER_SIZE)
model = model.cuda()

loss_function = nn.CrossEntropyLoss()

# Only parameters that require gradients are optimised; DAN freezes its
# embedding table.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                       lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=LR_STEP_SIZE, gamma=LR_GAMMA)

# Per-epoch history, read by the plotting cell that follows.
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

for i in tqdm_notebook(range(EPOCH)):  
    # Kept at the start of the epoch as in the original cell.  Newer PyTorch
    # releases warn about stepping the scheduler before the optimizer.
    scheduler.step()
    #train
    model.train()
    train_acc, train_loss, train_time = run_epoch(train_iter_asr)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)

    #validate
    model.eval()
    val_acc, val_loss, val_time = run_epoch(val_iter_asr)    
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)
    print (val_acc, val_loss)

In [None]:
# Plot the values collected in val_accuracies (one entry is appended per
# epoch by the training loop) against their index.
plt.title("Model Accuracies")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.plot(val_accuracies)

In [None]:
# Evaluate the model on dev_iter_asr with the model switched to eval mode.
# The results are named test_* although they come from the dev iterator.
model.eval()
test_acc, test_loss, test_time= run_epoch(dev_iter_asr)
# Bare expression so the notebook displays the accuracy.
test_acc