## Dependencies - helpful Python tools, PyTorch

In [1]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

## Data Ingestion

In [None]:
# Data was created earlier in another script.
# Layout of the JSON payload: {"data": [example, ...]} where each example is
#   [ [[w1, c1], [w2, c2], ...], answer ]
# (presumably w = word and c = its confidence -- confirm against the generating script).

# Sizes of the sequential train / test split.
TRAIN_SIZE = 4000
TEST_SIZE = 1000

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open('word_confidences.json', encoding='utf-8') as fp:
    data = json.load(fp)
data = data['data']
X = [d[0] for d in data]
y = [d[1] for d in data]

# First TRAIN_SIZE examples are used for training, the next TEST_SIZE for testing.
X_train_raw = X[:TRAIN_SIZE]
y_train = y[:TRAIN_SIZE]
X_test_raw = X[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]
y_test = y[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

# TODO: add shuffling (with a fixed seed) before splitting; the split above
# follows file order as-is.

## Helper Functions - read GloVe embeddings and convert words into Torch tensors

In [None]:
def read_embeddings(input_file='pytorch-reading-group/data/glove.6B.50d.txt', length=50):
    """Load word vectors from a whitespace-separated text file.

    Every usable line holds a token followed by `length` numeric components.
    Blank lines and lines whose component count differs from `length` are
    skipped.

    Args:
        input_file: path to the embeddings text file.
        length: expected number of components per vector.

    Returns:
        A tuple `(vectors, word_to_ix, ix_to_word)`:
        `vectors` is a float numpy array of shape `(n_words, length)`
        (`(0, length)` when no line was usable), `word_to_ix` maps each token
        to its row index, and `ix_to_word` lists the tokens in row order.

    Raises:
        ValueError: if a line with the right number of components contains a
            component that cannot be parsed as a float.
    """
    vectors, word_to_ix, ix_to_word = [], {}, []
    # Decode as UTF-8 explicitly rather than with the platform default, which
    # can fail on non-ASCII tokens.
    with open(input_file, encoding='utf-8') as fp:
        for line in fp:
            fields = line.split()
            # a blank line has no token to unpack
            if not fields:
                continue
            # get the word and the vector
            word, *components = fields
            # ensure that the vector is the correct length *before* parsing,
            # so malformed lines are skipped instead of raising
            if len(components) != length:
                continue
            vector = [float(i) for i in components]
            word_to_ix[word] = len(ix_to_word)
            ix_to_word.append(word)
            vectors.append(vector)
    if not vectors:
        # keep the result 2-D even when nothing was loaded
        return np.empty((0, length)), word_to_ix, ix_to_word
    return np.array(vectors), word_to_ix, ix_to_word

# Read the embedding table with the default file/length, then turn the numpy
# matrix into a float32 torch tensor.
vectors, word_to_ix, ix_to_word = read_embeddings()
vectors = torch.from_numpy(vectors).to(torch.float32)

def prepare_sequence(seq, to_ix):
    """Translate the items of `seq` into their indices from `to_ix`.

    Items that are not keys of `to_ix` are silently dropped, so the result
    can be shorter than `seq` (and empty when nothing matches).

    Args:
        seq: iterable of hashable items (e.g. word strings).
        to_ix: mapping from item to integer index.

    Returns:
        A 1-D LongTensor of indices, wrapped in `Variable`.
    """
    known_indices = [to_ix[token] for token in seq if token in to_ix]
    return Variable(torch.LongTensor(known_indices))

## Model

In [None]:
class ConfidenceLearner(nn.Module):
    """Learn one weight in (0, 1) per word from its embedding and confidence.

    The embedding and the confidence value(s) of each word are concatenated
    and passed through a single linear unit followed by a sigmoid.

    Args:
        n_embeddings: size of one word embedding (last dimension of `embeds`).
        n_confidences: number of confidence values supplied per word.

    Both arguments have defaults (50-d embeddings, one confidence per word) so
    that the module can also be built with no arguments.
    """

    def __init__(self, n_embeddings=50, n_confidences=1):
        # nn.Module must be initialised before sub-modules can be assigned
        super(ConfidenceLearner, self).__init__()
        self.n_confidences = n_confidences
        self.transform = nn.Linear(n_embeddings + n_confidences, 1)

    def forward(self, embeds, confs):
        """Return the learned per-word weights.

        Args:
            embeds: tensor whose last dimension is `n_embeddings`,
                e.g. `(n_words, n_embeddings)`.
            confs: tensor holding `n_confidences` values per row of `embeds`;
                `(n_words,)` is accepted when `n_confidences == 1`.

        Returns:
            Tensor shaped like `embeds` with the last dimension replaced by 1,
            every value squashed by a sigmoid.
        """
        # give confs the same leading shape and dtype as embeds so that the
        # two can be joined along the feature dimension
        confs = confs.to(embeds.dtype).reshape(*embeds.shape[:-1], self.n_confidences)
        concat = torch.cat((embeds, confs), dim=-1)
        return torch.sigmoid(self.transform(concat))

In [None]:
class DAN(nn.Module):
    """Averaging network over confidence-weighted, frozen word embeddings.

    Each word embedding is scaled by a weight learned from the embedding and
    the word's confidence, the scaled embeddings are averaged over the words,
    and the average is passed through two linear layers and a log-softmax
    over the answers.

    Args:
        embedding_dim: size of one word embedding.
        h1_dim: input size of the first linear layer. That layer is applied to
            the averaged embedding, so this must equal `embedding_dim`.
        h2_dim: output size of the first / input size of the second layer.
        vocab_size: number of rows in the embedding table.
        answer_size: number of answer classes.

    Raises:
        ValueError: if `h1_dim` differs from `embedding_dim`.

    Note: the embedding weights are taken from the module-level `vectors`
    tensor, which must exist before the model is constructed.
    """

    def __init__(self, embedding_dim,
                 h1_dim, h2_dim, vocab_size, answer_size):
        super(DAN, self).__init__()
        # h1 = Linear(h1_dim, h2_dim) consumes the pooled embeddings; fail
        # early with a clear message instead of a shape error in forward().
        if h1_dim != embedding_dim:
            raise ValueError(
                "h1_dim (%d) must equal embedding_dim (%d): the first linear "
                "layer is applied to the averaged word embeddings"
                % (h1_dim, embedding_dim))

        # load embeddings from GloVe and freeze them
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight.data = vectors
        self.word_embeddings.weight.requires_grad = False

        # confidences are learned from word_embeddings and respective word_confidence
        # (one confidence value per word)
        self.confidences = ConfidenceLearner(embedding_dim, 1)

        self.h1 = nn.Linear(h1_dim, h2_dim)
        self.h2 = nn.Linear(h2_dim, answer_size)

    def forward(self, x):
        """Score the answers for one example.

        Args:
            x: pair `(idx, confs)`; `idx` is a 1-D LongTensor of word indices
                and `confs` holds one confidence per index.

        Returns:
            Log-probabilities of shape `(1, answer_size)`.
        """
        idx, confs = x
        embeds = self.word_embeddings(idx)              # (n_words, embedding_dim)
        confidence = self.confidences(embeds, confs)    # (n_words, 1)
        weighted = embeds * confidence                  # broadcast over the embedding dim
        # Average over the words, keeping a leading dimension of 1 so the
        # output has one row per example. max(..., 1) avoids 0/0 when no word
        # is present; the pooled vector is then all zeros.
        pooled = weighted.sum(dim=0, keepdim=True) / max(weighted.shape[0], 1)
        # NOTE(review): there is no non-linearity between h1 and h2, so the
        # two layers compose into a single linear map -- confirm intent.
        hidden = self.h1(pooled)
        answers = self.h2(hidden)
        score = F.log_softmax(answers, dim=1)
        return score

## Train

In [None]:
EMBEDDING_DIM = 50 #update to 300 later
HIDDEN_DIM = 300 #Pedro used 1000 but only one layer
EPOCH = 15
LEARNING_RATE = 0.003

# Answers are classification targets, so they get their own class indices
# instead of being looked up in the GloVe vocabulary (an answer may not be a
# GloVe token, and NLLLoss needs targets in [0, number_of_answers)).
# Assumes answers are hashable and mutually comparable (e.g. strings) -- TODO confirm.
possible_answers = sorted(set(y))
answer_to_ix = {answer: ix for ix, answer in enumerate(possible_answers)}

# DAN takes (embedding_dim, h1_dim, h2_dim, vocab_size, answer_size). Its first
# linear layer has h1_dim inputs and is fed embedding-sized vectors, hence
# h1_dim = EMBEDDING_DIM here.
model = DAN(EMBEDDING_DIM, EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(possible_answers))
loss_function = nn.NLLLoss()
# the GloVe embeddings are frozen, so only hand trainable parameters to SGD
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
losses = []  # mean training loss per epoch

for i in range(EPOCH):
    epoch_loss, n_examples = 0.0, 0
    for index, sentence in tqdm_notebook(enumerate(X_train_raw), total = len(X_train_raw)):
        # Each sentence is a list of [word, confidence] pairs. Drop
        # out-of-vocabulary words here so the indices and the confidences
        # stay aligned one-to-one.
        known = [(w, c) for w, c in sentence if w in word_to_ix]
        if not known:
            # nothing to embed for this example
            continue
        words, confs = zip(*known)

        model.zero_grad()
        idxs = prepare_sequence(words, word_to_ix)
        # assumes confidences are numeric -- TODO confirm
        confs = torch.FloatTensor(confs)
        # one class index per example, shape (1,)
        correct_output = torch.LongTensor([answer_to_ix[y_train[index]]])

        # the model takes a (word indices, confidences) pair
        output = model((idxs, confs))
        loss = loss_function(output, correct_output)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_examples += 1
    losses.append(epoch_loss / max(n_examples, 1))