In [43]:
import torch
import json
import torch.nn as nn
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import gensim #For word2vec
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import time
import nltk
import random
from numpy.random import choice as randomchoice
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import sys
# import torchtext
import pickle
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [44]:
import gensim.downloader as api
import torchtext.vocab as vocab
# Pretrained GloVe vectors ("glove-wiki-gigaword-200") loaded through gensim's
# downloader API; they are used further down to seed the embedding table
# (looked up with dimension 200 when `wordvec` is built).
glove_model = api.load("glove-wiki-gigaword-200")

In [45]:

# Wall-clock reference taken before any data preparation starts.
train_start_time = time.time()

# Locations of the three JSON splits.
train_file = 'data/train.json'
val_file = 'data/dev.json'
test_file = 'data/test.json'

# Characters that `tokenize` always emits as stand-alone tokens.
punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Base tokenizer; `tokenize` refines its output further.
tokenize_func = nltk.tokenize.WordPunctTokenizer().tokenize
def is_numeric(s):
    """Return True when the token `s` spells an actual number.

    A token counts as numeric when `float()` can parse it AND it contains at
    least one digit. The digit requirement matters because `float()` also
    accepts the spellings "nan", "inf" and "infinity"; those are ordinary
    word tokens and must not be rewritten to <NUM> by `tokenize`.

    Args:
        s: the token to test (normally a string).

    Returns:
        bool: True for numeric tokens, False for everything else, including
        values `float()` rejects with a TypeError such as None.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    # str() keeps real int/float arguments working the way they did before.
    return any(ch.isdigit() for ch in str(s))
def tokenize(sentence, with_num = False):
    """Lower-case `sentence` and split it into word and punctuation tokens.

    The base tokenizer (`tokenize_func`) runs first; every chunk it yields is
    then split again so that each character found in `punctuations` becomes a
    token of its own.

    Args:
        sentence: raw text to tokenize.
        with_num: when True, numeric tokens are kept verbatim; when False
            (default) every token accepted by `is_numeric` becomes '<NUM>'.

    Returns:
        list[str]: the tokens in their original order.
    """
    pieces = []
    for chunk in tokenize_func(sentence.lower()):
        buffer = ''
        for ch in chunk:
            if ch not in punctuations:
                buffer += ch
                continue
            # Punctuation closes the pending word, then stands alone.
            if buffer:
                pieces.append(buffer)
                buffer = ''
            pieces.append(ch)
        if buffer:
            pieces.append(buffer)
    if with_num:
        return pieces
    return ['<NUM>' if is_numeric(piece) else piece for piece in pieces]

def tokenize_with_num(sentence):
    """Lower-case `sentence` and return the base tokenizer's output unchanged.

    Unlike `tokenize`, no per-character punctuation split and no <NUM>
    replacement is applied.
    """
    return tokenize_func(sentence.lower())

def get_embedding_index(sentences, model):
    """Convert each sentence into a tensor of vocabulary indices.

    Args:
        sentences: iterable of raw sentences.
        model: word -> index mapping, forwarded to
            `tokenize_and_get_embedding_index` as its `vocab` argument.

    Returns:
        list: one index tensor per sentence, in input order.
    """
    indexed = []
    for sentence in sentences:
        indexed.append(tokenize_and_get_embedding_index(sentence, model))
    return indexed

def tokenize_and_get_embedding_index(sentence, vocab, with_num = False):
    """Tokenize `sentence` and map every token to its vocabulary index.

    Tokens missing from `vocab` are mapped to the index of '<UNK>'.

    Args:
        sentence: raw text.
        vocab: dict mapping token -> integer index; must contain '<UNK>'.
        with_num: forwarded to `tokenize`; when False numbers become '<NUM>'.

    Returns:
        torch.Tensor: 1-D int64 tensor holding one index per token. The dtype
        is forced so that a sentence with no tokens still yields an (empty)
        index tensor rather than an empty float tensor.

    Raises:
        KeyError: if `vocab` has no '<UNK>' entry.
    """
    unk = vocab['<UNK>']
    tokens = tokenize(sentence, with_num = with_num)
    return torch.tensor([vocab.get(token, unk) for token in tokens], dtype=torch.long)

In [46]:
# Read the three JSON splits into memory.
with open(train_file) as train_handle:
    train_data = json.load(train_handle)

with open(val_file) as val_handle:
    val_data = json.load(val_handle)

with open(test_file) as test_handle:
    test_data = json.load(test_handle)

In [47]:
# Peek at the 'Problem' text of one training record to sanity-check the load.
train_data[1]['Problem']

'sophia finished 2 / 3 of a book . she calculated that she finished 90 more pages than she has yet to read . how long is her book ?'

In [48]:
def remove_last_extra(data):
    """Strip a single trailing '|' from every record's 'linear_formula'.

    Some formulas end with a stray '|'; this removes it in place. Empty
    formulas are left untouched instead of raising an IndexError.

    Args:
        data: list of dicts, each with a string under 'linear_formula'.

    Returns:
        The same `data` object (mutated in place), returned for convenience.
    """
    for record in data:
        formula = record['linear_formula']
        if formula.endswith('|'):
            record['linear_formula'] = formula[:-1]
    return data
# Clean every split in place; the return value is not needed.
for _split in (val_data, train_data, test_data):
    remove_last_extra(_split)

In [53]:
# Vocabulary = special tokens + every token seen in the training problems and
# their linear formulas.
vocabulary = set(['<START>', '<END>', '<PAD>', '<UNK>', '<NUM>'])
for data in train_data:
    vocabulary.update(tokenize(data['Problem']))
    vocabulary.update(tokenize(data['linear_formula']))

# Indices are assigned over the SORTED vocabulary. Iteration order of a set of
# strings can change between interpreter sessions (string hashing is
# randomized), which would make the pickled vocabulary / embedding matrix
# impossible to reproduce after a kernel restart.
vocab_index_to_word = dict(enumerate(sorted(vocabulary)))
vocab_word_to_index = {word: i for i, word in vocab_index_to_word.items()}
# Running count of words that had no pretrained vector.
rand_count = 0

def get_word_embedding(word, glove_vectors, dim):
    """Return the embedding vector to use for `word`.

    Args:
        word: token to look up.
        glove_vectors: pretrained vectors exposing `key_to_index` and
            item access by word.
        dim: length of the vector drawn when `word` has no pretrained entry.

    Returns:
        torch.Tensor: the pretrained vector when `word` is known; otherwise a
        vector of length `dim` drawn with `torch.rand`.

    Side effects:
        Increments the module-level `rand_count` on every out-of-vocabulary
        lookup.
    """
    global rand_count
    if word not in glove_vectors.key_to_index:
        rand_count += 1
        return torch.rand(dim)
    return torch.tensor(glove_vectors[word])
# Cache the indices of the special tokens for quick access.
start_index, pad_index, end_index, unk_index, num_index = (
    vocab_word_to_index[token]
    for token in ('<START>', '<PAD>', '<END>', '<UNK>', '<NUM>')
)

In [54]:
# Build the embedding matrix: row i is the vector of vocabulary word i.
rand_count = 0  # restart the out-of-vocabulary counter for this pass
wordvec = torch.stack([
    get_word_embedding(vocab_index_to_word[index], glove_model, 200)
    for index in range(len(vocabulary))
])
print("Random count is ", rand_count, " out of ", len(vocabulary))

Random count is  427  out of  7408


In [55]:
# Persist both vocabulary mappings together with the embedding matrix.
with open('data/mysaved.pkl', 'wb') as pickle_file:
    pickle.dump([vocab_word_to_index, vocab_index_to_word, wordvec], pickle_file)

In [217]:
class LSTM_on_words(nn.Module):
    """Encoder: frozen pretrained embedding followed by a (bi)LSTM.

    Both sub-modules are moved to the module-level `device` on construction.
    """

    def __init__(self, input_size, hidden_size, num_layers, wordvectors, padding_index,bidirectional=True, dropout=0.0):
        """
        Args:
            input_size: feature size fed to the LSTM (the embedding width).
            hidden_size: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            wordvectors: pretrained embedding matrix, one row per index.
            padding_index: index of the padding token in the embedding.
            bidirectional: whether the LSTM reads both directions.
            dropout: dropout passed to nn.LSTM.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        pretrained = torch.FloatTensor(wordvectors)
        # freeze=True: the pretrained vectors are not updated during training.
        self.embedding = nn.Embedding.from_pretrained(
            pretrained, padding_idx=padding_index, freeze=True
        ).to(device)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional,
        ).to(device)

    def forward(self, x, x_lengths):
        """Encode a padded batch of index sequences.

        Args:
            x: padded index tensor, batch first.
            x_lengths: true (unpadded) length of every sequence in `x`.

        Returns:
            (out, (hidden, cell)): the re-padded per-step LSTM outputs and
            the final LSTM state.
        """
        embedded = self.embedding(x)
        # Packing lets the LSTM skip padded positions; the batch need not be
        # sorted by length because enforce_sorted=False.
        packed = pack_padded_sequence(
            embedded, x_lengths, batch_first=True, enforce_sorted=False
        ).to(device)
        packed_out, state = self.lstm(packed)
        padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        hidden, cell = state
        return padded_out, (hidden, cell)

class FeedForward(nn.Module):
    """Stack of Linear layers with a ReLU between consecutive layers.

    No activation follows the last Linear, so its output can be treated as
    raw scores.
    """

    def __init__(self, input_size, layer_sizes):
        """
        Args:
            input_size: width of the input features.
            layer_sizes: output width of each Linear layer, in order.
        """
        super().__init__()
        self.ReLU = nn.ReLU(inplace=False)
        self.layers = []
        widths = [input_size] + list(layer_sizes)
        last_position = len(layer_sizes) - 1
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            self.layers.append(nn.Linear(fan_in, fan_out))
            # Every layer except the final (output) one is followed by ReLU.
            if position != last_position:
                self.layers.append(self.ReLU)
        self.all_layers = nn.Sequential(*self.layers)

    def forward(self, x):
        """Apply all layers to `x` and return the result."""
        return self.all_layers(x)

In [218]:
class mathDataset(Dataset):
    """Dataset yielding (problem, linear_formula) index tensors, each ending in <END>."""

    def __init__(self, data, vocab_word_to_index, vocab_index_to_word, wordvec):
        """
        Args:
            data: list of records with 'Problem' and 'linear_formula' strings.
            vocab_word_to_index: token -> index mapping (must contain '<END>').
            vocab_index_to_word: index -> token mapping (stored, not used here).
            wordvec: embedding matrix (stored, not used here).
        """
        self.data = data
        self.vocab_word_to_index = vocab_word_to_index
        self.vocab_index_to_word = vocab_index_to_word
        self.wordvec = wordvec

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the idx-th record as two 1-D index tensors.

        The <END> index is appended AFTER tokenization. Appending the literal
        text " <END>" before tokenizing does not work: `tokenize` lower-cases
        the text and emits every punctuation character separately, so the
        marker would come out as '<', 'end', '>' and the real <END> index
        would never appear in the sequence.
        """
        record = self.data[idx]
        end_marker = torch.tensor([self.vocab_word_to_index['<END>']], dtype=torch.long)
        problem = tokenize_and_get_embedding_index(record['Problem'], self.vocab_word_to_index)
        linear_formula = tokenize_and_get_embedding_index(record['linear_formula'], self.vocab_word_to_index)
        # .long() keeps the result an index tensor even for an empty text.
        problem = torch.cat((problem.long(), end_marker))
        linear_formula = torch.cat((linear_formula.long(), end_marker))
        return problem, linear_formula

def collate_fn(data):
    """Collate (problem, linear_formula) pairs into padded batch tensors.

    Args:
        data: list of (problem, linear_formula) tensor pairs.

    Returns:
        (problems, problems_lengths, linear_formulas): both tensor batches are
        padded batch-first with the '<PAD>' index; `problems_lengths` is the
        list of unpadded problem lengths.
    """
    pad_value = vocab_word_to_index['<PAD>']
    problems, linear_formulas = zip(*data)
    # Lengths must be captured before padding erases them.
    problems_lengths = list(map(len, problems))
    padded_problems = pad_sequence(problems, batch_first=True, padding_value=pad_value)
    padded_formulas = pad_sequence(linear_formulas, batch_first=True, padding_value=pad_value)
    return padded_problems, problems_lengths, padded_formulas


In [219]:
batch_size = 32
# Shuffled mini-batches over the training split, padded by collate_fn.
train_dataset = mathDataset(train_data, vocab_word_to_index, vocab_index_to_word, wordvec)
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

In [220]:
# Two-layer bidirectional encoder over the 200-d pretrained embeddings.
encoder = LSTM_on_words(
    input_size=200,
    hidden_size=200,
    num_layers=2,
    wordvectors=wordvec,
    padding_index=vocab_word_to_index['<PAD>'],
    bidirectional=True,
).to(device)

In [238]:
# The encoder is simply LSTM_on_words; this is the matching decoder.
class Decoder_LSTM(nn.Module):
    """Greedy LSTM decoder without attention.

    Starting from <START>, each step embeds the previous token, advances the
    LSTM by one step and scores every vocabulary entry with a feed-forward
    head. The highest-scoring token becomes the next input.
    """

    def __init__(self, input_size, hidden_size, num_layers, wordvectors, padding_index, dropout=0.0):
        """
        Args:
            input_size: feature size fed to the LSTM (the embedding width).
            hidden_size: LSTM hidden size.
            num_layers: number of stacked LSTM layers.
            wordvectors: pretrained embedding matrix, one row per index.
            padding_index: index of the padding token in the embedding.
            dropout: dropout passed to nn.LSTM.
        """
        super(Decoder_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(wordvectors), padding_idx=padding_index,freeze=True).to(device)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True).to(device)
        # Output head: hidden state -> one score per vocabulary entry.
        self.fc = FeedForward(hidden_size,[len(wordvectors)//2,len(wordvectors)]).to(device)

    def forward(self, batch_size,max_len, hidden, cell):
        """Decode `max_len` steps greedily.

        Args:
            batch_size: number of sequences decoded in parallel.
            max_len: number of decoding steps to run.
            hidden: initial LSTM hidden state.
            cell: initial LSTM cell state.

        Returns:
            (outputs, (hidden, cell)): `outputs` is a list with one score
            tensor per step, each (batch, 1, vocabulary size); callers can
            join them with `torch.cat(outputs, dim=1)`. The tuple holds the
            final LSTM state.
        """
        dec_in = torch.full((batch_size, 1), vocab_word_to_index['<START>'], dtype=torch.long, device=device)
        outputs = []
        # All max_len steps are run; a leftover debugging `break` used to stop
        # the loop after the first step.
        for _ in range(max_len):
            dec_out, (hidden, cell) = self.forward_step(dec_in, hidden, cell)
            outputs.append(dec_out)
            _, ind = dec_out.topk(1)
            # Drop the trailing top-k axis and cut the graph so the chosen
            # token is a plain input to the next step.
            dec_in = ind.squeeze(-1).detach()
        return outputs, (hidden, cell)

    def forward_step(self, inputs, hidden, cell):
        """Run one decoding step: embed -> LSTM -> vocabulary scores."""
        outs = self.embedding(inputs)
        outs, (h, c) = self.lstm(outs, (hidden, cell))
        outs = self.fc(outs)
        return outs, (h, c)
        

In [239]:
# Single-layer decoder; hidden size 400 matches the concatenated forward and
# backward encoder states (2 x 200).
decoder = Decoder_LSTM(
    input_size=200,
    hidden_size=400,
    num_layers=1,
    wordvectors=wordvec,
    padding_index=vocab_word_to_index['<PAD>'],
).to(device)

In [240]:
# outs, (h, c) = decoder(batch_size,enc_out.shape[1], hidden, c_enc);

In [242]:
# Smoke test: push one batch through the encoder and the attention-free decoder.
for i, (problems, problems_lengths, linear_formulas) in enumerate(dataloader):
    problems = problems.to(device)
    linear_formulas = linear_formulas.to(device)
    enc_out, (h_enc, c_enc) = encoder(problems, problems_lengths)
    # The final states are stacked as (num_layers * 2, batch, hidden). The view
    # splits the leading axis into (layer, direction); [-1] keeps the top layer.
    hidden = h_enc.view(h_enc.shape[0]//2, 2, h_enc.shape[1], -1)[-1]
    # Concatenate both directions into one (1, batch, 2 * hidden) decoder state.
    hidden = torch.cat((hidden[0], hidden[1]), dim=-1).unsqueeze(0)
    cell = c_enc.view(c_enc.shape[0]//2, 2, c_enc.shape[1], -1)[-1]
    cell = torch.cat((cell[0], cell[1]), dim=-1).unsqueeze(0)
    # Use the real size of THIS batch: the last batch of an epoch can be
    # smaller than the global `batch_size`, which would no longer match the
    # state tensors.
    # NOTE(review): the decoding length is tied to the padded problem length;
    # confirm whether linear_formulas.shape[1] was intended instead.
    outs, (h, c) = decoder(problems.shape[0], enc_out.shape[1], hidden, cell)
    # This first decoder has no attention: it only sees the encoder's final
    # state and decodes from there.

    break

In [233]:
# Join the list of per-step decoder outputs along dim 1.
# NOTE(review): this cell's execution count (233) is lower than that of the
# decoder cell (238) and the loop cell (242), so the saved output below may
# come from an earlier version of the code - re-run to confirm.
torch.cat(outs, dim=1)

tensor([[ 0.0036, -0.0033,  0.0361,  ..., -0.0204,  0.0054,  0.0127],
        [ 0.0072,  0.0038,  0.0342,  ..., -0.0235,  0.0066,  0.0128],
        [ 0.0057, -0.0016,  0.0341,  ..., -0.0192,  0.0080,  0.0130],
        ...,
        [ 0.0045,  0.0014,  0.0304,  ..., -0.0213,  0.0028,  0.0141],
        [ 0.0064,  0.0013,  0.0310,  ..., -0.0229,  0.0049,  0.0171],
        [ 0.0073,  0.0035,  0.0326,  ..., -0.0199,  0.0030,  0.0130]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [162]:
# Scratch tensor used below to compare indexing against .view().
x=torch.rand((2,3,4))
x

tensor([[[0.6383, 0.5898, 0.7385, 0.9220],
         [0.1873, 0.0257, 0.0542, 0.7946],
         [0.6523, 0.4925, 0.2044, 0.7247]],

        [[0.9574, 0.5743, 0.0913, 0.9562],
         [0.6762, 0.9488, 0.2844, 0.8574],
         [0.0997, 0.9355, 0.6662, 0.8578]]])

In [168]:
# First feature of every row in the last block of x.
x[-1,:,0]

tensor([0.9574, 0.6762, 0.0997])

In [169]:
# .view() only regroups the same elements in their existing order; it does not
# transpose, so this yields different values than the slice x[-1,:,0].
x.view(2,4,3)[-1][0]

tensor([0.9574, 0.5743, 0.0913])