In [1]:
import torch
from torch import nn, optim
from torch.autograd import Variable as var
from torch.nn import functional as F
import torchtext.vocab as vocab
from tqdm import tqdm
from pprint import pprint
import json
import _pickle as pkl
import os
from multiprocessing import Pool, Manager
from time import time

In [2]:
with open('../data/data.json', 'r') as f:
    data = json.load(f)

def clean(token):
    cleaned_token = token.strip(".,?!-:;'()[]\"`")
    if cleaned_token[-2:] == "'s":
        cleaned_token = cleaned_token[:-2]
    if cleaned_token[-2:] == "'t":
        cleaned_token = cleaned_token[:-2]+'t'
    return cleaned_token

def tokenize(input_txt):
    return [w for w in input_txt.split(" ") if len(clean(w).strip())]
        
def token_idx_map(input_txt):
    input_seq = tokenize(input_txt)
    tok_idx_map = {"start": {}, "end": {}}
    curr_word = ""
    curr_word_idx = 0

    for i, c in enumerate(input_txt):
        if c != " ":
            curr_word += c
            try:
                input_tok = input_seq[curr_word_idx]
            except:
                input_tok = input_seq[curr_word_idx-1] #trailing spaces can cause this exception
            if curr_word == input_tok:
                s = i - len(curr_word) + 1
                e = i + 1 # since span is from [start, end)
                tok_idx_map["start"][s] = [curr_word_idx, curr_word]  # record what token starts here
                tok_idx_map["end"][e] = [curr_word_idx, curr_word] # record what token ends here
                curr_word = ""
                curr_word_idx += 1
    assert len(tok_idx_map["start"]) == len(tok_idx_map["end"])
    return tok_idx_map

def reverse_mapping(tok_idx_map):
    new_map = {}
    for k, v in tok_idx_map["start"].items():
        new_map[v[0]] = [v[1], k] # word, start
    for k, v in tok_idx_map["end"].items():
        assert k in new_map
        new_map[v[0]].append(k) # end
    return new_map

def get_answer_span(tok_idxs, input_txt):
    input_seq = tokenize(input_txt)
    s, e = tok_idxs
    return " ".join(input_seq[s:e+1])

def vectorize(input_seq, max_len):
    glove_vec = []
    for w in input_seq:
        try:
            glove_vec.append(glove.stoi[clean(w)])
        except:
            glove_vec.append(400001) # <unk> token
    if len(glove_vec)<max_len:
        padding_zeros = [400000]*(max_len-len(glove_vec)) # <pad> token
        glove_vec = padding_zeros + glove_vec
    return glove_vec[:max_len]
    
def make_data(raw_X, raw_y):
    X = []
    y = []
    for i, ((qid, c, q, a), (s, e)) in enumerate(zip(raw_X, raw_y)):
        c_tokens = tokenize(c.lower())
        try:
            tok_idx_map = token_idx_map(c)
        except:
            print(c, q, a)
        #pprint(tok_idx_map)
        #pprint(reverse_mapping(tok_idx_map))
        try:
            start_tok_idx, start_w = tok_idx_map["start"][s]
        except:
            for i in range(s, e):
                if i in tok_idx_map["start"]: # get next tok
                    start_tok_idx, start_w = tok_idx_map["start"][i]
                    break
        try:
            end_tok_idx, _ = tok_idx_map["end"][e] #only idx, not token
        except:
            for i in range(e, s, -1):
                if i in tok_idx_map["end"]: # get prev tok
                    end_tok_idx, end_w = tok_idx_map["end"][i]
                    break
        context_rep = vectorize(c_tokens, 600)
        q_tokens = tokenize(q.lower())
        ques_rep = vectorize(q_tokens, 100)
        X.append(context_rep+ques_rep)
        y.append((start_tok_idx, end_tok_idx))
    return X, y

DIM=50
glove = vocab.GloVe(name='6B', dim=DIM)

glove.stoi['<pad>'] = len(glove.stoi)+1
glove.vectors = torch.cat((glove.vectors, torch.zeros(1, DIM)))
glove.stoi['<unk>'] = len(glove.stoi)+1 # add token->index for unknown/oov
glove.vectors = torch.cat((glove.vectors, torch.ones(1, DIM)*-1)) # add index->vec for unknown/oov

print(glove.vectors.size())
VOCAB_SIZE = glove.vectors.size()[0]

torch.Size([400002, 50])


In [3]:
idx = 5
example_X = (data['X_train'][idx])
example_y = (data['y_train'][idx])
print("ID:", example_X[0])
print("Context:", example_X[1])
print("Question:", example_X[2])
print("Answer Span:", example_y)
print("Answer:", example_X[3])
X, y = make_data([example_X], [example_y])
pprint(y)
get_answer_span(y[0], example_X[1])

ID: 56ce602faab44d1400b88713
Context: A solar chimney (or thermal chimney, in this context) is a passive solar ventilation system composed of a vertical shaft connecting the interior and exterior of a building. As the chimney warms, the air inside is heated causing an updraft that pulls air through the building. Performance can be improved by using glazing and thermal mass materials in a way that mimics greenhouses.
Question: What kind of system is a solar chimney?
Answer Span: [59, 84]
Answer: passive solar ventilation
[(11, 13)]


'passive solar ventilation'

In [15]:
num_ex = 1000
X_pass, y_pass = make_data(data['X_train'][:num_ex], data['y_train'][:num_ex])

In [16]:
class ModelV1(nn.Module):
    def __init__(self, config):
        super(ModelV1, self).__init__()
        
        self.input_size = config.get("input_size", 700)
        self.hidden_size = config.get("hidden_size", 128)
        self.output_size = config.get("output_size", 600)
        self.n_layers = config.get("n_layers", 1)
        self.vocab_size = config.get("vocab", VOCAB_SIZE)
        self.emb_dim = config.get("embedding_dim", DIM)
        self.bidir = config.get("Bidirectional", True)
        self.dirs = int(self.bidir)+1
        self.lr = config.get("learning_rate", 1e-3)
        self.batch_size = config.get("batch_size", 1)
        self.epochs = config.get("epochs", 5)
        self.opt = config.get("opt", "SGD")
        self.print_every = config.get("print_every", 10)
        
        if self.opt == 'Adam':
            self.opt = optim.Adam
        else:
            self.opt = optim.SGD
        
        self.encoder = nn.Embedding(self.vocab_size, self.emb_dim)
        self.lstm = nn.LSTM(self.emb_dim, self.hidden_size, self.n_layers, bidirectional=self.bidir)
        self.decoder_start = nn.Linear(self.hidden_size, self.output_size)
        self.decoder_end = nn.Linear(self.hidden_size, self.output_size)
        self.init_weights()
    
    def init_weights(self):
        weight_scale = 0.01
        self.encoder.weight.data = glove.vectors
        self.decoder_start.bias.data.fill_(0)
        self.decoder_start.weight.data.uniform_(-weight_scale, weight_scale)
        self.decoder_end.bias.data.fill_(0)
        self.decoder_end.weight.data.uniform_(-weight_scale, weight_scale)

    def init_hidden(self, bs=None):
        if bs is None:
            bs = self.batch_size
        weight = next(self.parameters()).data
        return var(weight.new(self.n_layers*self.dirs, bs, self.hidden_size).zero_())
        
    def forward(self, inputs):
        if len(inputs)==1:
            inputs = var(torch.LongTensor(inputs[0]))
        else:
            inputs = var(torch.LongTensor(inputs))
        if len(inputs.size())<2:
            inputs = inputs.unsqueeze(0)
        embeds = self.encoder(inputs).permute(1, 0, 2) # get glove repr
        # print("embeds:", embeds.size())
        seq_len = embeds.size()[0]
        lstm_op, self.hidden = self.lstm(embeds, self.hidden)
        # print("lstm op:", lstm_op.size()) # (seq_len, bs, hidden_size*(dirs=2 for bi))
        lstm_op = lstm_op.permute(1, 0, 2) # (seq_len, bs, hdim)->(bs, seq_len, hdim)
        
        end_pred = lstm_op[:, -1, :self.hidden_size] # forward direction
        start_pred = lstm_op[:, -1, self.hidden_size:] # reverse direction
        
        # print("lstm start, end preds:", start_pred.size(), end_pred.size())
        out_start = F.log_softmax(self.decoder_start(start_pred), dim=-1)
        out_end = F.log_softmax(self.decoder_end(end_pred), dim=-1)
        # print("outs:", out_start.size(), out_end.size())
        out = torch.cat((out_start, out_end), 1)
        # print("out:", out.size())
        return out

    def fit(self, X, y):
        opt = self.opt(self.parameters(), self.lr)
        losses = [] # epoch loss
        for epoch in range(self.epochs):
            print("epoch:", epoch)
            bs = self.batch_size
            bloss = 0.0 # batch loss
            for bindex,  i in enumerate(range(0, len(y)-bs+1, bs)):
                #print("batch:", bindex)                    
                h, c = self.init_hidden(), self.init_hidden()
                self.hidden = (h, c)
                # print(h.size(), c.size())
                opt.zero_grad()
                Xb = X[i:i+bs]
                Xb = torch.LongTensor(Xb)
                # print("Xb:", Xb.size())
                yb = var(torch.LongTensor(y[i:i+bs]))
                # print("yb:", yb.size())
                pred = self.forward(Xb) #prediction on batch features
            
                loss = F.nll_loss(pred[:, :self.output_size], yb[:, 0]) \
                     + F.nll_loss(pred[:, self.output_size:], yb[:, 1]) 
                if bindex*(epoch+1)%self.print_every==0:
                    print("loss (batch):", loss.data[0])
                bloss += loss.data[0]/bs
                # print(bloss)
                loss.backward()
                opt.step()
            losses.append(bloss)
            print("loss (epoch):", losses[-1], end=', change: ')
            if len(losses)>1:
                diff = losses[-2]-losses[-1]
                rel_diff = diff/losses[-2]
                print("%s"%rel_diff, "%")
            else:
                print("00.0%")
        return losses

    def predict(self, X, bs=None):
        # self.hidden = (self.init_hidden(bs), self.init_hidden(bs))
        result = self.forward(X)
        return self.get_span_indices(result)
    
    def get_span_indices(self, preds):
        s_pred = preds[:, :self.output_size]
        e_pred = preds[:, self.output_size:]
        _,  s_index = torch.max(s_pred, -1)
        _,  e_index = torch.max(e_pred, -1)
        return torch.cat((s_index.unsqueeze(1), e_index.unsqueeze(1)), -1)

In [19]:
conf = {"learning_rate": 0.4, 
        "epochs": 5,
        "hidden_size": 64,
       "batch_size": 50,
       "opt": "Adam",
        "n_layers": 1
        }
model = ModelV1(conf)
print(model, model.lr, model.hidden_size, model.batch_size, model.opt)
model_name = "BiLSTM_%s_%s_%s_%s"%(num_ex, model.epochs, model.hidden_size, model.n_layers)
print(model_name)

ModelV1(
  (encoder): Embedding(400002, 50)
  (lstm): LSTM(50, 64, bidirectional=True)
  (decoder_start): Linear(in_features=64, out_features=600, bias=True)
  (decoder_end): Linear(in_features=64, out_features=600, bias=True)
) 0.4 64 50 <class 'torch.optim.adam.Adam'>
BiLSTM_1000_5_64_1


In [20]:
tic = time()
res = model.fit(X_pass, y_pass)
toc = time()
print("took", toc-tic, "seconds")
torch.save(model, '../evaluation/models/%s'%model_name)

epoch: 0
loss (batch): 12.792482376098633


KeyboardInterrupt: 

In [12]:
model = torch.load('../evaluation/models/%s'%model_name)

In [13]:
X_test_data, y_test_data = make_data(data['X_test'][:100], data['y_test'][:100])

In [14]:
dev_results = {}
for i, (x, y) in enumerate(zip(X_test_data[:1000], y_test_data[:1000])):
    qid = data['X_test'][i][0]
    c = data['X_test'][i][1]
    q = data['X_test'][i][2]
    #print(c)
    #print(q)
    a = data['X_test'][i][3]
    #print(a)
    res = model.predict([x], bs=1).data.tolist()[0]
    #print("Predicted span:", res)
    #if res[0]>res[1]:
    #    res[0], res[1] = res[1], res[0]
        #print("switched to:", res)
    #print("Predicted Answer:", c[res[0]:res[1]])
    #print("Actual:", a)
    print(res)
    ans = get_answer_span(res, c)
    #print("-"*50)
    #print(ans)
    dev_results[qid] = ans
    print("="*50)
with open('../data/run_500data_5epochs.json', 'w') as f:
    json.dump(dev_results, f)

RuntimeError: Expected hidden[0] size (2, 1, 64), got (2, 50, 64)

In [None]:
import matplotlib.pyplot as plt
plt.plot(list(range(len(res))), res)
plt.show()