#### An attempt to create BILSTM model which could be used for point wise and pairwise ranking.

In [1]:
from __future__ import print_function

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

import data_loader as dl
import os,json,pickle,time
import numpy as np

device = torch.device("cuda")

Using TensorFlow backend.


In [2]:
DEBUG = True

# if DEBUG:
#     print 'batch size is:', str(BATCH_SIZE), ' max length is:', str(MAX_LENGTH),' hidden size is:', str(HIDDEN_SIZE)

In [3]:
# LSTM encoder.
class Encoder(nn.Module):
    
    def __init__(self,max_length,hidden_dim,number_of_layer,embedding_dim,vocab_size,bidirectional,vectors=None):
        super(Encoder,self).__init__()
        
        self.max_length,self.hidden_dim,self.embedding_dim,self.vocab_size = max_length,hidden_dim,embedding_dim,vocab_size
        self.number_of_layer = number_of_layer
        self.bidirectional = bidirectional
        
        
        if vectors is not None:
            self.embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(vectors))
            self.embedding_layer.weight.requires_grad = True
        else:
            # Embedding layer
            self.embedding_layer = nn.Embedding(self.vocab_size,self.embedding_dim)
        
        
        # LSTM layer
        self.lstm = nn.LSTM(self.embedding_dim,self.hidden_dim,self.number_of_layer,bidirectional = self.bidirectional)
        
    
    def init_hidden(self,batch_size,device):
        # Return a new hidden layer variable for LSTM
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        if not self.bidirectional:
            return (torch.zeros((self.number_of_layer,batch_size,self.hidden_dim),device=device),
                    torch.zeros((self.number_of_layer,batch_size,self.hidden_dim),device=device))
        else:
            return (torch.zeros((2*self.number_of_layer,batch_size,self.hidden_dim),device=device),
                    torch.zeros((2*self.number_of_layer,batch_size,self.hidden_dim),device=device))
    
    def forward(self,x,h):
        # x is the input and h is the hidden state.
        batch_size = x.shape[0]
        
        if DEBUG: print ("input/x shape is :", x.shape)
        if DEBUG: print ("hidden state shape is :", h[0].shape)
        
        x_embedded = self.embedding_layer(x)
        if DEBUG: print ("x_embedded transpose shape is :", x_embedded.transpose(1,0).shape)
            
#         output,h = self.lstm(x_embedded.view(-1,self.batch_size,self.embedding_dim),h)
        output,h = self.lstm(x_embedded.transpose(1,0),h)
        if DEBUG: print ("output shape is ",output.shape)
        if DEBUG: print ("h[0] shape is ", h[0].shape,"h[1] shape is ", h[1].shape)
        
        return output,h



In [4]:

MAX_LENGTH = 25
HIDDEN_SIZE = 10
BATCH_SIZE = 15
VOCAB_SIZE = 15000
EMBEDDING_DIM = 3

with torch.no_grad():
    print("testing encoder")
    dummy_input = torch.randint(0,VOCAB_SIZE-1,(MAX_LENGTH*BATCH_SIZE,),device=device).view(BATCH_SIZE,MAX_LENGTH).long()
    dummy_path = torch.randint(0,VOCAB_SIZE-1,(MAX_LENGTH*BATCH_SIZE,),device=device).view(BATCH_SIZE,MAX_LENGTH).long()
    encode = Encoder(MAX_LENGTH,HIDDEN_SIZE,5,EMBEDDING_DIM,VOCAB_SIZE,bidirectional=True).cuda(device)
    
    di_encoded,di_hidden = encode(dummy_input,encode.init_hidden(BATCH_SIZE,device))
    dp_encoded,dp_hidden = encode(dummy_path,encode.init_hidden(BATCH_SIZE,device))
    #di_encoded axes semantics = [MAX_LENGTH,BATCH_SIZE,2*Hidden_DIM]
    #reshaping to [BATCH_SIZE,MAX_LENGTH,2*HIDDEN_DIM]
    dot_product = torch.sum( di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:]*dp_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:],-1)
    print(dot_product.shape)
#     dot = torch.bmm(di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1),dp_encoded.view(BATCH_SIZE,MAX_LENGTH,-1))
#     print F.cosine_similarity(di_encoded,dp_encoded).shape
#     print torch.sum(torch.sum(dp_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)*di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1), -1),-1).shape


testing encoder
input/x shape is : torch.Size([15, 25])
hidden state shape is : torch.Size([10, 15, 10])
x_embedded transpose shape is : torch.Size([25, 15, 3])
output shape is  torch.Size([25, 15, 20])
h[0] shape is  torch.Size([10, 15, 10]) h[1] shape is  torch.Size([10, 15, 10])
input/x shape is : torch.Size([15, 25])
hidden state shape is : torch.Size([10, 15, 10])
x_embedded transpose shape is : torch.Size([25, 15, 3])
output shape is  torch.Size([25, 15, 20])
h[0] shape is  torch.Size([10, 15, 10]) h[1] shape is  torch.Size([10, 15, 10])
torch.Size([15])


In [5]:
MAX_LENGTH = 2
BATCH_SIZE = 3
HIDDEN_SIZE = 4

di_encoded = torch.tensor(np.arange(24).reshape((MAX_LENGTH, BATCH_SIZE, HIDDEN_SIZE)), dtype=torch.float)
dp_encoded = torch.randint(0,2,(MAX_LENGTH, BATCH_SIZE, HIDDEN_SIZE))


print(di_encoded, '\n', dp_encoded)
#BS,MA,HI

# print("dp_encoded.transpose(1,0) shape is \n ", dp_encoded.transpose(1,0)* di_encoded.transpose(1,0))
# a = dp_encoded.transpose(1,0)* di_encoded.transpose(1,0)
# b = 

print(di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1).shape)
# dot_product_1 = torch.sum( di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:]*dp_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:],-1)
# dot_product_2 = torch.(di_encoded.transpose(1,0)[:,-1,:]*dp_encoded.transpose(1,0)[:,-1,:],-1)
dot_product = torch.sum(torch.sum(dp_encoded.transpose(1,0)*di_encoded.transpose(1,0),1),1)
# print(dot_product_1)
print(dot_product)

tensor([[[  0.,   1.,   2.,   3.],
         [  4.,   5.,   6.,   7.],
         [  8.,   9.,  10.,  11.]],

        [[ 12.,  13.,  14.,  15.],
         [ 16.,  17.,  18.,  19.],
         [ 20.,  21.,  22.,  23.]]]) 
 tensor([[[ 1.,  0.,  0.,  0.],
         [ 1.,  0.,  1.,  1.],
         [ 0.,  1.,  0.,  0.]],

        [[ 0.,  0.,  1.,  0.],
         [ 1.,  0.,  0.,  0.],
         [ 0.,  1.,  0.,  0.]]])
torch.Size([3, 2, 4])
tensor([ 14.,  33.,  30.])


In [6]:
def load_relation():
    """
        Function used once to load the relations dictionary
        (which keeps the log of IDified relations, their uri and other things.)

    :param relation_file: str
    :return: dict
    """

    relations = pickle.load(open(os.path.join(COMMON_DATA_DIR, 'relations.pickle')))
    inverse_relations = {}
    for key in relations:
        value = relations[key]
        new_key = value[0]
        value[0] = key
        inverse_relations[new_key] = value

    return inverse_relations

In [7]:
COMMON_DATA_DIR = 'data/data/common'
_relations = load_relation()
_dataset = 'lcquad'
_dataset_specific_data_dir = 'data/data/lcquad/'
_model_specific_data_dir = 'data/data/core_chain_pairwise/lcquad/'
_file = 'id_big_data.json'
_max_sequence_length = 25
# _max_sequence_length = 15
_neg_paths_per_epoch_train = 5
_neg_paths_per_epoch_validation = 10
_training_split = .7
_validation_split = .8
_index = None

In [8]:
# _a = dl.load_data(_dataset, _dataset_specific_data_dir, _model_specific_data_dir, _file, _max_sequence_length,
#               _neg_paths_per_epoch_train,
#               _neg_paths_per_epoch_validation, _relations,
#               _index, _training_split, _validation_split, _model='core_chain_pairwise',_pairwise=True, _debug=True)
# train_questions, train_pos_paths, train_neg_paths, dummy_y_train, valid_questions, \
#                valid_pos_paths, valid_neg_paths, dummy_y_valid, test_questions, test_pos_paths, test_neg_paths = _a

# #Model specific paramters
# parameter_dict = {}
# parameter_dict['max_length'] = 25
# parameter_dict['hidden_size'] = 15
# parameter_dict['number_of_layer'] = 10
# parameter_dict['embedding_dim'] = 30
# parameter_dict['vocab_size'] = 15000
# parameter_dict['batch_size'] = 500
# parameter_dict['bidirectional'] = True
# parameter_dict['_neg_paths_per_epoch_train'] = 10
# parameter_dict['_neg_paths_per_epoch_validation'] = 1000
# parameter_dict['total_negative_samples'] = 1000
# parameter_dict['epochs'] = 2

# td = dl.TrainingDataGenerator(train_questions, train_pos_paths, train_neg_paths,
#                                                   _max_sequence_length, _neg_paths_per_epoch_train, BATCH_SIZE,10)
# trainLoader = DataLoader(td)

# vd = dl.ValidationDataset(train_questions, train_pos_paths, train_neg_paths,
#                                       parameter_dict['max_length'],
#                                       parameter_dict['_neg_paths_per_epoch_train'], parameter_dict['batch_size']
#                                       , parameter_dict['total_negative_samples'])
# validationLoader = DataLoader(vd)

# question_batch = ''
# pos_paths_batch = ''
# neg_paths_batch = ''
# for i_batch, sample_batched in enumerate(validationLoader):
#     question_batch =  sample_batched[0][0]
#     question_batch = np.reshape(question_batch,(-1,_max_sequence_length))
#     pos_paths_batch = np.reshape(sample_batched[0][1],(-1,_max_sequence_length))
#     neg_paths_batch = np.reshape(sample_batched[0][2],(-1,_max_sequence_length))
#     print pos_paths_batch.shape
#     print neg_paths_batch.shape
#     print question_batch.shape
#     break

# def validation_precision(questions,positive_paths,negative_paths):
#     #Assumption is that the number of negative paths are fixed.
#     for i in range(len(questions)):
#         question = np.repeat(question[i],len(negative_paths[i]))
#         paths = np.hstack((positive_paths[i],negative_paths[i]))
        
#     return 0

# question_batch = ''
# pos_paths_batch = ''
# neg_paths_batch = ''
# for i_batch, sample_batched in enumerate(trainLoader):
#     question_batch =  sample_batched[0][0]
#     question_batch = np.reshape(question_batch,(-1,_max_sequence_length))
#     pos_paths_batch = np.reshape(sample_batched[0][1],(-1,_max_sequence_length))
#     neg_paths_batch = np.reshape(sample_batched[0][2],(-1,_max_sequence_length))
#     break
# #     if 

# '''
#     arr_0 -> batch question
#     arr_1 -> batch_pos_path 
#     arr_2 -> batch_neg_paths
# '''
# question_batch = ''
# pos_paths_batch = ''
# neg_paths_batch = ''
# for i_batch, sample_batched in enumerate(trainLoader):
#     question_batch =  sample_batched[0][0]
#     question_batch = np.reshape(question_batch,(-1,_max_sequence_length))
#     pos_paths_batch = np.reshape(sample_batched[0][1],(-1,_max_sequence_length))
#     neg_paths_batch = np.reshape(sample_batched[0][2],(-1,_max_sequence_length))
#     break
# #     if i_batch == 1:
# #         print question_batch
# # #     print(i_batch, sample_batched['batch_questions'].size(),
# # #           sample_batched['batch_pos_paths'].size())

In [9]:
# with torch.no_grad():
#     print "testing encoder"
#     dummy_input = torch.randint(0,VOCAB_SIZE-1,(MAX_LENGTH*BATCH_SIZE,)).view(BATCH_SIZE,MAX_LENGTH).long()
#     dummy_path = torch.randint(0,VOCAB_SIZE-1,(MAX_LENGTH*BATCH_SIZE,)).view(BATCH_SIZE,MAX_LENGTH).long()
#     encode = Encoder(MAX_LENGTH,HIDDEN_SIZE,5,EMBEDDING_DIM,VOCAB_SIZE,BATCH_SIZE,bidirectional=True)
    
#     di_encoded,di_hidden = encode(dummy_input,encode.init_hidden())
#     dp_encoded,dp_hidden = encode(dummy_path,encode.init_hidden())
#     #di_encoded axes semantics = [MAX_LENGTH,BATCH_SIZE,2*Hidden_DIM]
#     #reshaping to [BATCH_SIZE,MAX_LENGTH,2*HIDDEN_DIM]
#     dot_product = torch.sum( di_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:]*dp_encoded.view(BATCH_SIZE,MAX_LENGTH,-1)[:,-1,:],-1)
#     print dot_product.shape

In [10]:
def validation_accuracy(valid_questions,valid_pos_paths,valid_neg_paths,model):
    precision = []
    with torch.no_grad():
        for i in range(len(valid_questions)):
            question = np.repeat(valid_questions[i].reshape(1,-1),len(valid_neg_paths[i])+1,axis=0) # +1 for positive path
            paths = np.vstack((valid_pos_paths[i].reshape(1,-1),valid_neg_paths[i]))
            
            hidden = model.init_hidden(question.shape[0],device)
            question = torch.tensor(question,dtype=torch.long,device = device)
            paths = torch.tensor(paths,dtype=torch.long,device=device)
            
            question,_ = model(question.long(),hidden)
            paths,_ = model(paths.long(),hidden)
            score = torch.sum(torch.sum(question.transpose(1,0)*paths.transpose(1,0),1),1)
            arg_max = torch.argmax(score)
            if arg_max.item() == 0:# 0 is the positive path index
                precision.append(1)
            else:
                precision.append(0)
    return sum(precision)*1.0/len(precision)

In [11]:
# Function to save the model
def save_model(loc, model, model_name = 'model.torch' ,epochs=0, optimizer=None,accuracy = 0):
    """
        Input:
            loc: str of the folder where the models are to be saved - data/models/core_chain/cnn_dense_dense/lcquad/5'
            models: dict of 'model_name': model_object
            epochs, optimizers are int, torch.optims (discarded right now).
    """
    
    state = {
        'epoch':epochs,
        'optimizer':optimizer.state_dict(),
        'state_dict':model.state_dict(),
        'accuracy':accuracy
    }
    loc = loc + '/' + model_name
    print("model with accuracy ", accuracy,  "stored at", loc)
    torch.save(state, loc)

In [12]:
def save_location(problem,model_name,dataset):
    '''
            Location - data/models/problem/model_name/dataset/0X
            problem - core_chain
                    -intent
                    -rdf
                    -type_existence
            model_name - cnn_dense_dense ; pointwise_cnn_dense_dense ....
            
            dataset - 
            return a dir data/models/problem/model_name/dataset/0X
    '''
    #Check if the path exists or not. If not create one. 
    assert(problem in ['core_chain','intent','rdf','type_existence'])
    assert(dataset in ['qald','lcquad','transfer-a','transfer-b','transfer-c'])
    
    path = 'data/models/' + str(problem) + '/' + str(model_name) + '/' + str(dataset)
    if not os.path.exists(path):
        os.makedirs(path)
    path
    dir_name =  [int(name) for name in os.listdir(path+'/') ]
    if not dir_name:
        new_path_dir = path + '/' + str(0)
        os.mkdir(new_path_dir)
    else:
        dir_name = max(dir_name)
        new_model = dir_name+1
        new_path_dir = path + '/' + str(new_model)
        os.mkdir(new_path_dir)
    return new_path_dir

In [17]:
def train_bilstm_dot(ques_batch,pos_batch,neg_batch,dummy_y,model,optimizer,loss_fn,batch_size,max_sequence_length,device):
    '''
        :params ques_batch: batch of question
        :params pos_batch: batch of corresponding positive paths
        :params neg_batch: batch of corresponding negative paths
        :params dummy_y:a batch of ones (same length as that of batch)
    '''
    
    hidden = model.init_hidden(ques_batch.shape[0],device)
    print ("pos batch is " , pos_batch[:10])
    print ("neg batch is ", neg_batch[:10])
    ques_batch, _ = model(ques_batch.long(),hidden)
    pos_batch, _ = model(pos_batch.long(),hidden)
    neg_batch, _ = model(neg_batch.long(),hidden)
    
#     print("ques batch shape after encoding is ", ques_batch.shape)
#     print("neg batch shape after encoding is ", neg_batch.shape)
#     print("transformed ques batch shape after encoding is ",  ques_batch.view(batch_size,max_sequence_length,-1).shape)
    #torch.sum(torch.sum(ques_batch.transpose(1,0)*ques_batch.transpose(1,0),1),1)
    pos_scores = torch.sum(torch.sum(ques_batch.transpose(1,0)*pos_batch.transpose(1,0),1),1)
    neg_scores = torch.sum(torch.sum(ques_batch.transpose(1,0)*neg_batch.transpose(1,0),1),1)
    print(pos_scores[:10])
    print(neg_scores[:10])
    '''
        If `y == 1` then it assumed the first input should be ranked higher
        (have a larger value) than the second input, and vice-versa for `y == -1`
    '''
    loss = loss_fn(pos_scores,neg_scores,dummy_y)
    loss.backward()
    optimizer.step()
    return loss

In [18]:
def training_loop(training_model,parameter_dict,data,dataset,device, problem = 'core_chain'):
    if training_model == 'bilstm_dot':
        # Find a location where model would be stored.
        model_save_location = save_location(problem,training_model,dataset)
        # model_save_locations - data/models/core_chain/cnn_dense_dense/lcquad/5
        
        #Model instantiation 
        encoder = Encoder(parameter_dict['max_length'],parameter_dict['hidden_size']
                         ,parameter_dict['number_of_layer'],parameter_dict['embedding_dim'],
                         parameter_dict['vocab_size'],
                         bidirectional=parameter_dict['bidirectional'],
                         vectors = data['vectors']).cuda(device)
        
        
        #Loading training data 
        td = dl.TrainingDataGenerator(data['train_questions'], data['train_pos_paths'], data['train_neg_paths'],
                                                  parameter_dict['max_length'], 
                                      parameter_dict['_neg_paths_per_epoch_train'], parameter_dict['batch_size']
                                      ,parameter_dict['total_negative_samples'])
        trainLoader = DataLoader(td)
        
        #dummy_y needed for calcualting loss
        dummy_y = torch.ones(parameter_dict['batch_size'],device=device)
        optimizer = optim.Adam(list(encoder.parameters()))
        max_margin_loss = nn.MarginRankingLoss(margin=1)
        train_loss = []
        test_loss = []
        valid_accuracy = []
        test_accuracy = []
        best_accuracy = 0
        
        for epoch in range(parameter_dict['epochs']):
            print("Epoch: ", epoch, "/", parameter_dict['epochs'])
            epoch_loss = []
            epoch_time = time.time()
            
            for i_batch, sample_batched in enumerate(trainLoader):
                batch_time = time.time()
#                 torch.tensor(train_P[sample_index], dtype=torch.long, device=device)
                ques_batch = torch.tensor(np.reshape(sample_batched[0][0],(-1,parameter_dict['max_length'])), dtype=torch.long, device=device)
                pos_batch =  torch.tensor(np.reshape(sample_batched[0][1],(-1,parameter_dict['max_length'])), dtype=torch.long, device=device)
                neg_batch =  torch.tensor(np.reshape(sample_batched[0][2],(-1,parameter_dict['max_length'])), dtype=torch.long, device=device)
                
                loss = train_bilstm_dot(ques_batch = ques_batch,
                                 pos_batch = pos_batch,
                                 neg_batch = neg_batch,
                                 dummy_y= dummy_y,
                                 model = encoder,
                                 optimizer=optimizer,
                                 loss_fn=max_margin_loss,
                                 batch_size=parameter_dict['batch_size'],
                                 max_sequence_length=parameter_dict['max_length'],
                                       device = device)
                epoch_loss.append(loss.item())
#                 print sum(epoch_loss,"  ",)
#                 print(i_batch)
                print("Batch:\t%d" % i_batch,"/%d\t: " % (i_batch/parameter_dict['batch_size']),
                       "%s" % (time.time() - batch_time),
                       "\t%s" % (time.time() - epoch_time),
                     "\t%s" % (str(loss.item())),
                      end=None if i_batch+1 == int(int(i_batch)/parameter_dict['batch_size']) else "\r")
            print("Time taken in epoch: %s" % (time.time() - epoch_time))
            print("Training loss is : %s" % (sum(epoch_loss)))
            
            train_loss.append(sum(epoch_loss))
            
            if epoch%5 == 0:
                #Calculating validation accuracy 
                valid_accuracy.append(validation_accuracy(data['valid_questions'],data['valid_pos_paths'],
                                                      data['valid_neg_paths'],encoder))
                #Calculating test accuracy  
                test_accuracy.append(validation_accuracy(data['test_questions'],data['test_pos_paths'],
                                                      data['test_neg_paths'],encoder))
                
                print("Validation accuracy is %s" % (valid_accuracy[-1]))
                print("Test accuracy is %s" % (test_accuracy[-1]))
                if valid_accuracy[-1] >= best_accuracy:
                    best_accuracy = valid_accuracy[-1]
                    save_model(model_save_location, encoder, model_name = 'encoder.torch' 
                               ,epochs=epoch, optimizer=optimizer ,accuracy = best_accuracy)
                    
                    
        return train_loss,encoder, valid_accuracy, test_accuracy
                
                               

In [15]:
#Model specific paramters
parameter_dict = {}
parameter_dict['max_length'] = 25
parameter_dict['hidden_size'] = 15
parameter_dict['number_of_layer'] = 128
parameter_dict['embedding_dim'] = 300
parameter_dict['vocab_size'] = 15000
parameter_dict['batch_size'] = 500
parameter_dict['bidirectional'] = True
parameter_dict['_neg_paths_per_epoch_train'] = 10
parameter_dict['_neg_paths_per_epoch_validation'] = 1000
parameter_dict['total_negative_samples'] = 1000
parameter_dict['epochs'] = 100


#Data loading specific parameters
COMMON_DATA_DIR = 'data/data/common'
_relations = load_relation()
_dataset = 'lcquad'
_dataset_specific_data_dir = 'data/data/lcquad/'
_model_specific_data_dir = 'data/data/core_chain_pairwise/lcquad/'
_file = 'id_big_data.json'
_max_sequence_length = parameter_dict['max_length']
# _max_sequence_length = 15
_neg_paths_per_epoch_train = parameter_dict['_neg_paths_per_epoch_train']
_neg_paths_per_epoch_validation = parameter_dict['_neg_paths_per_epoch_validation']
_training_split = .7
_validation_split = .8
_index = None



_a = dl.load_data(_dataset, _dataset_specific_data_dir, _model_specific_data_dir, _file, _max_sequence_length,
              _neg_paths_per_epoch_train,  
              _neg_paths_per_epoch_validation, _relations,
              _index, _training_split, _validation_split, _model='core_chain_pairwise',_pairwise=True, _debug=True)
train_questions, train_pos_paths, train_neg_paths, dummy_y_train, valid_questions, \
               valid_pos_paths, valid_neg_paths, dummy_y_valid, test_questions, test_pos_paths, test_neg_paths,vectors = _a



(3500, 25)
(3500, 25)
(3500, 1000, 25)
(500, 25)
(500, 25)
(500, 1000, 25)


In [19]:
DEBUG = False
data = {}
data['train_questions'] = train_questions
data['train_pos_paths'] = train_pos_paths
data['train_neg_paths'] = train_neg_paths
data['valid_questions'] = valid_questions
data['valid_pos_paths'] = valid_pos_paths
data['valid_neg_paths'] = valid_neg_paths
data['test_pos_paths'] = test_pos_paths
data['test_neg_paths'] = test_neg_paths
data['test_questions'] = test_questions
data['vectors'] = vectors

#training_model,parameter_dict,data,dataset,device, problem = 'core_chain'
training_loss,validation_accuracy,test_accuracy ,encoder = training_loop('bilstm_dot',parameter_dict,data,dataset='lcquad',device=device)

(3500, 25)
Epoch:  0 / 100
pos batch is  tensor([[    3,    89,   240,     2,  3547,  1212,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   207,     2,  2882,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1739,     2,   683,  1881,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1238,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   193,     9,  1518,     2,   649,   193,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
   

tensor([ 8.9009,  8.9009,  8.9009,  8.9009,  8.9009,  8.9009,  8.9009,
         8.9009,  8.9009,  8.9009], device='cuda:0')
tensor([ 8.9009,  8.9009,  8.9009,  8.9009,  8.9009,  8.9009,  8.9009,
         8.9009,  8.9009,  8.9009], device='cuda:0')
pos batch is  tensor([[    2,  5783,     3,   193,     9,  1518,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1238,     2,  2885,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1271,   193,     2,  1832,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  1503,   179,     3,  3109,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.8867,  8.8867,  8.8867,  8.8867,  8.8867,  8.8867,  8.8867,
         8.8867,  8.8867,  8.8867], device='cuda:0')
tensor([ 8.8867,  8.8867,  8.8867,  8.8867,  8.8867,  8.8867,  8.8867,
         8.8867,  8.8867,  8.8867], device='cuda:0')
pos batch is  tensor([[    3,  4412,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   521,     3,  1520,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1617,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1276,     2,  1276,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.8593,  8.8593,  8.8593,  8.8593,  8.8593,  8.8593,  8.8593,
         8.8593,  8.8593,  8.8593], device='cuda:0')
tensor([ 8.8593,  8.8593,  8.8593,  8.8593,  8.8593,  8.8593,  8.8593,
         8.8593,  8.8593,  8.8593], device='cuda:0')
pos batch is  tensor([[     2,   2895,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,  12263,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   2227,      3,   2227,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,  11607,   

tensor([ 8.8255,  8.8255,  8.8255,  8.8255,  8.8255,  8.8255,  8.8255,
         8.8255,  8.8255,  8.8255], device='cuda:0')
tensor([ 8.8255,  8.8255,  8.8255,  8.8255,  8.8255,  8.8255,  8.8255,
         8.8255,  8.8255,  8.8255], device='cuda:0')
pos batch is  tensor([[     2,     89,   5254,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    521,      2,    527,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   4144,      2,    649,    193,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   1822,   

tensor([ 8.7875,  8.7875,  8.7875,  8.7875,  8.7875,  8.7875,  8.7875,
         8.7875,  8.7875,  8.7875], device='cuda:0')
tensor([ 8.7875,  8.7875,  8.7875,  8.7875,  8.7875,  8.7875,  8.7875,
         8.7875,  8.7875,  8.7875], device='cuda:0')
pos batch is  tensor([[     3,   7630,      2,   3545,    177,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    649,    193,      3,    202,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,  11710,      3,   1305,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   3235,   

tensor([ 8.7474,  8.7474,  8.7474,  8.7474,  8.7474,  8.7474,  8.7474,
         8.7474,  8.7474,  8.7474], device='cuda:0')
tensor([ 8.7474,  8.7474,  8.7474,  8.7474,  8.7474,  8.7474,  8.7474,
         8.7474,  8.7474,  8.7474], device='cuda:0')
pos batch is  tensor([[     3,    174,      9,   2695,      2,   1454,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,  11710,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   3489,    119,      2,   1216,   1329,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    169,   

tensor([ 8.7096,  8.7096,  8.7096,  8.7096,  8.7096,  8.7096,  8.7096,
         8.7096,  8.7096,  8.7096], device='cuda:0')
tensor([ 8.7096,  8.7096,  8.7096,  8.7096,  8.7096,  8.7096,  8.7096,
         8.7096,  8.7096,  8.7096], device='cuda:0')
pos batch is  tensor([[    2,   725,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  2713,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  3774,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  1514,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.6812,  8.6812,  8.6812,  8.6812,  8.6812,  8.6812,  8.6812,
         8.6812,  8.6812,  8.6812], device='cuda:0')
tensor([ 8.6812,  8.6812,  8.6812,  8.6812,  8.6812,  8.6812,  8.6812,
         8.6812,  8.6812,  8.6812], device='cuda:0')
pos batch is  tensor([[    2,   849,     3,  1305,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  1503,   179,     2,  2404,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  2287,  2323,   666,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  2652,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.6537,  8.6537,  8.6537,  8.6537,  8.6537,  8.6537,  8.6537,
         8.6537,  8.6537,  8.6537], device='cuda:0')
tensor([ 8.6537,  8.6537,  8.6537,  8.6537,  8.6537,  8.6537,  8.6537,
         8.6537,  8.6537,  8.6537], device='cuda:0')
pos batch is  tensor([[    2,   725,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   477,  1050,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  3769,     2,  2141,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   649,   193,     2,   549,    13,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.6238,  8.6238,  8.6238,  8.6238,  8.6238,  8.6238,  8.6238,
         8.6238,  8.6238,  8.6238], device='cuda:0')
tensor([ 8.6238,  8.6238,  8.6238,  8.6238,  8.6238,  8.6238,  8.6238,
         8.6238,  8.6238,  8.6238], device='cuda:0')
pos batch is  tensor([[     3,   1550,      2,   8536,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    354,      2,  11388,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   3181,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   1806,   

tensor([ 8.5876,  8.5876,  8.5876,  8.5876,  8.5876,  8.5876,  8.5876,
         8.5876,  8.5876,  8.5876], device='cuda:0')
tensor([ 8.5876,  8.5876,  8.5876,  8.5876,  8.5876,  8.5876,  8.5876,
         8.5876,  8.5876,  8.5876], device='cuda:0')
pos batch is  tensor([[     2,    527,      3,   6269,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   6112,      2,    649,    735,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    804,      3,    517,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,  11710,   

tensor([ 8.5470,  8.5470,  8.5470,  8.5470,  8.5470,  8.5470,  8.5470,
         8.5470,  8.5470,  8.5470], device='cuda:0')
tensor([ 8.5470,  8.5470,  8.5470,  8.5470,  8.5470,  8.5470,  8.5470,
         8.5470,  8.5470,  8.5470], device='cuda:0')
pos batch is  tensor([[     3,   7583,      2,  12941,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   2227,      3,    336,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    207,      2,   2882,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   6200,   

tensor([ 8.5013,  8.5013,  8.5013,  8.5013,  8.5013,  8.5013,  8.5013,
         8.5013,  8.5013,  8.5013], device='cuda:0')
tensor([ 8.5013,  8.5013,  8.5013,  8.5013,  8.5013,  8.5013,  8.5013,
         8.5013,  8.5013,  8.5013], device='cuda:0')
pos batch is  tensor([[     2,   5902,      2,   2676,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   1190,   1894,   1117,      2,   1190,   1894,   1117,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   6907,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,  10379,   

tensor([ 8.4533,  8.4533,  8.4533,  8.4533,  8.4533,  8.4533,  8.4533,
         8.4533,  8.4533,  8.4533], device='cuda:0')
tensor([ 8.4533,  8.4533,  8.4533,  8.4533,  8.4533,  8.4533,  8.4533,
         8.4533,  8.4533,  8.4533], device='cuda:0')
pos batch is  tensor([[     3,    174,      9,   2695,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   1819,      3,   1819,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   3109,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   3179,   

tensor([ 8.4082,  8.4082,  8.4082,  8.4082,  8.4082,  8.4082,  8.4082,
         8.4082,  8.4082,  8.4082], device='cuda:0')
tensor([ 8.4082,  8.4082,  8.4082,  8.4082,  8.4082,  8.4082,  8.4082,
         8.4082,  8.4082,  8.4082], device='cuda:0')
pos batch is  tensor([[    3,  6137,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  2819,   245,     2,  2819,   245,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  2608,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  5163,   207,     3,   517,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.3651,  8.3651,  8.3651,  8.3651,  8.3651,  8.3651,  8.3651,
         8.3651,  8.3651,  8.3651], device='cuda:0')
tensor([ 8.3651,  8.3651,  8.3651,  8.3651,  8.3651,  8.3651,  8.3651,
         8.3651,  8.3651,  8.3651], device='cuda:0')
pos batch is  tensor([[    3,  1739,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  2885,     2,  2885,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  3441,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1141,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.3253,  8.3253,  8.3253,  8.3253,  8.3253,  8.3253,  8.3253,
         8.3253,  8.3253,  8.3253], device='cuda:0')
tensor([ 8.3253,  8.3253,  8.3253,  8.3253,  8.3253,  8.3253,  8.3253,
         8.3253,  8.3253,  8.3253], device='cuda:0')
pos batch is  tensor([[     2,    459,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   6200,    391,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   6511,      3,   6511,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   3181,   

tensor([ 8.2916,  8.2916,  8.2916,  8.2916,  8.2916,  8.2916,  8.2916,
         8.2916,  8.2916,  8.2916], device='cuda:0')
tensor([ 8.2916,  8.2916,  8.2916,  8.2916,  8.2916,  8.2916,  8.2916,
         8.2916,  8.2916,  8.2916], device='cuda:0')
pos batch is  tensor([[     2,   9108,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   2217,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   4879,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    240,   

tensor([ 8.2579,  8.2579,  8.2579,  8.2579,  8.2579,  8.2579,  8.2579,
         8.2579,  8.2579,  8.2579], device='cuda:0')
tensor([ 8.2579,  8.2579,  8.2579,  8.2579,  8.2579,  8.2579,  8.2579,
         8.2579,  8.2579,  8.2579], device='cuda:0')
pos batch is  tensor([[    3,  4401,   119,     3,   743,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1271,     2,   296,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  9108,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  2252,   527,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.2261,  8.2261,  8.2261,  8.2261,  8.2261,  8.2261,  8.2261,
         8.2261,  8.2261,  8.2261], device='cuda:0')
tensor([ 8.2261,  8.2261,  8.2261,  8.2261,  8.2261,  8.2261,  8.2261,
         8.2261,  8.2261,  8.2261], device='cuda:0')
pos batch is  tensor([[    2,   597,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,   666,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,   761,   119,     2,  2882,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,   160,     2,   888,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1987,  8.1987,  8.1987,  8.1987,  8.1987,  8.1987,  8.1987,
         8.1987,  8.1987,  8.1987], device='cuda:0')
tensor([ 8.1987,  8.1987,  8.1987,  8.1987,  8.1987,  8.1987,  8.1987,
         8.1987,  8.1987,  8.1987], device='cuda:0')
pos batch is  tensor([[     3,   2695,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   6817,      2,   2279,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,    160,      3,    342,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,    474,   

tensor([ 8.1760,  8.1760,  8.1760,  8.1760,  8.1760,  8.1760,  8.1760,
         8.1760,  8.1760,  8.1760], device='cuda:0')
tensor([ 8.1760,  8.1760,  8.1760,  8.1760,  8.1760,  8.1760,  8.1760,
         8.1760,  8.1760,  8.1760], device='cuda:0')
pos batch is  tensor([[    3,   177,     2,  1549,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,   628,     3,  6416,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1141,     2,  5457,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  1279,  2882,     3,  3774,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1600,  8.1600,  8.1600,  8.1600,  8.1600,  8.1600,  8.1600,
         8.1600,  8.1600,  8.1600], device='cuda:0')
tensor([ 8.1600,  8.1600,  8.1600,  8.1600,  8.1600,  8.1600,  8.1600,
         8.1600,  8.1600,  8.1600], device='cuda:0')
pos batch is  tensor([[    2,  5270,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1338,   215,     3,   336,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  2815,     2,  2815,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  6778,     2,  1518,   193,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1514,  8.1514,  8.1514,  8.1514,  8.1514,  8.1514,  8.1514,
         8.1514,  8.1514,  8.1514], device='cuda:0')
tensor([ 8.1514,  8.1514,  8.1514,  8.1514,  8.1514,  8.1514,  8.1514,
         8.1514,  8.1514,  8.1514], device='cuda:0')
pos batch is  tensor([[    2,  9312,    11,     3,  9312,    11,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  2882,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  4144,     3,  3181,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  5270,     2,  4067,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1496,  8.1496,  8.1496,  8.1496,  8.1496,  8.1496,  8.1496,
         8.1496,  8.1496,  8.1496], device='cuda:0')
tensor([ 8.1496,  8.1496,  8.1496,  8.1496,  8.1496,  8.1496,  8.1496,
         8.1496,  8.1496,  8.1496], device='cuda:0')
pos batch is  tensor([[    3,   549,    13,     3,  1131,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,     2,  1131,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,   521,   177,     3,   193,     9,  1518,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1504,  8.1504,  8.1504,  8.1504,  8.1504,  8.1504,  8.1504,
         8.1504,  8.1504,  8.1504], device='cuda:0')
tensor([ 8.1504,  8.1504,  8.1504,  8.1504,  8.1504,  8.1504,  8.1504,
         8.1504,  8.1504,  8.1504], device='cuda:0')
pos batch is  tensor([[     3,   1518,    193,      2,  12184,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,    571,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   3038,   1692,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,    279,   

tensor([ 8.1557,  8.1557,  8.1557,  8.1557,  8.1557,  8.1557,  8.1557,
         8.1557,  8.1557,  8.1557], device='cuda:0')
tensor([ 8.1557,  8.1557,  8.1557,  8.1557,  8.1557,  8.1557,  8.1557,
         8.1557,  8.1557,  8.1557], device='cuda:0')
pos batch is  tensor([[    2,  4401,  6968,     2,  4390,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  1514,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    3,  4879,     3,   202,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    2,  1727,  5902,     3,  1988,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

tensor([ 8.1680,  8.1680,  8.1680,  8.1680,  8.1680,  8.1680,  8.1680,
         8.1680,  8.1680,  8.1680], device='cuda:0')
tensor([ 8.1680,  8.1680,  8.1680,  8.1680,  8.1680,  8.1680,  8.1680,
         8.1680,  8.1680,  8.1680], device='cuda:0')
pos batch is  tensor([[     2,   1739,      2,   1190,    742,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     2,   4144,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   4412,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,
              0],
        [     3,   3469,   

tensor([ 8.1865,  8.1865,  8.1865,  8.1865,  8.1865,  8.1865,  8.1865,
         8.1865,  8.1865,  8.1865], device='cuda:0')
tensor([ 8.1865,  8.1865,  8.1865,  8.1865,  8.1865,  8.1865,  8.1865,
         8.1865,  8.1865,  8.1865], device='cuda:0')


KeyboardInterrupt: 

In [None]:
def visualize_loss(loss, loss2=None, _label="Some label", _label2="Some other label", _name="Generic Name", _only_epoch=True):
    """
        Fn to visualize loss.
        Expects either
            - [int, int] for epoch level stuff
            - [ [int, int], [int, int] ] for batch level data. 
    """
    
    plt.rcParams['figure.figsize'] = [15, 8] 
    
    # Detect input format
    if type(loss[0]) is not list: #in [int, float, long]:
        
#         print("here")
        plt.plot(loss, '-b', label=_label)
        if loss2: plt.plot(loss2, '-r', label=_label2)
        plt.ylabel(_name)
        pylab.legend(loc='upper left')
        plt.show()
        
    elif type(loss[0]) == list:
        
        if _only_epoch:
            loss = [ np.mean(x) for x in loss ]
            if loss2 is not None: 
                loss2 = [ np.mean(x) for x in loss2 ]
            
        else:
            loss = [ y for x in loss for y in x ]
            if loss2 is not None: loss2 = [ y for x in loss2 for y in x ]
            
        plt.plot(loss, '-b', label=_label)
        if loss2 is not None: plt.plot(loss2, '-r', label=_label2)
        plt.ylabel(_name)
        pylab.legend(loc='upper left')
        plt.show() 

In [None]:
precision = validation_accuracy(valid_questions,valid_pos_paths,valid_neg_paths,encoder)


print("Training Loss")
visualize_loss(loss=training_loss, _name="train loss", _only_epoch=True)


print("Validation Accuracy")
visualize_loss(loss=validation_accuracy, _name="train loss", _only_epoch=True)

print("Testing Accuracy")
visualize_loss(loss=test_accuracy, _name="train loss", _only_epoch=True)

In [69]:
dummy_input_a = torch.randint(0,100,(500,25),device=device,dtype=torch.long)
dummy_input_b = torch.randint(0,100,(500,25),device=device,dtype=torch.long)
dummy_question = torch.randint(0,100,(500,25),device=device,dtype=torch.long)
loss_fun = nn.MarginRankingLoss(margin=1)
pos, _ = encode(dummy_input_a,encode.init_hidden(500,device))
neg, _ = encode(dummy_input_b,encode.init_hidden(500,device))
ques, _ = encode(dummy_question,encode.init_hidden(500,device))
print(pos[-1], '\n', neg[-1], '\n', ques[-1])
pos_score = torch.sum(ques[-1]*pos[-1],-1)
neg_score = torch.sum(ques[-1]*neg[-1],-1)


loss = loss_fun(pos_score,neg_score,-1*torch.ones(500,device=device))
print(loss)

tensor([[ 0.1833,  0.1017, -0.2031,  ...,  0.0125, -0.0054, -0.0197],
        [ 0.1826,  0.1026, -0.2023,  ...,  0.0127, -0.0052, -0.0197],
        [ 0.1824,  0.1030, -0.2023,  ...,  0.0130, -0.0057, -0.0201],
        ...,
        [ 0.1833,  0.1021, -0.2028,  ...,  0.0126, -0.0058, -0.0199],
        [ 0.1831,  0.1022, -0.2027,  ...,  0.0126, -0.0056, -0.0197],
        [ 0.1826,  0.1025, -0.2025,  ...,  0.0129, -0.0055, -0.0199]], device='cuda:0') 
 tensor([[ 0.1823,  0.1030, -0.2019,  ...,  0.0129, -0.0055, -0.0198],
        [ 0.1828,  0.1028, -0.2021,  ...,  0.0127, -0.0057, -0.0200],
        [ 0.1825,  0.1029, -0.2023,  ...,  0.0128, -0.0056, -0.0198],
        ...,
        [ 0.1823,  0.1022, -0.2025,  ...,  0.0129, -0.0051, -0.0197],
        [ 0.1832,  0.1021, -0.2026,  ...,  0.0128, -0.0059, -0.0201],
        [ 0.1825,  0.1032, -0.2022,  ...,  0.0127, -0.0057, -0.0199]], device='cuda:0') 
 tensor([[ 0.1827,  0.1024, -0.2022,  ...,  0.0126, -0.0051, -0.0194],
        [ 0.1821,  0.103