## SQUAD Question Answering Dataset - Basic RNN
SQuAD (the Stanford Question Answering Dataset) is a new reading comprehension dataset. It consists of questions posed by crowd workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. There are 100,000+ question-answer pairs on 500+ articles. 

### Formal definition of the SQUAD Question answering dataset
1. Each example is a three-tuple (Q, P, ($a_{s},a_{e}$)), where Q is the question, P is the context paragraph, and $a_{s}$ and $a_{e}$ are the start and end indices of the answer span within P. 

### Process
1. Both the question and the context vectors are first encoded by an LSTM. 
2. Word embeddings are passed to the encoder, which generates an attention vector that the decoder decodes to produce the final output. 
3. Can use 100 dimensional glove embeddings. 
4. Simple Baseline Model -> Encoder - decoder architecture. 
    1. Pointer networks and Coattention techniques result in significant improvements. 
5.

### Loss function for the SQUAD Model. 

1. Use tf.nn.embedding_lookup() to get the embeddings. 
2. 

### TODO:
1. DONE - Dataset importing from JSON format
2. DONE - Extracting data.
3. DONE - Downloading the GloVe vector model for the word vectors. Try the 50-dimensional vectors first and then increase the dimensionality. 
4. 

In [24]:
from __future__ import print_function
import os
import re
# import nltk
# import sklearn
import numpy as np
import tensorflow as tf
# from nltk import word_tokenize
import matplotlib.pyplot as plt
import traceback
# from sklearn.model_selection import train_test_split

Q_SIZE = 40  #Upper bound of max question size
C_SIZE = 250 #Upper bound of max context size

## RNN Model Parameters
DATA_CAP = 1000
batch_size = 64
num_epochs = 1

## Loading the Glove vectors

In [2]:
import csv
W2VEC_LEN = 50
GLOVE_path = "./preprocessing/data/glove.6B/glove.6B.50d.txt"
reader = csv.reader(open(GLOVE_path), delimiter=' ', quoting=csv.QUOTE_NONE) 
W2VEC = {line[0]: np.array(list(map(float, line[1: ]))) for line in reader}
del csv

In [3]:
# f = open('glove.6B.50d.txt')
# wordList = []
# embeddings = []
# model = {}

# for line in f:
#     splitLine = line.split()
#     word = splitLine[0]
#     wordList.append(word)
#     embedding = [float(val) for val in splitLine[1:]]
#     embeddings.append(embedding)
#     model[word] = embedding
# print (" Done: ",len(model)," loaded")

# ## Word List and the Word vectors of all the words.
# wordsList   = np.asarray(wordList)
# wordVectors = np.asarray(embeddings, dtype = np.float32 )

# W2VEC = wordVectors

## Loading the dataset for questions and answers. 

In [4]:
# from itertools import zip

path = "./preprocessing/data/squad/"
q_p = "train.question"
c_p = "train.context"
s_p = "train.span"

q_l = []; c_l = []; s_l = []
itr = 0

with open(path+q_p) as q_f, \
     open(path+c_p) as c_f, \
     open(path+s_p) as s_f:
            
    for q, c, s in zip( q_f, c_f, s_f):
        q_l.append(q), c_l.append(c); s_l.append(s)
        itr += 1
        
print ("All should be same:", itr, len(q_l), len(c_l), len(s_l))
samples = list(zip(q_l, c_l,s_l))

# combined_samples, test_samples = train_test_split(samples, test_size = 0.2, random_state = 2)
# train_samples, val_samples = train_test_split( combined_samples, test_size = 0.2, random_state = 2)

train_samples = samples[:int(0.6*len(samples))]
val_samples = samples[int(0.61*len(samples)):]


All should be same: 81403 81403 81403 81403


In [5]:
# train_samples = train_samples[:100]

In [7]:
# context_lengths = []
# question_lengths = []
# answer_lengths = []

# for i in range(len(c_l)):
#     context_lengths.append(len(vectorize(c_l[i])))

# for i in range(len(q_l)):
#     question_lengths.append(len(vectorize(q_l[i])))
    
# for i in range(len(s_l)):
#     answer_lengths.append(len(vectorize(s_l[i])))
    

In [8]:

# plt.hist(context_lengths)
# plt.title("Context Lengths distribution")
# plt.show()

# plt.hist(question_lengths)
# plt.title("Question Lengths distribution")
# plt.show()

# plt.hist(answer_lengths)
# plt.title("Answer Lengths distribution")
# plt.show()


### Model Hyper - Parameters

In [9]:
## RNN Model Parameters
embed_size = W2VEC_LEN
## Maximum length for the question
maxSeqLen_net1 = Q_SIZE

## Maximum length for the context paragraph
maxSeqLen_net2 = C_SIZE
maxSeqLen_decoder = maxSeqLen_net2

hidden_size = 32

## For fully connected layer
hidden_layer_size = 640

In [10]:
# print (s_l[0])
# idx = 0
# values = s_l[idx].split()
# values[0] = int(values[0])
# values[1] = int(values[1])
# span = values[:2]     
# print (span)     

In [45]:
def vectorize(sent, fill, clean=False):
    'Takes a sentence and returns corresponing list of GloVecs'
    if clean:
        sent = _dataCleaning(sent)
    words = sent.split(" ")
    words = words[:fill]  #Capping the context. Beware!!
    vecs = np.empty((1,W2VEC_LEN))
    
    for w in words:
        vec = W2VEC.get(w.lower(), None)
        if vec is None:
            vec = np.random.rand(W2VEC_LEN)
        vec = vec.reshape((1,W2VEC_LEN))
        vecs = np.concatenate((vecs, vec), axis=0)
    
    PADDING = np.zeros((1, W2VEC_LEN))
    for _ in np.arange(fill - len(words)):
        vecs = np.concatenate((vecs, PADDING), axis=0)
    return vecs[1:]
    
def _dataCleaning(string):
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.replace("<br />", " ")
    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    string = string.replace("''", '" ')
    string = string.replace("``", '" ')

    return re.sub(strip_special_chars, "", string)

#Vectorise all Questions and Contexts
def get_batch(cnt=64):
    """
    Returns Batch of 'cnt' elements from the dataset as vectorized numpy arrays
    @return: (questions, answers, labels) : All are vectorized numpy arrays 
    questions shape would be (cnt, Q_SIZE, W2VEC_LEN)
    Each entry of labels is a one-hot repr of span given
    
    The numpy concatenate function copies at every call and hence is 10X slower for large batches.
    Traditional Python lists append is a better fit here.
    """
    N = (len(q_l) if DATA_CAP is None else DATA_CAP)
    batch_ids = list(np.random.randint(0, N+1, cnt))
    q_vecs = []; c_vecs = []; label_vecs = []
    
    for idx in batch_ids:
        try:
            s_l[idx]
            values = s_l[idx].split()
            values[0] = int(values[0])
            values[1] = int(values[1])
            span = values[:2]
        except Exception as e:
            print("Id:", idx)
            print(traceback.format_exc())
            raise e
            span = [1000, 1000]
            
        if span[1] >= C_SIZE:
            replacement = np.random.randint(0, N, 1)[0]
            batch_ids.append(replacement)
            continue
        q_vec = vectorize(q_l[idx], fill=Q_SIZE)
        q_vecs.append(q_vec)
        c_vec = vectorize(c_l[idx], fill=C_SIZE, clean=True)
        c_vecs.append(c_vec)
        label_vec = np.zeros(C_SIZE)
        label_vec[span] = 1
        label_vecs.append(label_vec)
        
    q_vecs = np.array(q_vecs); c_vecs = np.array(c_vecs); label_vecs = np.array(label_vecs)
    return q_vecs, c_vecs, label_vecs
    #print(q_vecs.shape, c_vecs.shape, label_vecs.shape)

In [12]:
tf.reset_default_graph()

## Model the generator for the tensorflow model. 

In [31]:
# To get embeddings of all the words based on the index of the word.
def getEmbeds(sess, sentence, maxSeqLen):
    
    sentence = sentence[:maxSeqLen]
    sent_ids = []
    sentence_word2vecs = []
    
    if( len(sentence)>=maxSeqLen ):
        sentence = sentence[:maxSeqLen]
        
    else:    
        for i in range( maxSeqLen - len(sentence)):
            sentence.append('.')
    
    for i in range(len(sentence)):
        if (not sentence[i] in wordList):
            sent_ids.append(np.random.randint(35000))
        else:
            sent_ids.append(wordList.index(sentence[i]))
    
    for i in range(len(sentence)):
        sentence_word2vecs.append(wordVectors[sent_ids[i]])
    return sentence_word2vecs

    
def dataCleaning(string):
    
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.lower().replace("<br />", " ")
    return re.sub( strip_special_chars, "", string.lower())


def generator2(samples, session, batch_size = 32):
    
    num_samples = len(samples)
    
    while 1:
        
        for offset in range(0, num_samples, batch_size):
            
            batch_samples = samples[offset:offset+batch_size]
            
            labels = []
            question_embeddings = []
#             context_embeddings = []
            
            for batch_sample in batch_samples:
                
                question = batch_sample[0]
                context  = batch_sample[1]
                answer   = batch_sample[2]
                
                total = question + context
                cleaned_total   = total.split()
                
                total_embedding = getEmbeds(  session, cleaned_total, maxSeqLen_net1+maxSeqLen_net2)
                question_embeddings.append(total_embedding)
                
                present_label = answer.split(" ")
                label_array = [0]*maxSeqLen_net2
                index1 = int(present_label[0])
                index2 = int(present_label[1])
                index1 = min(index1,maxSeqLen_net2)
                index2 = min(index2, maxSeqLen_net2)
                
                if(index1 == index2):
                    label_array[index1-1] = 1
                else:
                    for i in range(index1, index2+1):
                        label_array[i-1] = 1

                labels.append(label_array)
                
            question_embeddings = np.asarray(question_embeddings)
            labels = np.asarray(labels)
            
            yield question_embeddings, labels
            
            
def generator(samples, session, batch_size = 32):
    
    num_samples = len(samples)
    
    while 1:
        for offset in range(0, num_samples, batch_size):
            
#             batch_samples = samples[offset:offset+batch_size]
#             labels = []
                
#             question_embeddings = np.zeros([batch_size, maxSeqLen_net1+maxSeqLen_net2, embed_size])
#             labels = np.zeros([batch_size, maxSeqLen_net2])
            q, c, l = get_batch(cnt=batch_size)
#             print(q.shape, c.shape, l.shape)
            q_c = np.concatenate([q, c], axis=1)
            yield q_c, l
        

In [16]:
input_ph_net1 = tf.placeholder(dtype = tf.float32, shape = (None, maxSeqLen_net1+maxSeqLen_net2 , embed_size))
labels_placeholder = tf.placeholder(dtype = tf.int64, shape = (None, maxSeqLen_net2))

with tf.variable_scope("rnn"):    
    weights = {
        'w_inp'   : tf.get_variable("w_inp", initializer = tf.contrib.layers.xavier_initializer(),   shape = [embed_size, hidden_size]),
        'w_hidden': tf.get_variable("w_hidden",initializer = tf.contrib.layers.xavier_initializer(),  shape = [hidden_size, hidden_size]),
        'wfc1': tf.get_variable("wfc1",initializer = tf.contrib.layers.xavier_initializer(), shape = [hidden_size, hidden_layer_size]),
        'w_out':  tf.get_variable("w_out",initializer = tf.contrib.layers.xavier_initializer(), shape = [hidden_layer_size, maxSeqLen_net2])
    }
    biases = {
        'b_hidden': tf.get_variable("b_hidden",initializer = tf.contrib.layers.xavier_initializer(), shape = [hidden_size]),
        'bfc1': tf.get_variable("bfc1",initializer = tf.contrib.layers.xavier_initializer(), shape = [hidden_layer_size]),
        'b_out': tf.get_variable("b_out",initializer = tf.contrib.layers.xavier_initializer(), shape = [maxSeqLen_net2])
    }
    

In [18]:
class RNNCell(tf.contrib.rnn.RNNCell):
    
    def __init__(self, input_size, state_size):
        self.input_size  = input_size
        self._state_size = state_size
        
    @property
    def state_size(self):
        return self._state_size
        
    @property
    def output_size(self):
        return self._state_size
        
    def __call__(self, inputs, state):
        
        
        state = tf.cast(state, tf.float32)
        
#         if(len(state.get_shape())<2):
#             state = tf.expand_dims(state, axis=0)
#         if(len(inputs.get_shape())<2):
#             inputs = tf.expand_dims(inputs, axis=0)
        
        with tf.variable_scope("rnn", reuse =True):
                
            W_x = tf.get_variable( 'w_inp', shape = (self.input_size, self.state_size), initializer = tf.contrib.layers.xavier_initializer(), dtype = tf.float32)
            W_h = tf.get_variable( 'w_hidden' , shape = (self.state_size, self.state_size), initializer = tf.contrib.layers.xavier_initializer(), dtype = tf.float32)
            b   = tf.get_variable( 'b_hidden' ,shape=(self.state_size), initializer = tf.contrib.layers.xavier_initializer(), dtype = tf.float32)
            h_t = tf.tanh( tf.matmul(state, W_h) + tf.matmul(inputs, W_x) + b)
            
        new_state = h_t
        
        return new_state
        

In [19]:
#     with tf.variable_scope("RNN", reuse = tf.AUTO_REUSE):
#         rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
#         initial_state = rnn_cell.zero_state(batch_size, dtype = tf.float32)
#         outputs,state = tf.nn.dynamic_rnn(rnn_cell, x, initial_state = initial_state, dtype = tf.float32)
    
def myNet1( x, state ):
    
    cell = RNNCell( embed_size, hidden_size)
    x_ = tf.unstack( x, axis=1)
    
    for i in range(len(x_)):
        state = cell( x_[i], state)
        
    return state
    
def FC_Net(input_fc):
    with tf.variable_scope("rnn"):
        fc1 = tf.add( tf.matmul(input_fc, weights['wfc1']),biases['bfc1'] )
        logits = tf.add( tf.matmul(fc1, weights['w_out']), biases['b_out'])
        return logits
        

In [20]:
with tf.variable_scope("rnn"):
    state = tf.zeros(shape = [batch_size, hidden_size], dtype = tf.float32)
    
prediction_net1 = myNet1( input_ph_net1, state)
print ("Prediction 1 shape is: ",prediction_net1.get_shape())
prediction3     = FC_Net(prediction_net1)
print (prediction3.get_shape())

pred_values = tf.argmax(prediction3,1)
label_values = tf.argmax(labels_placeholder,1)

correct_prediction = tf.equal( pred_values, label_values)
accuracy_prediction = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print ("prediction 3: ",prediction3.get_shape())
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( logits = prediction3, labels = labels_placeholder))

## Optimizer. 
optimizer = tf.train.AdamOptimizer(learning_rate = 1e-5).minimize(loss)

Prediction 1 shape is:  (64, 32)
(64, 250)
prediction 3:  (64, 250)


In [21]:
def f1_score(prediction, ground_truth):
    
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0
    
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))
        
def evaluate(dataset, predictions):
    
    f1 = exact_match = total = 0
    
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [44]:
import time

saver = tf.train.Saver()
save_file = "models/model1.ckpt"

with tf.Session(config = tf.ConfigProto(log_device_placement = True)) as sess:
    
    sess.run(tf.global_variables_initializer())
    
    for i in range(num_epochs):
        
        print ("number of Epoch: ", i)
        batch_generator = generator(train_samples, sess, batch_size)
        st = time.time()
        for j in range( int(len(train_samples)/batch_size)):
            batchX, batchY = batch_generator.next()
            if(len(batchX)==batch_size):
                sess.run(optimizer, feed_dict ={input_ph_net1: batchX, labels_placeholder: batchY})                
                
                if(j%20==0):
                    loss_value, acc = sess.run( [loss, accuracy_prediction], feed_dict = {input_ph_net1: batchX, labels_placeholder: batchY})
                    et = time.time()
                    print ('Itr:', j,", Loss:",loss_value , " Accuracy is: ",acc, "Time: ", et-st)
                    st = et
                    
            
            
#         valid_generator = generator( val_samples, sess, batch_size = len(val_samples))
#         valX, valY = valid_generator.__next__()
#         pred_logits = sess.run( prediction3, feed_dict = feed_dict={input_ph_net1: valX, labels_placeholder: valY})
            
#         print ("The validation accuracy is: ",sess.run(loss, feed_dict={input_ph_net1: valX, labels_placeholder: valY}))
#     saver.save( sess, "model1.ckpt")
            

number of Epoch:  0
Itr: 0 , Loss: 9.71439  Accuracy is:  0.046875 Time:  0.62177491188
Itr: 20 , Loss: 9.03849  Accuracy is:  0.015625 Time:  2.07964515686
Itr: 40 , Loss: 9.32232  Accuracy is:  0.015625 Time:  2.05185604095
Itr: 60 , Loss: 9.09136  Accuracy is:  0.0 Time:  2.05866479874
Itr: 80 , Loss: 9.13525  Accuracy is:  0.03125 Time:  2.03135204315
Itr: 100 , Loss: 9.10408  Accuracy is:  0.03125 Time:  2.0183570385
Itr: 120 , Loss: 9.33399  Accuracy is:  0.0 Time:  2.01872396469
Itr: 140 , Loss: 9.39105  Accuracy is:  0.015625 Time:  2.02227115631
Itr: 160 , Loss: 9.32971  Accuracy is:  0.015625 Time:  2.02259683609
Itr: 180 , Loss: 8.40777  Accuracy is:  0.0 Time:  2.03353118896
Itr: 200 , Loss: 9.52085  Accuracy is:  0.0 Time:  2.02603292465
Itr: 220 , Loss: 9.43702  Accuracy is:  0.03125 Time:  2.01594591141
Itr: 240 , Loss: 9.02743  Accuracy is:  0.0 Time:  2.03871512413


KeyboardInterrupt: 