## SQUAD Question Answering Dataset - Basic RNN
SQuAD (the Stanford Question Answering Dataset) is a reading comprehension dataset. It consists of questions posed by crowd workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. There are 100,000+ question-answer pairs on 500+ articles.

### Formal definition of the SQUAD Question answering dataset
1. Each example is a three-tuple (Q,P,($a_{s},a_{e}$)), where Q is the question, P is the context paragraph, and $a_{s}$ and $a_{e}$ are the start and end indices of the answer span within P.

### Process
1. Both the question and the context vectors are first encoded with an LSTM.
2. Word embeddings are passed to the encoder, which generates an attention vector that the decoder decodes to produce the final output.
3. 100-dimensional GloVe embeddings can be used.
4. 

### Loss function for the SQUAD Model. 

### TODO:
1. Dataset importing from JSON format
2. Extracting data.
3. Downloading the GloVe vectors to use as word embeddings. Try the 50-dimensional vectors first and then increase the dimensionality.
4. 

In [45]:
from __future__ import print_function
import os
import re
import numpy as np
import tensorflow as tf
from utils import *

%load_ext autoreload
%autoreload 2

Q_SIZE = 40  #Upper bound of max question size
C_SIZE = 500 #Upper bound of max context size

## RNN Model Parameters
DATA_CAP = None
batch_size = 64
num_epochs = 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading the Glove vectors

In [18]:
import csv
W2VEC_LEN = 50
GLOVE_path = "./preprocessing/data/glove.6B/glove.6B.50d.txt"
reader = csv.reader(open(GLOVE_path), delimiter=' ', quoting=csv.QUOTE_NONE) 
W2VEC = {line[0]: np.array(list(map(float, line[1: ]))) for line in reader}
del csv

In [19]:
def vectorize(sent, fill, clean=False):
    """Takes a sentence and returns corresponing list of GloVecs
    @return (ndarray of glovecs, actual_length)
    """
    if clean:
        pass #sent = _dataCleaning(sent)
    words = sent.split(" ")
    words = words[:fill]  #Capping the context. Beware!!
    vecs = np.empty((1,W2VEC_LEN))
    for w in words:
        vec = W2VEC.get(w.lower(), None)
        if vec is None:
            vec = np.random.rand(W2VEC_LEN)
        vec = vec.reshape((1,W2VEC_LEN))
        vecs = np.concatenate((vecs, vec), axis=0)
    
    PADDING = np.zeros((1, W2VEC_LEN))
    for _ in np.arange(fill - len(words)):
        vecs = np.concatenate((vecs, PADDING), axis=0)
    return vecs[1:], len(words)

def _dataCleaning(string):
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.replace("<br />", " ")
    string = string.replace("''", '" ')
    string = string.replace("``", '" ')
    return re.sub(strip_special_chars, "", string)


## Loading the dataset for questions and answers. 

In [20]:
## Loading the dataset here. 
from itertools import izip
path = "./preprocessing/data/squad/"
#Paths
q_p = "train.question"; c_p = "train.context"; s_p = "train.span"
#Lines
q_l = []; c_l = []; s_l = []

itr = 0
with open(path+q_p) as q_f, \
     open(path+c_p) as c_f, \
     open(path+s_p) as s_f:
    for q, c, s in izip(q_f, c_f, s_f):
        c = _dataCleaning(c)
        q_l.append(q), c_l.append(c); s_l.append(s)
        itr += 1

print("#Entries#:All should be same:", itr, len(q_l), len(c_l), len(s_l))

#Entries#:All should be same: 81403 81403 81403 81403


In [21]:
#Vectorise all Questions and Contexts
def get_batch(cnt=64, filtr=True):
    """
    @param filtr: Filters only entries with context less than C_SIZE
    
    Returns Batch of 'cnt' elements from the dataset as vectorized numpy arrays
    @return: (questions, answers, labels, ids) : The first three being numpy vectorised arrays.
                                                'ids' is a list of corresponding entry ids in q_l,c_l,s_l
                                                questions shape would be (cnt, Q_SIZE, W2VEC_LEN)
    Each entry of labels is a one-hot repr of span given
    
    The numpy concatenate function copies at every call and hence is 10X slower for large batches.
    Traditional Python lists append is a better fit here.
    """
    N = (len(q_l) if DATA_CAP is None else DATA_CAP)
    batch_ids = list(np.random.randint(0, N, cnt))
    rmv_ids = []
    q_vecs = []; c_vecs = []; label_vecs = []; q_lens = []; c_lens = []

    #q_vecs = np.empty((1, Q_SIZE, W2VEC_LEN)); c_vecs = np.empty((1, C_SIZE, W2VEC_LEN))
    for idx in batch_ids:
        #q_vec = vectorize(q_l[idx], fill=Q_SIZE).reshape((1, Q_SIZE, W2VEC_LEN))
        #q_vecs = np.concatenate((q_vecs, q_vec), axis=0)
        #c_vec = vecty
        #vecorize(c_l[idx], fill=C_SIZE).reshape((1, C_SIZE, W2VEC_LEN))
        #c_vecs = np.concatenate((c_vecs, c_vec), axis=0)
        span = map(int, s_l[idx].split())
        try:
            if span[1] >= C_SIZE and filtr:
                replacement = np.random.randint(0, N, 1)[0]
                rmv_ids.append(idx)
                batch_ids.append(replacement)
                continue
        except Exception as e:
            print(e, "id:", idx)
            continue
        q_vec, q_actual_len = vectorize(q_l[idx], fill=Q_SIZE)
        q_vecs.append(q_vec); q_lens.append(q_actual_len)
        c_vec, c_actual_len = vectorize(c_l[idx], fill=C_SIZE, clean=True)
        c_vecs.append(c_vec); c_lens.append(c_actual_len)
        
        label_vec = np.zeros(2*C_SIZE) #start_end_vec + end_span_vec
        idx = [span[0], C_SIZE + span[1]]
        label_vec[idx] = 1
        label_vecs.append(label_vec)

    q_vecs = np.array(q_vecs); c_vecs = np.array(c_vecs); label_vecs = np.array(label_vecs)
    q_lens = np.array(q_lens); c_lens = np.array(c_lens)
    
    batch_ids = set(batch_ids); rmv_ids = set(rmv_ids)
    final_ids = batch_ids.difference(rmv_ids)
    return (q_vecs, q_lens), (c_vecs, c_lens), label_vecs, final_ids
    #print(q_vecs.shape, c_vecs.shape, label_vecs.shape) 

## Visualize the dataset

In [22]:
context_lengths = []
question_lengths = []
span_indxs = []

for i in range(len(c_l)):
    context_lengths.append(len(c_l[i].split()))

for i in range(len(q_l)):
    question_lengths.append(len(q_l[i].split()))

for i in range(len(s_l)):
    span_indxs.append(int(s_l[i].split()[1]))
    
print("Max question:", max(question_lengths), "MAx context:", max(context_lengths), "Max span:", max(span_indxs))

Max question: 60 MAx context: 655 Max span: 605


In [23]:
import matplotlib.pyplot as plt
plt.hist(context_lengths)
plt.title("Context Lengths distribution")
# plt.show()
plt.hist(question_lengths)
plt.title("Question Lengths distribution")
# plt.show()

# plt.hist(answer_lengths)
# plt.title("Answer Lengths distribution")
# plt.show()
del plt

## Model the generator for the tensorflow model. 

In [24]:
tf.reset_default_graph()

In [25]:
st = """def generator(samples, session, batch_size = 32):
    num_samples = len(samples)
    
    while 1:
        sklearn.utils.shuffle(samples)
        
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset+batch_size]
            labels = []
            reviews_embeddings = []
            
            for batch_sample in batch_samples:
"""

In [26]:
class MyRNNCell(tf.nn.rnn_cell.RNNCell):
  """The most basic RNN cell.
  Args:
    num_units: int, The number of units in the RNN cell.
    activation: Nonlinearity to use.  Default: `tanh`.
    reuse: (optional) Python boolean describing whether to reuse variables
     in an existing scope.  If not `True`, and the existing scope already has
     the given variables, an error is raised.
  """

  def __init__(self, num_units, activation=None, reuse=None):
    super(MyRNNCell, self).__init__(_reuse=reuse)
    self._num_units = num_units

  @property
  def state_size(self):
    return self._num_units

  @property
  def output_size(self):
    return self._num_units

  def call(self, inputs, state):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    
    # reshape vectors to matrices
    state_prev = tf.reshape(state, [1, self.state_size])
    x = tf.reshape(x, [1,state_size])
    # initializer
    xav_init = tf.contrib.layers.xavier_initializer
    # params
    Whh = tf.get_variable('Whh', shape=[hsize, hsize], initializer=xav_init())
    Wih = tf.get_variable('Wih', shape=[state_size, hsize], initializer=xav_init())
    b = tf.get_variable('b', shape=[hsize], initializer=tf.constant_initializer(0.001))
    
    
    # current hidden state
    h = tf.tanh(tf.matmul(hprev, W) + tf.matmul(x,U) + b)
    h = tf.reshape(h, [hsize])

    output = tf.tanh(tf.matmul(state, W_h) + tf.matmul(inputs, W_x) + b) #_linear([inputs, state], self._num_units, True))
    return output


In [27]:
#Question Module
q_state_sz = 64

def question_module(init_state=None):
    '''
    @arg init_state :tf.Tensor of shape (batch_size, q_state_sz)
    
    @return (op, state, c_batch) where op(output) and state are of shape (batch_size, q_state_sz)
            and q_batch is the question_batched input placeholder
    '''
    q_batch = tf.placeholder(tf.float32, [Q_SIZE, batch_size, W2VEC_LEN])
    seq_lens = tf.placeholder(tf.int32, [batch_size,])

    #Define RNN Cell
    q_cell = tf.nn.rnn_cell.BasicRNNCell(q_state_sz)
    #LSTM version: Note that the state o/p of LSTM is different: It is a LSTMStateTuple:(cell, state)
    #q_cell = tf.nn.rnn_cell.BasicLSTMCell(q_state_sz, forget_bias=1.0)
    
    #Default initial state is all zeros
    outputs, state = tf.nn.dynamic_rnn(q_cell, q_batch,
                                      initial_state=init_state,
                                      dtype=tf.float32, time_major=True,
                                      sequence_length=seq_lens)

    return outputs, state, q_batch, seq_lens

In [28]:
#Context Module
c_state_sz = 64
def context_module(init_state):
    '''
    @arg init_state :tf.Tensor of shape (batch_size, c_state_sz)

    @return (op, state, c_batch) where op(output) and state are of shape (batch_size, c_state_sz)
            and c_batch is the context_batched input placeholder
    '''
    if init_state is None: raise ValueError("Dnt Kid me!!. Give a state")
    c_batch = tf.placeholder(tf.float32, [C_SIZE, batch_size, W2VEC_LEN])
    seq_lens = tf.placeholder(tf.int32, [batch_size,])
    
    context_cell = tf.nn.rnn_cell.BasicRNNCell(c_state_sz)
    #LSTM version: Note that the state o/p of LSTM is different: It is a LSTMStateTuple:(cell, state)
    #context_cell = tf.nn.rnn_cell.BasicLSTMCell(c_state_sz, forget_bias=1.0)

#     outputs, state = tf.nn.static_rnn(context_cell, x,
#                                        initial_state=init_state,
#                                        dtype=tf.float32)
    outputs, state = tf.nn.dynamic_rnn(context_cell, c_batch,
                                  initial_state=init_state,
                                  dtype=tf.float32, time_major=True,
                                  sequence_length=seq_lens)


    return outputs, state, c_batch, seq_lens
    

In [None]:
#Answer Module
def _init_wts(shape):
    bound = np.sqrt(6.0 / sum(shape))
    wts = tf.random_uniform(shape, minval=-bound, maxval=bound, dtype=tf.float32)
    #wts = tf.random_normal(shape, stddev=np.sqrt(2.0 / np.sum(shape)), dtype=tf.float64)
    return tf.Variable(wts)

def _init_bias(sz):
    b = np.ones(sz)*0.001
    return tf.Variable(b, dtype=tf.float32)

def answer_module(ip_state, layers=[]):
    '''
    ip_state: tf.Tensor of shape (batch_sz, C_SIZE)
    layers: List of hidden_layer sizes to be used
    
    @return: tf.Tensor variables of loss gradient and loss and labes placeholder
    '''
    prev_feature_sz = c_state_sz
    conditioned_state =  ip_state  #State(Context|Question)

    X = conditioned_state
    
    for layer in layers:
        w = _init_wts((prev_feature_sz, layer))
        b = _init_bias(layer)
        prev_feature_sz = layer
        h_l = tf.nn.relu(tf.matmul(X, w) + b)
        X = h_l
    
    #Final O/P layer
    w_s = _init_wts((prev_feature_sz, C_SIZE)); b_s = _init_bias(C_SIZE)
    w_e = _init_wts((prev_feature_sz, C_SIZE)); b_e = _init_bias(C_SIZE)
    
    logits_s = tf.matmul(X, w_s) + b_s
    logits_e = tf.matmul(X, w_e) + b_e

    return logits_s, logits_e
        

In [30]:
def loss_fn(logits_s, logits_e):
    labels_holder = tf.placeholder(tf.float32, shape=(2, batch_size, C_SIZE))
    loss_s = tf.nn.softmax_cross_entropy_with_logits(logits=logits_s, labels=labels_holder[0,:])
    loss_e  = tf.nn.softmax_cross_entropy_with_logits(logits=logits_e, labels=labels_holder[1,:])
    
    probs_s = tf.nn.softmax(logits_s)
    probs_e = tf.nn.softmax(logits_e)
    preds_s = tf.argmax(probs_s, axis=1)
    preds_e = tf.argmax(probs_e, axis=1)
    
    loss_s_mean = tf.reduce_mean(loss_s)
    loss_e_mean  = tf.reduce_mean(loss_e)
    loss = tf.add(loss_s_mean, loss_e_mean)
    
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate = 1e-5).minimize(loss)
    optimizer = tf.train.AdamOptimizer()
    loss_grad = optimizer.minimize(loss)  #Gradient Clipping is part of it    
    
    return loss_grad, loss, labels_holder, preds_s, preds_e

In [36]:
#BUILD the Computational graph
tf.reset_default_graph()

with tf.variable_scope("question_module"):
    q_ops, q_state, q_batch_ip, q_seq_lens_holder = question_module(None)
with tf.variable_scope("context_module"):
    c_ops, c_state, c_batch_ip, c_seq_lens_holder = context_module(q_state)
with tf.variable_scope("answer_module"):
    logits_s, logits_e = answer_module(c_state)
with tf.variable_scope("loss_module"):
    loss_grad_train, loss_train, labels_ip_train, preds_s, preds_e = loss_fn(logits_s, logits_e)


In [58]:
#Evaluate a random sample and get metrics
def get_metrics(dataset, sess=None, sample=100):
    '''
    @param dataset: tuple of (List of questions, List of contexts, List of spans)
    '''
    q_l, c_l, s_l = dataset
    
    #Placeholders to be filled: q_batch_ip, q_seq_lens_holder, c_seq_lens_holder, c_batch_ip, labels_ip_train
    
    if sess == None: 
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        
    (q_vecs, q_lens), (c_vecs, c_lens), label_vecs, idx = get_batch(cnt=batch_size)
    
    q_vecs_ip = q_vecs.transpose(1,0,2) #To make it to (time_steps, batch_size, word_vec_len_features) shape
    c_vecs_ip = c_vecs.transpose(1,0,2) #To make it to (time_steps, batch_size, word_vec_len_features) shape
    label_vecs = label_vecs.reshape(2, batch_size, C_SIZE)
    
    start, end = sess.run([preds_s, preds_e], feed_dict={q_batch_ip : q_vecs_ip,
                                              q_seq_lens_holder : q_lens,
                                              c_seq_lens_holder : c_lens,
                                              c_batch_ip : c_vecs_ip,
                                              labels_ip_train: label_vecs})
    
    em_scr = 0; f1_scr = 0
    common_len = 0; pred_len = 0; grnd_len = 0
    for _id, (s, e) in enumerate(zip(start, end)):
        pred_ans_sent = get_answer_txt(s, e, c_l[_id])
        gs, ge = map(int, s_l[_id].split())
        ground_truth  = get_answer_txt(gs, ge, c_l[_id])
        common_l, pred_l, grnd_l = count_tokens(pred_ans_sent, ground_truth)
        common_len += common_l; pred_len += pred_l; grnd_len += grnd_l
        em_scr += exact_match_score(pred_ans_sent, ground_truth)
        
    f1_scr += f1_score(common_len, pred_len, grnd_len)
    
    return em_scr, f1_scr
        
# get_metrics((q_l, c_l, s_l))

In [59]:
import traceback
import time

def train():
    N = (len(q_l) if DATA_CAP is None else DATA_CAP)
    itr = 0
    
    for _ in range(num_epochs):
        print("Epoch: ", _)
        st = time.time()
        for batches in np.arange(N/batch_size):
            print(batches," ", end="")

            (q_vecs, q_lens), (c_vecs, c_lens), label_vecs, _ = get_batch(cnt=batch_size)
            q_vecs_ip = q_vecs.transpose(1,0,2) #To make it to (time_steps, batch_size, word_vec_len_features) shape
            c_vecs_ip = c_vecs.transpose(1,0,2) #To make it to (time_steps, batch_size, word_vec_len_features) shape
            label_vecs = label_vecs.reshape(2, batch_size, C_SIZE)
#             print(sess.run(c_state, feed_dict={q_batch_ip : q_vecs_ip,
#                                                c_batch_ip : c_vecs_ip}))
#             sess.run(q_state, feed_dict={q_batch_ip : q_vecs_ip,
#                                            c_batch_ip : c_vecs_ip,
#                                            labels_ip_train : label_vecs})
            
            ret = sess.run(loss_train, feed_dict={q_batch_ip : q_vecs_ip,
                                                  q_seq_lens_holder : q_lens,
                                                  c_seq_lens_holder : c_lens,
                                                  c_batch_ip : c_vecs_ip,
                                                  labels_ip_train : label_vecs})
            itr += 1
            if itr % 5 == 0:
                end = time.time()
                loss = sess.run(loss_train, feed_dict={q_batch_ip : q_vecs_ip,
                                                       q_seq_lens_holder : q_lens, 
                                                       c_seq_lens_holder : c_lens,
                                                       c_batch_ip : c_vecs_ip,
                                                       labels_ip_train : label_vecs})
                em, f1 = get_metrics((q_l, c_l, s_l))
                print("\nItr: %s, Loss:%s, Acc:%s, Time:%s"%(itr, loss, f1, end-st))
                st = end

#tf.reset_default_graph()
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    train()

Epoch:  0
0  1  2  3  4  
Itr: 5, Loss:12.4473, Acc:0.0320134793597, Time:2.02008700371
5  6  7  8  9  
Itr: 10, Loss:12.4301, Acc:0.0218265364733, Time:2.55919694901
10  11  12  13  14  
Itr: 15, Loss:12.4505, Acc:0.0507936507937, Time:2.6016600132
15  16  17  18  19  
Itr: 20, Loss:12.4235, Acc:0.0394618834081, Time:2.574696064
20  21  22  23  24  
Itr: 25, Loss:12.4117, Acc:0.0172626387176, Time:2.47731184959
25  26  27  28  29  
Itr: 30, Loss:12.4507, Acc:0.0170575692964, Time:2.48838019371
30  31  32  33  34  
Itr: 35, Loss:12.4342, Acc:0.0339805825243, Time:2.62697100639
35  36  37  38  39  
Itr: 40, Loss:12.4571, Acc:0.00308166409861, Time:2.49612283707
40  41  42  43  44  
Itr: 45, Loss:12.4266, Acc:0.0471894517696, Time:2.48649501801
45  46  47  48  49  
Itr: 50, Loss:12.4423, Acc:0.0575539568345, Time:2.49991607666
50  51  52  53  54  

KeyboardInterrupt: 