In [1]:
import tensorflow as tf
import sys
import numpy as np
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import gfile
import logging
import time
import tensorflow_addons as tfa
# tf.reset_default_graph()

logging.basicConfig(stream = sys.stdout, level=logging.INFO)

In [2]:
print(tf.__version__)

2.1.0


In [3]:
class Config:
    num_epochs = 30
    batch_size = 32
    train_embeddings=0
    max_gradient_norm=-1
    hidden_state_size=150
    embedding_size=50
    data_dir="data/preprocessed"
    vocab_path="vocabulary/vocab.dat"
    embed_path="glove_vectors/_vectors.npz"
    dropout_val=1.0
    train_dir="models_lstm_basic"
    use_match=0
    

    def get_paths(mode):
        question = "data/preprocessed/"+str(mode)+".ids.question" 
        context = "data/preprocessed/"+str(mode)+".ids.context" 
        answer = "data/preprocessed/"+str(mode)+".span"

        return question, context, answer 

    question_train, context_train, answer_train = get_paths("train")
    question_dev ,context_dev ,answer_dev = get_paths("dev")

In [4]:
class squad_dataset(object):
    def __init__(self, question_file, context_file, answer_file):
        """
        Args:
            filename: path to the files
        """
        self.question_file = question_file
        self.context_file = context_file
        self.answer_file = answer_file

        self.length = None

    def iter_file(self, filename):
        with open(filename) as f:
            for line in f:
                line = line.strip().split(" ")
                line = map(lambda tok: int(tok), line)
                yield line


    def __iter__(self):
        niter = 0

        question_file_iter = self.iter_file(self.question_file)
        answer_file_iter = self.iter_file(self.answer_file)
        context_file_iter = self.iter_file(self.context_file)

        for question, context, answer in zip(question_file_iter, context_file_iter, answer_file_iter):
            yield list(question),list(context), list(answer)



    def __len__(self):
        """
        Iterates once over the corpus to set and store length
        """
        if self.length is None:
            self.length = 0
            for _ in self:
                self.length += 1

        return self.length

In [5]:
def get_trimmed_glove_vectors(filename):
    """
    Args:
        filename: path to the npz file
    Returns:
        nmatrix of embeddings (np array)
    """
    return np.load(filename)["glove"]

In [6]:
def initialize_vocab(vocab_path):
    if gfile.Exists(vocab_path):
        rev_vocab = [] 
        with tf.io.gfile.GFile(vocab_path, mode="rb") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip(b'\n') for line in rev_vocab]
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found.", vocab_path)

In [7]:
config = Config()
train = squad_dataset(config.question_train, config.context_train, config.answer_train)
dev = squad_dataset(config.question_dev, config.context_dev, config.answer_dev)

In [8]:
import re
def clear_dictonary(vocab):
    dict_1 = {}
    i=0
    for k in vocab:
        k = re.sub(r'.*b\'', '\'', str(k)).replace('\'','')
        dict_1.update({k:i})
        i+=1
    return dict_1

def clear_list(dirty_list):
    clean_list = []
    for x in dirty_list:
        clean_list.append(re.sub(r'.*b\'', '\'', str(x)).replace('\'',''))
    return clean_list 

In [9]:
embed_path = config.embed_path
vocab_path = config.vocab_path
vocab, rev_vocab = initialize_vocab(vocab_path)
vocab1 = clear_dictonary(vocab)
rev_vocab1 = clear_list(rev_vocab)
embeddings = get_trimmed_glove_vectors(embed_path)

In [10]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return np.array(sequence_padded), np.array(sequence_length)

def pad_sequences(sequences, pad_tok):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    max_length = max([len(list(x)) for x in sequences])
    sequence_padded, sequence_length = _pad_sequences(sequences, 
                                            pad_tok, max_length)

    return sequence_padded, sequence_length 

In [11]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

question_ids = tf.placeholder(tf.int32, shape = [None, None], name = "question_ids")
passage_ids = tf.placeholder(tf.int32, shape = [None, None], name = "passage_ids")

question_lengths = tf.placeholder(tf.int32, shape=[None], name="question_lengths")
passage_lengths = tf.placeholder(tf.int32, shape = [None], name = "passage_lengths")

labels = tf.placeholder(tf.int32, shape = [None, 2], name = "gold_labels")
dropout = tf.placeholder(tf.float32, shape=[], name = "dropout")

Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
non-resource variables are not supported in the long term


In [12]:
def get_feed_dict(questions, contexts, answers, dropout_val):
    """
    -arg questions: A list of list of ids representing the question sentence
    -arg contexts: A list of list of ids representing the context paragraph
    -arg dropout_val: A float representing the keep probability for dropout 

    :return: dict {placeholders: value}
    """

    padded_questions, question_length = pad_sequences(questions, 0)
    padded_contexts, passage_length = pad_sequences(contexts, 0)


    feed = {
        question_ids : padded_questions,
        passage_ids : padded_contexts,
        question_lengths : question_length,
        passage_lengths : passage_length,
        labels : answers,
        dropout : dropout_val
    }

    return feed

In [13]:
with tf.variable_scope("vocab_embeddings"):
    _word_embeddings = tf.Variable(embeddings, name="_word_embeddings", dtype=tf.float32, trainable= config.train_embeddings)
    question_emb = tf.nn.embedding_lookup(_word_embeddings, question_ids, name = "question") # (-1, Q, D)
    passage_emb = tf.nn.embedding_lookup(_word_embeddings, passage_ids, name = "passage") # (-1, P, D)
    # Apply dropout
    question = tf.nn.dropout(question_emb, config.dropout_val)
    passage  = tf.nn.dropout(passage_emb, config.dropout_val)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [14]:
hidden_size=150

In [15]:
def encode(inputs, masks, encoder_state_input = None):
    """
    :param inputs: vector representations of question and passage (a tuple) 
    :param masks: masking sequences for both question and passage (a tuple)

    :param encoder_state_input: (Optional) pass this as initial hidden state
                                to tf.nn.dynamic_rnn to build conditional representations
    :return: an encoded representation of the question and passage.
    """


    question, passage = inputs
    masks_question, masks_passage = masks    

    # read passage conditioned upon the question
    with tf.variable_scope("encoded_question"):
        lstm_cell_fw_question = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
        lstm_cell_bw_question = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
        encoded_question, (q_rep, _) = tf.nn.bidirectional_dynamic_rnn(lstm_cell_fw_question, lstm_cell_bw_question, question, masks_question, dtype=tf.float32) # (-1, Q, H)

    with tf.variable_scope("encoded_passage"):
        lstm_cell_fw_passage  = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
        lstm_cell_bw_passage  = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
        encoded_passage, (p_rep, _) =   tf.nn.bidirectional_dynamic_rnn(lstm_cell_fw_passage, lstm_cell_bw_passage, passage, masks_passage, dtype=tf.float32) # (-1, P, H)

    # Merging both the outputs of the bi-lstm models
    encoded_question = tf.concat(axis = 2, values = encoded_question)
    encoded_passage = tf.concat(axis = 2, values = encoded_passage)

    # outputs beyond sequence lengths are masked with 0s
    return encoded_question, encoded_passage , q_rep, p_rep 

In [16]:
def _reverse(input_, seq_lengths, seq_dim, batch_dim):
    if seq_lengths is not None:
        return array_ops.reverse_sequence(
            input=input_, seq_lengths=seq_lengths,
            seq_dim=seq_dim, batch_dim=batch_dim)
    else:
        return array_ops.reverse(input_, axis=[seq_dim])

In [17]:
# Match LSTM
def run_match_lstm(encoded_rep, masks):
    encoded_question, encoded_passage = encoded_rep
    masks_question, masks_passage = masks
    
    match_lstm_cell_attention_fn = lambda curr_input, state : tf.concat([curr_input, state], axis = -1)
    query_depth = encoded_question.get_shape()[-1]

    with tf.variable_scope("match_lstm_attender"):
        attention_mechanism_match_lstm = tfa.seq2seq.BahdanauAttention(query_depth, encoded_question, memory_sequence_length = masks_question)
        cell = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
        lstm_attender  =  tfa.seq2seq.AttentionWrapper(cell, attention_mechanism_match_lstm, output_attention = False, cell_input_fn = match_lstm_cell_attention_fn)
    
        # we don't mask the passage because masking the memories will be handled by the pointerNet
        reverse_encoded_passage = _reverse(encoded_passage, masks_passage, 1, 0)
    
        output_attender_fw, _ = tf.nn.dynamic_rnn(lstm_attender, encoded_passage, dtype=tf.float32, scope ="rnn")    
        output_attender_bw, _ = tf.nn.dynamic_rnn(lstm_attender, reverse_encoded_passage, dtype=tf.float32, scope = "rnn")
    
        output_attender_bw = _reverse(output_attender_bw, masks_passage, 1, 0)
    
    output_attender = tf.concat([output_attender_fw, output_attender_bw], axis = -1) # (-1, P, 2*H)
    return output_attender

In [18]:
# Answer Pointer
def run_answer_ptr(output_attender, masks, labels):
    #batch_size = tf.shape(output_attender)[0]
    masks_question, masks_passage = masks
    labels = tf.unstack(labels, axis=1) 
    
    answer_ptr_cell_input_fn = lambda curr_input, context : context # independent of question
    query_depth_answer_ptr = output_attender.get_shape()[-1]
    
    with tf.variable_scope("answer_ptr_attender"):
        attention_mechanism_answer_ptr = tfa.seq2seq.BahdanauAttention(query_depth_answer_ptr , output_attender, memory_sequence_length = masks_passage)
    
        # output attention is true because we want to output the attention values
        cell_answer_ptr = tf.nn.rnn_cell.BasicLSTMCell(hidden_size, state_is_tuple = True )
        answer_ptr_attender = tfa.seq2seq.AttentionWrapper(cell_answer_ptr, attention_mechanism_answer_ptr, cell_input_fn = answer_ptr_cell_input_fn)
        logits, _ = tf.nn.static_rnn(answer_ptr_attender, labels, dtype = tf.float32)
    
    return logits

In [19]:
# decoder
def decode(encoded_rep, q_rep, masks, labels):
    """
    takes in a knowledge representation
    and output a probability estimation over
    all paragraph tokens on which token should be
    the start of the answer span, and which should be
    the end of the answer span.

    :param knowledge_rep: it is a representation of the paragraph and question,
                          decided by how you choose to implement the encoder
    :return:
    """
    # Run match-LSTM + Ans-Ptr
    output_attender = run_match_lstm(encoded_rep, masks)
    logits = run_answer_ptr(output_attender, masks, labels)
    
    return logits

In [20]:
# setup_system
encoded_question, encoded_passage, q_rep, p_rep = encode([question,passage], [question_lengths,passage_lengths],encoder_state_input = None)
encoded_rep = encoded_question, encoded_passage
masks = question_lengths,passage_lengths
logits = decode(encoded_rep, q_rep, masks, labels)

# setup_loss
losses= tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[0], labels=labels[:,0])
losses+= tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[1], labels=labels[:,1])
loss = tf.reduce_mean(losses)


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initia