# Imports

In [None]:
import random
import sys
import numpy as np
import tensorflow.compat.v1 as tf_compat_v1
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.compat.v1.nn.rnn_cell import GRUCell, MultiRNNCell
import os
from functools import reduce
from tensorflow.python.platform import gfile
import csv
import time
import pandas as pd

In [None]:
# Run TensorFlow in graph (non-eager) mode: the model below is built from
# tf.compat.v1 placeholders and executed through an explicit Session.
tf_compat_v1.disable_eager_execution()

# Global Parameters

## Hyper-parameters

In [None]:
# Number of words to keep; it is also part of the processed file names
# (e.g. "vocab.<vocab_size>.txt")
vocab_size = 100000

## Model hyperparameters:

# Output space of GRU
hidden_size = 384
# Width of the word vectors; must match the dimension of the pre-trained embedding file
embedding_dim = 200

batch_size = 64
num_epoches = 100
# NOTE(review): not referenced by any code visible in this notebook
dropout_rate = 0.0

# A Boolean to indicate the usage of 2 text encoding layers
two_encoding_layers=False

# A Boolean to indicate the usage of Language Model (e.g., BERT) embeddings
# NOTE(review): not referenced by any code visible in this notebook
LM_embedding = False
# BERT's layer from which embeddings are extracted
# NOTE(review): not referenced by any code visible in this notebook
embedding_type = "First"

# Optimizer name understood by HAN_model.train: "SGD" or "ADAM"
optimizer = "ADAM"
beta1 = 0.9
beta2 = 0.999
learning_rate = 0.001
# Maximum norm applied to each individual gradient (tf.clip_by_norm) during training
grad_clipping = 10
# Lower/upper clipping margin applied to probabilities before the log in the loss.
# Note that 10e-8 is 1e-7, not 1e-8.
_EPSILON = 10e-8

# Parameters used for early stopping, val = validation, acc = accuracy
# They are the initial "best so far" values inside HAN_model.train
last_val_acc = 0.0
last_val_loss = sys.maxsize

# End of Sentence ID; appended after every document sentence and after the question
# when the data files are read
EoS_ID = 2

## Paths to data

In [None]:
# Name of the data set; it prefixes the data directory and every data file name
data_type = "CN"

# Directory that holds the raw data of the selected data set
data_dir = "./data/{}_data/".format(data_type)

# Base names (without extension) of the train / validation / test files
train_file = "{}_train".format(data_type)
valid_file = "{}_valid_2000ex".format(data_type)
test_file = "{}_test_2500ex".format(data_type)

# Sub-directory of 'data_dir' that holds the processed data (e.g., ID files)
output_dir = "processedData"

# Directory of the model check points; the trailing slash is required because
# check point file names are appended to this string directly
weight_path = "./check_points/"

# Pre-trained GloVe vectors
embedding_file = "./embeddings/glove.6B.200d.txt"

# Read Data

This section includes methods to read and prepare data. The methods are adapted from the following repository: https://github.com/cairoHy/attention-sum-reader

In [None]:
def paths_to_processed_data(data_dir, train_file, valid_file, test_file, vocab_size, output_dir):
  """
  Build the paths of the processed (ID) files that belong to the given data files.

  Every path lives in 'data_dir'/'output_dir'. ID files are named
  '<file>.<vocab_size>.id.txt' and the vocabulary file 'vocab.<vocab_size>.txt'.

  Returns:
    (vocab_file, idx_train_file, idx_valid_file, idx_test_file)
  """

  def in_output_dir(file_name):
    return os.path.join(data_dir, output_dir, file_name)

  id_suffix = ".%d.id.txt" % vocab_size
  idx_train_file, idx_valid_file, idx_test_file = [
      in_output_dir(name + id_suffix) for name in (train_file, valid_file, test_file)]
  vocab_file = in_output_dir("vocab.%d.txt" % vocab_size)

  return vocab_file, idx_train_file, idx_valid_file, idx_test_file

def read_processed_cbt_data(file):
  """
  Read a processed (ID) data file.

  The file is consumed in records of 22 lines:
    lines 1-20: document sentences, tokens separated by spaces
    line 21:    "question<TAB>answer<TAB>candidate|candidate|..."
    line 22:    closes the record (its content is ignored)

  'EoS_ID' is appended after every document sentence and after the question.

  Parameters:
    'file': Path to the ID file

  Returns:
    (documents, questions, answers, candidates), four parallel lists with one entry per record:
      documents:  token list of the whole document
      questions:  token list of the question
      answers:    0/1 list aligned with the document, 1 where the token equals the answer
      candidates: candidate list with the correct answer moved to index 0

  Raises:
    ValueError: If the answer of a record is not among its candidates.
  """

  documents, questions, answers, candidates = [], [], [], []

  with gfile.GFile(file, mode="r") as f:
    d, q, a, A = [], [], [], []
    for counter, line in enumerate(f, start=1):

      if counter % 100000 == 0:
        print("Reading line %d in %s" % (counter, file))

      position = counter % 22

      if position == 21:
        # Question line
        tmp = line.strip().split("\t")
        q = tmp[0].split(" ") + [EoS_ID]
        a = [1 if tmp[1] == i else 0 for i in d]

        A = tmp[2].split("|")

        # Put the correct answer first
        A.remove(tmp[1])
        A.insert(0, tmp[1])

      elif position == 0:
        # End of the record: store it and start a new one
        documents.append(d)
        questions.append(q)
        answers.append(a)
        candidates.append(A)

        d, q, a, A = [], [], [], []

      else:
        # Add EoS ID at the end of each sentence
        d_tem = [i for i in line.strip().split(" ") if i != '']
        d.extend(d_tem + [EoS_ID])

  # 'q' is only non-empty between a question line and the line closing its record, so this
  # keeps the last record of a file that ends without that closing line.
  if q:
    documents.append(d)
    questions.append(q)
    answers.append(a)
    candidates.append(A)

  # Length statistics are undefined without any record
  if not documents:
    print("No complete record found in %s." % file)
    return documents, questions, answers, candidates

  d_lens = sorted(len(i) for i in documents)
  q_lens = sorted(len(i) for i in questions)

  print("Document average length: %d." % (sum(d_lens) / len(d_lens)))
  print("Document midden length: %d." % d_lens[len(d_lens) // 2])

  print("Question average length: %d." % (sum(q_lens) / len(q_lens)))
  print("Question midden length: %d." % q_lens[len(q_lens) // 2])
  return documents, questions, answers, candidates

In [None]:
def load_vocab(vocab_file):
  """
  This method loads 'vocab_file'. It returns a 'word_dict' with entries as {word : its ID}

  The ID of a word is the zero-based number of the line it is stored on. If a word occurs
  on several lines, the last line wins.

  Parameters:
    'vocab_file': Path to vocab file (one word per line)

  Raises:
    ValueError: If 'vocab_file' does not exist.
  """

  if not gfile.Exists(vocab_file):
    # Interpolate the path into the message (it used to be passed as a second exception argument)
    raise ValueError("Vocabulary file %s not found." % vocab_file)

  word_dict = {}

  with gfile.GFile(vocab_file, "r") as f:
    for word_id, line in enumerate(f):
      # Line has a single word with trailing new line char which needs to be removed
      word_dict[line.strip()] = word_id

  return word_dict

In [None]:
def gen_embeddings(word_dict, embed_dim, in_file=None, init=np.zeros):
    """
    Create an initialized word vector matrix for the vocabulary. Rows of words that are not found
      in the word vector file keep the value produced by 'init'.

    :param word_dict: Word to id mapping
    :param embed_dim: The dimensions of the word vector.
    :param in_file: Pre-trained word vector file (text, one "word v1 v2 ..." entry per line).
      If empty, the freshly initialized matrix is returned.
    :param init: How to initialize words not found in the pre-training file. Either a
      '(low, high, size)' style function such as 'np.random.uniform', which is called with
      (-0.1, 0.1, shape), or a shape-only function such as 'np.zeros'.
    :return: Word vector matrix of shape (max id + 1, embed_dim)
    :raises ValueError: If the vector dimension of 'in_file' differs from 'embed_dim'.
    """
    num_words = max(word_dict.values(), default=-1) + 1
    shape = (num_words, embed_dim)
    try:
        embedding_matrix = init(-0.1, 0.1, shape)
    except TypeError:
        # Shape-only initializers (e.g. the default np.zeros) reject the (low, high, size) call
        embedding_matrix = init(shape)
    print('Embeddings: %d x %d' % (num_words, embed_dim))

    if not in_file:
        return embedding_matrix

    file_dim = get_dim(in_file)
    if file_dim != embed_dim:
        raise ValueError('Embedding file %s holds %d-dimensional vectors, expected %d.'
                         % (in_file, file_dim, embed_dim))
    print('Loading embedding file: %s' % in_file)

    pre_trained = 0
    with open(in_file, encoding='utf-8') as f:
        for line in f:
            sp = line.split()
            # 'sp' is empty for blank lines
            if sp and sp[0] in word_dict:
                pre_trained += 1
                embedding_matrix[word_dict[sp[0]]] = np.asarray([float(x) for x in sp[1:]], dtype=np.float32)

    coverage = pre_trained * 100.0 / num_words if num_words else 0.0
    print('Pre-trained: %d (%.2f%%)' % (pre_trained, coverage))
    return embedding_matrix


def get_dim(in_file):
    """
    This method gets dimension of stored vectors

    Only the first line of 'in_file' is read; its first token is taken to be the word and
    every remaining token one vector component.

    :param in_file: Path to a word vector file
    :return: Number of whitespace separated tokens on the first line, minus one
    """
    # Close the file handle once the first line is read
    with gfile.GFile(in_file, mode='r') as f:
        line = f.readline()
    return len(line.split()) - 1


def get_max_length(d_bt):
    """
    Return the length of the longest sequence in 'd_bt'.
    """
    return max(map(len, d_bt))

# Model

The model class is based on https://github.com/cairoHy/attention-sum-reader/blob/master/as_reader_tf.py

In [None]:
class HAN_model(object):
    """
    Attention-sum reader with two attention levels, built as a TF1-style (graph mode) model.

    The constructor creates the embedding variable, the GRU cell and the input placeholders,
    'build_model' creates the rest of the graph, and 'train' / 'test' feed padded numpy batches
    through the session handed to the constructor.
    """

    def __init__(self, word_dict, embedding_matrix,
                 d_len, q_len, sess, embedding_dim,
                 hidden_size, weight_path,
                 use_lstm=False, two_encoding_layers=False):
        """
        :param word_dict: Word to id mapping (only stored on the instance).
        :param embedding_matrix: Initial value of the embedding variable, which is not trainable.
        :param d_len: Fixed length every document is padded / truncated to.
        :param q_len: Fixed length every question is padded / truncated to.
        :param sess: Session used to run the graph.
        :param embedding_dim: Width of the word vectors (only used in the log message).
        :param hidden_size: Number of units of the GRU cell.
        :param weight_path: Check point directory. File names are appended to this string
          directly, so it has to end with a path separator.
        :param use_lstm: Accepted but not used by this class.
        :param two_encoding_layers: If True, a second document encoder is run on the attended
          document before the second attention level.
        """

        self.weight_path = weight_path
        self.word_dict = word_dict
        self.vocab_size = len(embedding_matrix)
        self.d_len = d_len
        self.q_len = q_len
        self.sess = sess
        self.two_encoding_layers = two_encoding_layers

        # Number of answer candidates per example
        self.A_len = 10

        # Prepare embeddings
        with tf.device("/cpu:0"):
            self.embedding = tf.Variable(initial_value=embedding_matrix, trainable=False,
                                    name="embedding_matrix_w",
                                    dtype="float32")

        print("Embedding matrix shape:%d x %d" % (len(embedding_matrix), embedding_dim))

        # NOTE(review): this single GRUCell instance is passed as both the forward and the
        # backward cell of every encoder in 'build_model' - confirm that sharing it is intended.
        self.text_encoder = GRUCell(num_units=hidden_size)

        # Model input and output
        self.q_input = tf_compat_v1.placeholder(dtype=tf.int32, shape=(None, self.q_len), name="q_input")
        self.d_input = tf_compat_v1.placeholder(dtype=tf.int32, shape=(None, self.d_len), name="d_input")
        # NOTE(review): 'context_mask_bt' is fed by 'train' / 'test' but not consumed by the graph
        # built in 'build_model'.
        self.context_mask_bt = tf_compat_v1.placeholder(dtype=tf.float32, shape=(None, self.d_len), name="context_mask_bt")
        self.candidates_bi = tf_compat_v1.placeholder(dtype=tf.int32, shape=(None, self.A_len), name="candidates_bi")
        self.y_true = tf_compat_v1.placeholder(shape=(None, self.A_len), dtype=tf.float32, name="y_true")

    def att_dot(self, x):
        """
        Attention point multiplication function

        :param x: Pair (d_btf, q_bf) of the per-token document encoding and the question encoding.
        :return: Dot product of the question with every document position, reshaped to
          (-1, self.d_len).
        """
        d_btf, q_bf = x

        # 'res' shape = (None, 1, max_d_length)
        res = K.batch_dot(tf.expand_dims(q_bf, -1), d_btf, (1, 2))
        return tf.reshape(res, [-1, self.d_len])

    # Attention-sum process
    def sum_prob_of_word(self, word_ix, sentence_ixs, sentence_attention_probs):
        """Sum the attention probabilities of all positions of one document that hold 'word_ix'."""
        word_ixs_in_sentence = tf.where(tf.equal(sentence_ixs, word_ix))
        return tf.reduce_sum(tf.gather(sentence_attention_probs, word_ixs_in_sentence))

    # noinspection PyUnusedLocal
    def sum_probs_single_sentence(self, prev, cur):
        """
        Scan step for one example: compute the summed probability of each of its candidates.

        :param prev: Previous scan output (unused, required by 'tf.scan').
        :param cur: Triple (candidate ids, document ids, attention probabilities) of one example.
        """
        candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t = cur
        result = tf.scan(
            fn=lambda previous, x: self.sum_prob_of_word(x, sentence_ixs_t, sentence_attention_probs_t),
            elems=[candidate_indices_i],
            initializer=tf.constant(0., dtype="float32"))
        return result

    def sum_probs_batch(self, candidate_indices_bi, sentence_ixs_bt, sentence_attention_probs_bt):
        """Apply 'sum_probs_single_sentence' to every example of the batch."""
        result = tf.scan(
            fn=self.sum_probs_single_sentence,
            elems=[candidate_indices_bi, sentence_ixs_bt, sentence_attention_probs_bt],
            initializer=tf.Variable([0] * self.A_len, dtype="float32"))
        return result

    def build_model(self):
        """
        Build the graph: encoders, two attention levels, attention sum, loss, accuracy and saver.

        Creates, among others, 'self.y_hat' (summed probability per candidate), 'self.loss',
        'self.correct_prediction' (number of correct examples in the fed batch) and 'self.saver'.
        """

        # Sequence lengths = number of non-zero ids per row:
        # 'y = sign(x) = -1 if x < 0; 0 if x == 0; 1 if x > 0.', 'tf.abs' = absolute
        d_lens = tf.reduce_sum(tf.sign(tf.abs(self.d_input)), 1)
        q_lens = tf.reduce_sum(tf.sign(tf.abs(self.q_input)), 1)

        # Query encoder
        with tf_compat_v1.variable_scope('q_encoder', initializer=tf.initializers.GlorotUniform(), reuse=tf_compat_v1.AUTO_REUSE):

            # output shape: (None, max_q_length, embedding_dim), max_q_length = max query length
            q_embed = tf.nn.embedding_lookup(self.embedding, self.q_input)

            q_cell = MultiRNNCell(cells=[self.text_encoder] * 1)

            # 'bidirectional_dynamic_rnn' takes the input and builds independent forward and
            # backward RNNs. The input_size of forward and backward cell must match. The initial
            # state for both directions is zero by default (but can be set optionally) and no
            # intermediate states are ever returned -- the network is fully unrolled for the given
            # (passed in) length(s) of the sequence(s) or completely unrolled if length(s) is not
            # given.
            #
            # Outputs:
            # A tuple (outputs, output_states) where: outputs: A tuple (output_fw, output_bw), and
            # output_states: A tuple (output_state_fw, output_state_bw) containing the forward and
            # the backward final states of bidirectional rnn.
            outputs, last_states = tf_compat_v1.nn.bidirectional_dynamic_rnn(cell_bw=q_cell,
                                                                   cell_fw=q_cell,
                                                                   dtype="float32",
                                                                   sequence_length=q_lens,
                                                                   inputs=q_embed,
                                                                   swap_memory=True)
            # 'q_encode' output shape: (None, hidden_size * 2)
            # Concatenate along the last axis so that entries at 'last_states[0]'
            # get concatenated with the corresponding entries at 'last_states[1]'
            # see https://www.tensorflow.org/api_docs/python/tf/concat
            self.q_encode = tf.concat([last_states[0][-1], last_states[1][-1]], axis=-1)

        # Level 1 (L1) document encoder
        with tf_compat_v1.variable_scope('d_encoder_L1', initializer=tf.initializers.GlorotUniform(), reuse=tf_compat_v1.AUTO_REUSE):

            # output shape: (None, max_d_length, embedding_dim),  max_d_length = max document length
            d_embed_L1 = tf.nn.embedding_lookup(self.embedding, self.d_input)

            d_cell_L1 = MultiRNNCell(cells=[self.text_encoder] * 1)
            outputs_L1, last_states_L1 = tf_compat_v1.nn.bidirectional_dynamic_rnn(cell_bw=d_cell_L1,
                                                                   cell_fw=d_cell_L1,
                                                                   dtype="float32",
                                                                   sequence_length=d_lens,
                                                                   inputs=d_embed_L1,
                                                                   swap_memory=True)

            # 'd_encode_L1' output shape: (None, max_d_length, hidden_size * 2)
            self.d_encode_L1 = tf.concat(outputs_L1, axis=-1)

        # Level 1 (L1) attention
        with tf_compat_v1.variable_scope('attention_L1', reuse=tf_compat_v1.AUTO_REUSE):

            # Output shape = (None, max_d_length)
            self.attention_L1 = self.att_dot([self.d_encode_L1, self.q_encode])
            self.attention_softmax_L1 = tf.nn.softmax(logits=self.attention_L1, name="attention_softmax_L1")

            # Compute attended document
            # Shape = (None, max_d_length, hidden_size * 2)
            self.attented_doc = tf.multiply(tf.expand_dims(self.attention_softmax_L1, -1), self.d_encode_L1, name="attented_doc")

        if(self.two_encoding_layers):
            # Level 2 (L2) document encoder
            with tf_compat_v1.variable_scope('d_encoder_L2', initializer=tf.initializers.GlorotUniform(), reuse=tf_compat_v1.AUTO_REUSE):

                d_cell_L2 = MultiRNNCell(cells=[self.text_encoder] * 1)
                outputs_L2, last_states_L2 = tf_compat_v1.nn.bidirectional_dynamic_rnn(cell_bw=d_cell_L2,
                                                                      cell_fw=d_cell_L2,
                                                                      dtype="float32",
                                                                      sequence_length=d_lens,
                                                                      inputs=self.attented_doc,
                                                                      swap_memory=True)

                # 'd_encode_L2' output shape: (None, max_d_length, hidden_size * 2)
                self.d_encode_L2 = tf.concat(outputs_L2, axis=-1)

        # Level 2 (L2) attention
        with tf_compat_v1.variable_scope('attention_L2', reuse=tf_compat_v1.AUTO_REUSE):

            # Output shape = (None, max_d_length)
            self.attention_L2 = self.att_dot([self.d_encode_L2, self.q_encode]) if self.two_encoding_layers else self.att_dot([self.attented_doc, self.q_encode])

            self.attention_softmax_L2 = tf.nn.softmax(logits=self.attention_L2, name="softmax_attention_L2")

            # Output shape = (None, max_d_length)
            self.last_prob = tf.multiply(self.attention_softmax_L2, self.attention_softmax_L1, name="last_prob")

        # Attention summation
        # Output shape = (None, i) where i = max_candidate_length = 10
        self.y_hat = self.sum_probs_batch(self.candidates_bi, self.d_input, self.last_prob)

        # Cross entropy loss function: normalize the candidate scores to a distribution first
        output = self.y_hat / tf.reduce_sum(self.y_hat,
                                            axis=len(self.y_hat.get_shape()) - 1,
                                            keepdims=True)

        # Compute crossentropy; clipping keeps the log finite
        epsilon = tf.convert_to_tensor(_EPSILON, output.dtype.base_dtype, name="epsilon")
        output = tf.clip_by_value(output, epsilon, 1. - epsilon)
        self.loss = tf.reduce_mean(- tf.reduce_sum(self.y_true * tf.math.log(output),
                                                   axis=len(output.get_shape()) - 1))

        # Calculate accuracy: number of examples whose best candidate is the true one
        self.correct_prediction = tf.reduce_sum(tf.sign(tf.cast(tf.equal(tf.argmax(self.y_hat, 1),
                                                                         tf.argmax(self.y_true, 1)), "float")))
        # Model serialization tool
        self.saver = tf_compat_v1.train.Saver()

    def train(self, train_data, valid_data, batch_size, epochs, opt_name, lr, beta1, beta2, grad_clip):
        """
        Model training

        Trains for 'epochs' passes over the full batches of 'train_data'. Every 200 steps the
        full batches of 'valid_data' are evaluated. A check point is saved whenever validation
        accuracy or validation loss improves. Training returns early after 5 consecutive
        evaluations without improvement.

        :param train_data: Tuple (documents, questions, answers, candidates).
        :param valid_data: Tuple (documents, questions, answers, candidates).
        :param batch_size: Examples per batch; examples that do not fill a batch are not used.
        :param epochs: Number of passes over the training batches.
        :param opt_name: "SGD" or "ADAM".
        :param lr: Learning rate.
        :param beta1: Adam beta1 (ignored for SGD).
        :param beta2: Adam beta2 (ignored for SGD).
        :param grad_clip: Maximum norm of each individual gradient.
        :raises NotImplementedError: For any other 'opt_name'.
        """
        # Preprocessing the input
        questions_ok, documents_ok, context_mask, candidates_ok, y_true = self.preprocess_input_sequences(train_data)
        v_questions, v_documents, v_context_mask, v_candidates, v_y_true = self.preprocess_input_sequences(valid_data)

        # Define the optimization method of the model
        if opt_name == "SGD":
            optimizer = tf_compat_v1.train.GradientDescentOptimizer(learning_rate=lr)
        elif opt_name == "ADAM":
            optimizer = tf_compat_v1.train.AdamOptimizer(learning_rate=lr, beta1=beta1, beta2=beta2)
        else:
            raise NotImplementedError("Other Optimizer Not Implemented.-_-||")

        # Gradient clipping
        grad_vars = optimizer.compute_gradients(self.loss)
        grad_vars = [
            (tf.clip_by_norm(grad, grad_clip), var)
            if grad is not None else (grad, var)
            for grad, var in grad_vars]
        train_op = optimizer.apply_gradients(grad_vars)
        self.sess.run(tf_compat_v1.global_variables_initializer())

        # Load a previously trained model
        self.load_weight()

        # early stopping parameter
        best_val_loss, best_val_acc, patience, lose_times = last_val_loss, last_val_acc, 5, 0

        # Start training
        corrects_in_epoch, loss_in_epoch = 0, 0
        batch_num, v_batch_num = len(questions_ok) // batch_size, len(v_questions) // batch_size
        batch_idx, v_batch_idx = np.random.permutation(batch_num), np.arange(v_batch_num)
        print("Train on {} batches, {} samples per batch.".format(batch_num, batch_size))
        print("Validate on {} batches, {} samples per batch.".format(v_batch_num, batch_size))

        for step in range(batch_num * epochs):
            # Start of an Epoch, reset the running statistics, output log and shuffle
            if step % batch_num == 0:
                corrects_in_epoch, loss_in_epoch = 0, 0
                print("--------Epoch : {}".format(step // batch_num + 1))
                np.random.shuffle(batch_idx)

            # Get the data for the next batch
            _slice = np.index_exp[
                     batch_idx[step % batch_num] * batch_size:(batch_idx[step % batch_num] + 1) * batch_size]
            data = {self.q_input: questions_ok[_slice],
                    self.d_input: documents_ok[_slice],
                    self.context_mask_bt: context_mask[_slice],
                    self.candidates_bi: candidates_ok[_slice],
                    self.y_true: y_true[_slice]}

            # Train, update parameters, output current accuracy of Epoch
            loss_, _, corrects_in_batch = self.sess.run([self.loss, train_op, self.correct_prediction],
                                                        feed_dict=data)

            corrects_in_epoch += corrects_in_batch
            loss_in_epoch += loss_ * batch_size
            nums_in_epoch = (step % batch_num + 1) * batch_size
            print("Trained samples in this epoch : {}".format(nums_in_epoch))
            print("Step : {}/{}.\nLoss : {:.4f}.\nAccuracy : {:.4f}".format(step % batch_num,
                                                                                   batch_num,
                                                                                   loss_in_epoch / nums_in_epoch,
                                                                                   corrects_in_epoch / nums_in_epoch))

            # Save the model every 200 steps and use the validation set to calculate the accuracy rate and determine whether it is early stop
            if step % 200 == 0 and step != 0:
                # Due to insufficient GPU memory, it is still calculated as batch
                val_num, val_corrects, v_loss = 0, 0, 0
                for i in range(v_batch_num):
                    start = v_batch_idx[i % v_batch_num] * batch_size
                    stop = (v_batch_idx[i % v_batch_num] + 1) * batch_size
                    _v_slice = np.index_exp[start:stop]
                    v_data = {self.q_input: v_questions[_v_slice],
                              self.d_input: v_documents[_v_slice],
                              self.context_mask_bt: v_context_mask[_v_slice],
                              self.candidates_bi: v_candidates[_v_slice],
                              self.y_true: v_y_true[_v_slice]}
                    loss_, v_correct = self.sess.run([self.loss, self.correct_prediction], feed_dict=v_data)
                    val_num = val_num + batch_size
                    val_corrects = val_corrects + v_correct
                    v_loss = v_loss + loss_ * batch_size

                val_acc = val_corrects / val_num
                val_loss = v_loss / val_num
                print("Val acc : {:.4f}".format(val_acc))
                print("Val Loss : {:.4f}".format(val_loss))

                if val_acc > best_val_acc or val_loss < best_val_loss:
                    # Save a better model
                    lose_times = 0

                    best_val_loss = val_loss if val_loss < best_val_loss else best_val_loss
                    best_val_acc = val_acc if val_acc > best_val_acc else best_val_acc
                    path = self.saver.save(self.sess,
                                           self.weight_path + \
                                           'machine_reading-val_acc-{:.4f}-val_loss-{:.4f}.model'.format(val_acc, val_loss),
                                           global_step=step)
                    print("Save model to {}.".format(path))

                else:
                    lose_times += 1
                    print("Lose_time/Patience : {}/{} .".format(lose_times, patience))
                    if lose_times >= patience:
                        print("Oh u, stop training.")
                        # Return instead of exiting the interpreter, so the caller
                        # (e.g. the testing cell of a notebook) can keep using the model.
                        return

    def test(self, test_data, batch_size):
        """
        Evaluate the latest check point on the full batches of 'test_data'.

        :param test_data: Tuple (documents, questions, answers, candidates).
        :param batch_size: Examples per batch; examples that do not fill a batch are not used.
        :return: (per-batch accuracies, overall accuracy, per-batch run time divided by batch_size)
        """

        questions_ok, documents_ok, context_mask, candidates_ok, y_true = self.preprocess_input_sequences(test_data)
        print("Test on {} samples, {} per batch.".format(len(questions_ok), batch_size))

        # Load a previously trained model
        self.load_weight()

        # Accuracy and timing of every batch
        Accuracies = []
        delays = []

        # Testing
        batch_num = len(questions_ok) // batch_size
        batch_idx = np.arange(batch_num)
        correct_num, total_num = 0, 0
        for i in range(batch_num):
            start = batch_idx[i % batch_num] * batch_size
            stop = (batch_idx[i % batch_num] + 1) * batch_size
            _slice = np.index_exp[start:stop]
            data = {self.q_input: questions_ok[_slice],
                    self.d_input: documents_ok[_slice],
                    self.context_mask_bt: context_mask[_slice],
                    self.candidates_bi: candidates_ok[_slice],
                    self.y_true: y_true[_slice]}

            tic = time.time()
            correct, = self.sess.run([self.correct_prediction], feed_dict=data)
            elapsed = time.time() - tic
            delays.append(elapsed / batch_size)

            correct_num, total_num = correct_num + correct, total_num + batch_size
            Accuracies.append(correct / batch_size)

        test_acc = correct_num / total_num
        print("Test accuracy is : {:.5f}".format(test_acc))

        return Accuracies, test_acc, delays

    def load_weight(self):
        """Restore the latest check point found in 'self.weight_path', if there is one."""

        ckpt = tf.train.get_checkpoint_state(self.weight_path)
        if ckpt is not None:
            print("Load model from {}.".format(ckpt.model_checkpoint_path))
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print("No previous models.")

    @staticmethod
    def union_shuffle(data):
        """
        Shuffle the four parallel sequences of 'data' with one common permutation.

        :param data: Tuple (documents, questions, answers, candidates).
        :return: Iterator yielding the four shuffled sequences as tuples.
        """
        d, q, a, A = data
        c = list(zip(d, q, a, A))
        random.shuffle(c)
        return zip(*c)

    def preprocess_input_sequences(self, data, shuffle=True):
        """
        Preprocessing input as:
          shuffle
          PAD To a fixed-length sequence
          y_true is a vector of length self.A_len, index = 0 is the correct answer, and one-hot encoding

        :param data: Tuple (documents, questions, answers, candidates).
        :param shuffle: Whether to shuffle the examples first.
        :return: (questions, documents, context mask, candidates, y_true) as numpy arrays.
        """
        documents, questions, answer, candidates = self.union_shuffle(data) if shuffle else data
        d_lens = [len(i) for i in documents]

        # Sequences longer than the fixed lengths lose their tail (truncating="post")
        questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post")
        documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post")
        context_mask = K.eval(tf.sequence_mask(d_lens, self.d_len, dtype=tf.float32))
        candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post")
        # The data reader puts the correct answer first, so the target is always candidate 0
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, context_mask, candidates_ok, y_true

# Run

## Data Preparation

In [None]:
# Resolve the paths of the vocabulary file and of the ID files of all three splits
vocab_file, idx_train_file, idx_valid_file, idx_test_file = paths_to_processed_data(
    data_dir=data_dir,
    train_file=train_file,
    valid_file=valid_file,
    test_file=test_file,
    vocab_size=vocab_size,
    output_dir=output_dir,
)

In [None]:
# Read the training (t_), validation (v_) and test (test_) splits from their ID files
t_documents, t_questions, t_answer, t_candidates = read_processed_cbt_data(idx_train_file)
v_documents, v_questions, v_answers, v_candidates = read_processed_cbt_data(idx_valid_file)
test_documents, test_questions, test_answers, test_candidates = read_processed_cbt_data(idx_test_file)

# The fixed model input lengths are taken from the training split only; longer validation or
# test sequences are truncated at the end when they are padded inside the model class.
d_len = get_max_length(t_documents)
q_len = get_max_length(t_questions)

Reading line 100000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 200000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 300000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 400000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 500000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 600000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 700000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 800000 in /content/drive/My Drive/question-answering-data/CNN_data/processedData/CNN_train.100000.id.txt
Reading line 900000 in /content/drive/My Drive/question-

## Get embeddings

In [None]:
# Initialize the word vector matrix, using a random uniform distribution in the interval (-0.1,0.1)
word_dict = load_vocab(vocab_file)
embedding_matrix = gen_embeddings(word_dict,
                                              embedding_dim,
                                              embedding_file,
                                              init=np.random.uniform)

Embeddings: 41964 x 200
Loading embedding file: /content/drive/My Drive/question-answering-embeddings/glove.6B.200d.txt
Pre-trained: 37427 (89.19%)


## Define Model

In [None]:
# Session that owns the graph variables; the model keeps a reference to it
sess = tf_compat_v1.Session()
# 'two_encoding_layers' has to be passed by keyword: the ninth positional parameter of
# HAN_model is 'use_lstm', so a positional value never reached 'two_encoding_layers'.
model = HAN_model(word_dict, embedding_matrix, d_len, q_len, sess,
                  embedding_dim, hidden_size,
                  weight_path,
                  two_encoding_layers=two_encoding_layers)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Embedding matrix shape:41964 x 200
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [None]:
# Build the computation graph (encoders, attention, loss, accuracy and the check point saver)
model.build_model()

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


## Training

In [None]:
# Train with the global hyper-parameters; check points are written to 'weight_path' whenever
# the validation accuracy or loss improves
model.train(train_data=(t_documents, t_questions, t_answer, t_candidates),
                    valid_data=(v_documents, v_questions, v_answers, v_candidates),
                    batch_size=batch_size,
                    epochs=num_epoches,
                    opt_name=optimizer,
                    lr=learning_rate,
                    beta1=beta1,
                    beta2=beta2,
                    grad_clip=grad_clipping)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loss : 7.6879.
Accuracy : 0.4210
Trained samples in this epoch : 29888
Step : 466/794.
Loss : 7.6880.
Accuracy : 0.4210
Trained samples in this epoch : 29952
Step : 467/794.
Loss : 7.6863.
Accuracy : 0.4210
Trained samples in this epoch : 30016
Step : 468/794.
Loss : 7.6859.
Accuracy : 0.4211
Trained samples in this epoch : 30080
Step : 469/794.
Loss : 7.6806.
Accuracy : 0.4215
Trained samples in this epoch : 30144
Step : 470/794.
Loss : 7.6840.
Accuracy : 0.4214
Trained samples in this epoch : 30208
Step : 471/794.
Loss : 7.6819.
Accuracy : 0.4215
Trained samples in this epoch : 30272
Step : 472/794.
Loss : 7.6788.
Accuracy : 0.4217
Trained samples in this epoch : 30336
Step : 473/794.
Loss : 7.6776.
Accuracy : 0.4217
Trained samples in this epoch : 30400
Step : 474/794.
Loss : 7.6808.
Accuracy : 0.4215
Trained samples in this epoch : 30464
Step : 475/794.
Loss : 7.6813.
Accuracy : 0.4215
Trained samples in this epoch : 

## Testing

In [None]:
# Measure the wall-clock time of the whole test run (this includes the preprocessing and
# check point loading done inside 'model.test')
start_t = time.time()
batch_accuracies, test_accuracy, batch_delays = model.test((test_documents, test_questions, test_answers, test_candidates), batch_size)
# From here on 'start_t' holds the elapsed seconds, which the cell displays
start_t = time.time() - start_t
start_t