<a href="https://colab.research.google.com/github/CorentinMAG/NLP/blob/main/BIDAF/bidaf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A very good explanation of the BiDAF architecture:

https://towardsdatascience.com/the-definitive-guide-to-bi-directional-attention-flow-d0e96e9e666b

Character embedding with a CNN:

https://towardsdatascience.com/besides-word-embedding-why-you-need-to-know-character-embedding-6096a34a3b10
https://github.com/makcedward/nlp/blob/master/sample/nlp-character_embedding.ipynb

To run this notebook, you must first run the `bidaf_preprocessing` notebook.  
You must also update all file paths to match your own environment.

In [None]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.layers import Layer, Softmax, Concatenate, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.backend import batch_dot
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
tf.get_logger().setLevel('INFO')
from tqdm import tqdm
import numpy as np
import pickle
import nltk
nltk.download('punkt')
import gensim.downloader as gloader
import math

# SQUAD_dataset lives in the project's utils package. When the notebook is not
# started from the project folder (e.g. on Colab with Google Drive mounted),
# the first import fails, so the project folder is added to sys.path and the
# import is retried.
try:
  from utils.datasets import SQUAD_dataset
except ImportError:
  # only a failed import triggers the fallback; any other error (for example a
  # KeyboardInterrupt) is no longer silently swallowed
  import sys
  sys.path.append(os.path.join(os.getcwd(),'drive/MyDrive/NLP/BIDAF'))
  from utils.datasets import SQUAD_dataset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Restore the word-level and character-level tokenizers from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load tokenizer files that you produced yourself.
utils_dir = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils')

path_word_tokenizer = os.path.join(utils_dir, 'tokenizers/word_tokenizer.pkl')
path_char_tokenizer = os.path.join(utils_dir, 'tokenizers/char_tokenizer.pkl')

with open(path_word_tokenizer, 'rb') as handle:
    tokenizer = pickle.load(handle)

with open(path_char_tokenizer, 'rb') as char_handle:
    char_tokenizer = pickle.load(char_handle)

In [None]:
# Load the training and validation datasets from their .pkl files
# (paths are relative to the current working directory).
# Both objects are iterated batch by batch in the training loop and yield
# (inputs, labels) pairs.
train_dataset = SQUAD_dataset.from_file('drive/MyDrive/NLP/BIDAF/utils/datasets/train_dataset.pkl')
valid_dataset = SQUAD_dataset.from_file('drive/MyDrive/NLP/BIDAF/utils/datasets/valid_dataset.pkl')

In [None]:
# notebook display: shows the dataset summary (shapes of one batch)
train_dataset

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15)

In [None]:
# len() of each dataset; the training value is also passed to tqdm as the
# total number of iterations per epoch
print(len(train_dataset))
len(valid_dataset)

6961


1742

In [None]:
# Hyper-parameters shared by the embedding builders, the model and the training loop.
# presumably these must match the values used by the preprocessing notebook - verify before changing
QUESTION_MAXLEN = 25 # length of a (padded) question, in tokens
CONTEXT_MAXLEN = 400 # length of a (padded) context, in tokens
EMBEDDING_SIZE = 300 # GloVe vector size; other sizes (50, 100, 200) or word2vec / fastText vectors could be tried instead
WORD_VOCAB_LEN = len(tokenizer.word_index) + 1 # +1 for the pad token
BATCH_SIZE = 10 # used by print_predictions to draw a random sample inside a batch
EPOCHS = 10 # number of passes over the training set
CHAR_VOCAB_LEN = char_tokenizer.num_words # PAD token and UNK token included
WORD_MAXLEN = 15 # presumably the number of characters kept per word - TODO confirm
LR = 0.0005 # learning rate given to the optimizer
N_FILTERS = EMBEDDING_SIZE # number of filters of the character CNN, set equal to the word-embedding size
FILTER_SIZE = 3 # width of the character CNN filters
CHAR_EMBEDDING_SIZE = 8 # size of each learned character vector

In [None]:
def download_glove_embedding(embedding_dimension = 50):
    """Load the 'glove-wiki-gigaword-<dimension>' vectors through the gensim downloader.

    Args:
        embedding_dimension: size of the GloVe vectors, inserted into the
            model name handed to ``gloader.load``.

    Returns:
        Whatever ``gloader.load`` returns for that model name.

    Raises:
        ValueError: re-raised from ``gloader.load`` after printing a hint
            listing the sizes 50, 100, 200 and 300.
    """
    model_name = f'glove-wiki-gigaword-{embedding_dimension}'
    try:
        return gloader.load(model_name)
    except ValueError as err:
        # remind the user which sizes can be requested, then let the error propagate
        print('Glove: 50, 100, 200, 300')
        raise err

def build_embedding_matrix(tokenizer, glove_model = None):
    """Build the word-embedding lookup table for the tokenizer vocabulary.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``.
    Words known to the GloVe model get their pre-trained vector; every other
    word gets a random vector drawn from a standard normal distribution.
    Rows without a word (such as index 0, the padding index) stay at zero.

    Args:
        tokenizer: object exposing a ``word_index`` mapping (word -> index).
        glove_model: pre-loaded embedding model. When ``None`` the model of
            size ``EMBEDDING_SIZE`` is downloaded first.

    Returns:
        A numpy array of shape ``(WORD_VOCAB_LEN, EMBEDDING_SIZE)``.
    """
    if glove_model is None:
        glove_model = download_glove_embedding(EMBEDDING_SIZE)

    embedding_matrix = np.zeros((WORD_VOCAB_LEN, EMBEDDING_SIZE))

    for word, index in tokenizer.word_index.items():
        # `word in model` is used instead of `model.vocab`, because the
        # `vocab` attribute is not available in gensim >= 4
        if word in glove_model:
            embedding_matrix[index, :] = glove_model.get_vector(word)
        else:
            # out-of-vocabulary word: random initialisation
            embedding_matrix[index, :] = np.random.randn(EMBEDDING_SIZE)

    return embedding_matrix

def build_char_embedding_matrix(char_tokenizer):
    """Build a one-hot lookup table for the characters known to ``char_tokenizer``.

    The table has one row per character index (row 0 is left at zero) and one
    column per non-zero index: the row of index ``i`` has a single 1 in
    column ``i - 1``.

    Args:
        char_tokenizer: object exposing ``num_words`` (vocabulary size, may be
            ``None``) and a ``word_index`` mapping (character -> index).

    Returns:
        A numpy array of shape ``(vocab_len, vocab_len - 1)`` where
        ``vocab_len`` is ``char_tokenizer.num_words``, or
        ``len(char_tokenizer.word_index) + 1`` when ``num_words`` is ``None``.
    """
    # the size is derived from the tokenizer instead of a hard-coded 199
    vocab_len = char_tokenizer.num_words
    if vocab_len is None:
        vocab_len = len(char_tokenizer.word_index) + 1

    char_embedding_matrix = np.zeros((vocab_len, vocab_len - 1))

    for index in char_tokenizer.word_index.values():
        # indices outside the table are skipped; filtering (rather than
        # stopping at the first large index) does not rely on dict ordering
        if 0 < index < vocab_len:
            char_embedding_matrix[index, index - 1] = 1

    return char_embedding_matrix

We build the embedding matrix.  
We can also initialize a char_embedding_matrix, or we can let the model learn these embeddings.

In [None]:
# word-level lookup table, later handed to the BIDAF model
embedding_matrix = build_embedding_matrix(tokenizer)

# Alternative for the character level: instead of letting the model learn the
# character embeddings, a fixed matrix could be supplied (one-hot, GloVe-based
# or randomly filled). Such embeddings should be trainable.
# https://github.com/minimaxir/char-embeddings
#char_embedding_matrix = build_char_embedding_matrix(char_tokenizer)



Then we define all layers of our model

In [None]:
# utils/layers
class WordEmbedding(Layer):
    """Word-level embedding lookup, optionally initialised from a pre-computed matrix.

    Thin wrapper around a Keras ``Embedding`` layer.

    Args:
        input_dim: vocabulary size, forwarded to ``Embedding``.
        output_dim: size of each word vector, forwarded to ``Embedding``.
        input_len: forwarded to ``Embedding`` as ``input_length``.
        embedding_matrix: initial lookup table, forwarded as
            ``weights = [embedding_matrix]``. May be ``None`` (for instance
            when the layer is rebuilt with ``from_config``); the table then
            starts from the ``Embedding`` default initialisation and the real
            weights have to be restored separately.
        trainable: whether the lookup table is updated during training.
        mask_zero: forwarded to ``Embedding``.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, input_dim, output_dim, input_len, embedding_matrix = None, trainable = False, mask_zero = True, **kwargs):

        super(WordEmbedding, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_len = input_len
        self.embedding_matrix = embedding_matrix
        self.trainable = trainable
        self.mask_zero = mask_zero

        embedding_kwargs = dict(
            input_dim = self.input_dim,
            output_dim = self.output_dim,
            trainable = self.trainable,
            input_length = self.input_len,
            mask_zero = self.mask_zero,
        )
        # initial weights are only supplied when a matrix was actually given
        if self.embedding_matrix is not None:
            embedding_kwargs['weights'] = [self.embedding_matrix]

        self.word_embed = Embedding(**embedding_kwargs)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Return the word vectors for the integer token ids in ``inputs``."""
        return self.word_embed(inputs)

    # implement this method in order to get a serializable layer as part of a Functional model
    def get_config(self):
        """Return the constructor arguments, except the embedding matrix.

        The matrix is deliberately left out of the config; ``from_config``
        therefore rebuilds the layer without initial weights.
        """
        # the base Layer class takes some keyword arguments like name and dtype, it is good to include
        # them in the config (so we call the parent method and use the update method)
        config = super().get_config().copy()
        config.update({
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'input_len': self.input_len,
            'trainable': self.trainable,
            'mask_zero': self.mask_zero
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output (without initial weights)."""
        return cls(**config)

In [None]:
# utils/layers
class CharEmbedding(Layer):
    """Learned character embedding applied to every word of a sequence.

    Args:
        input_dim: character vocabulary size, forwarded to ``Embedding``.
        output_dim: size of each character vector, forwarded to ``Embedding``.
        input_len: forwarded to ``Embedding`` as ``input_length``.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, input_dim, output_dim, input_len, **kwargs):
        super().__init__(**kwargs)

        self.input_dim, self.output_dim, self.input_len = input_dim, output_dim, input_len

        # a single lookup table ...
        self.char_embed = Embedding(
            input_dim = input_dim,
            output_dim = output_dim,
            input_length = input_len,
        )
        # ... wrapped in TimeDistributed so that the same Embedding is applied
        # to every temporal slice (index 1) of the input independently
        self.timed = TimeDistributed(self.char_embed)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Return the character vectors for the integer character ids in ``inputs``."""
        embedded = self.timed(inputs)
        return embedded

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        own_arguments = {
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'input_len': self.input_len,
        }
        return {**super().get_config(), **own_arguments}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class CharCNN(Layer):
    """Convolution over the characters of each word followed by max-pooling.

    A ``Conv1D`` is applied to the input, then a ``GlobalMaxPooling1D``
    wrapped in ``TimeDistributed`` pools the result.

    Args:
        n_filters: number of convolution filters.
        filter_width: width of each convolution filter.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, n_filters, filter_width, **kwargs):
        super().__init__(**kwargs)

        self.n_filters, self.filter_width = n_filters, filter_width

        self.conv = Conv1D(n_filters, filter_width)
        self.pool = GlobalMaxPooling1D()
        self.timed = TimeDistributed(self.pool)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Convolve ``inputs`` and max-pool the resulting feature maps."""
        feature_maps = self.conv(inputs)
        return self.timed(feature_maps)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        own_arguments = {
            'n_filters': self.n_filters,
            'filter_width': self.filter_width,
        }
        return {**super().get_config(), **own_arguments}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class HighwayNetwork(Layer):
    """Single highway layer: a learned gate mixes a transformed input with the raw input.

    Args:
        hidden_size: number of units of both dense layers. The mixing in
            ``call`` multiplies and adds these outputs with the raw input,
            so the last input dimension must be compatible with it.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        # non-linear transformation of the input
        self.normal = Dense(hidden_size, activation = 'relu')
        # gate in [0, 1] deciding how much of the transformation is kept
        self.transform_gate = Dense(hidden_size, activation = 'sigmoid')

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Return ``gate * transformed + (1 - gate) * inputs``."""
        transformed = self.normal(inputs)
        gate = self.transform_gate(inputs)
        return gate*transformed + (1-gate)*inputs

    def get_config(self):
        """Return the base layer config extended with ``hidden_size``."""
        return {**super().get_config(), 'hidden_size': self.hidden_size}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class ContextualEmbedding(Layer):
    """Bidirectional LSTM returning one output per time step.

    Args:
        output_dim: number of units of the LSTM wrapped in ``Bidirectional``.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)

        self.output_dim = output_dim
        recurrent = LSTM(output_dim, return_sequences = True, dropout = 0.2)
        self.contextual = Bidirectional(recurrent)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Run the bidirectional LSTM over ``inputs``."""
        encoded = self.contextual(inputs)
        return encoded

    def get_config(self):
        """Return the base layer config extended with ``output_dim``."""
        return {**super().get_config(), 'output_dim': self.output_dim}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class Modelling(Layer):
    """Two stacked bidirectional LSTMs, each returning one output per time step.

    Args:
        output_dim: number of units of each LSTM wrapped in ``Bidirectional``.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)

        self.output_dim = output_dim
        self.modelling1 = Bidirectional(LSTM(output_dim, return_sequences = True, dropout = 0.2))
        self.modelling2 = Bidirectional(LSTM(output_dim, return_sequences = True, dropout = 0.2))

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Feed ``inputs`` through the first, then the second bidirectional LSTM."""
        first_pass = self.modelling1(inputs)
        second_pass = self.modelling2(first_pass)
        return second_pass

    def get_config(self):
        """Return the base layer config extended with ``output_dim``."""
        return {**super().get_config(), 'output_dim': self.output_dim}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class Start(Layer):
    """Output head producing the start-position distribution.

    A bias-free linear ``Dense(1)`` scores every position, ``Dropout(0.2)`` is
    applied to the scores, axis 2 is squeezed away and a softmax is taken.

    Args:
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.dense = Dense(1, activation = 'linear', use_bias = False)
        self.dropout = Dropout(0.2)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Return the softmax over the per-position scores of ``inputs``."""
        scores = self.dropout(self.dense(inputs))
        # drop the trailing score axis before normalising
        return tf.nn.softmax(tf.squeeze(scores, axis = 2))

    def get_config(self):
        """Return a copy of the base layer config (the layer has no own arguments)."""
        return super().get_config().copy()

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class ModellingEnd(Layer):
    """Extra bidirectional LSTM used to build the input of the end-position head.

    Args:
        output_dim: number of units of the LSTM wrapped in ``Bidirectional``.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)

        self.output_dim = output_dim
        recurrent = LSTM(output_dim, return_sequences = True, dropout = 0.2)
        self.end = Bidirectional(recurrent)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Run the LSTM over ``M`` and concatenate the result to ``G``.

        Args:
            inputs: pair ``(G, M)`` of tensors.

        Returns:
            ``G`` and the LSTM output of ``M`` concatenated along axis 2.
        """
        G, M = inputs
        M2 = self.end(M)
        return tf.concat([G, M2], axis = 2)

    def get_config(self):
        """Return the base layer config extended with ``output_dim``."""
        return {**super().get_config(), 'output_dim': self.output_dim}

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/layers
class End(Layer):
    """Output head producing the end-position distribution.

    A bias-free linear ``Dense(1)`` scores every position, ``Dropout(0.2)`` is
    applied to the scores, axis 2 is squeezed away and a softmax is taken.

    Args:
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.dense = Dense(1, activation = 'linear', use_bias = False)
        self.dropout = Dropout(0.2)

    def build(self, input_shape):
        # this wrapper creates no weights of its own here
        self.built = True

    def call(self, inputs):
        """Return the softmax over the per-position scores of ``inputs``."""
        scores = self.dropout(self.dense(inputs))
        # drop the trailing score axis before normalising
        return tf.nn.softmax(tf.squeeze(scores, axis = 2))

    def get_config(self):
        """Return a copy of the base layer config (the layer has no own arguments)."""
        return super().get_config().copy()

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output."""
        return cls(**config)

In [None]:
# utils/models
class BIDAF(Model):
  """Bi-Directional Attention Flow model predicting an answer span in a context.

  Pipeline: word embedding + character CNN embedding -> highway layer ->
  contextual BiLSTM -> context/question similarity matrix -> context-to-query
  and query-to-context attention -> modelling BiLSTMs -> start / end heads.

  Args:
      question_maxlen: question length in tokens (also given to the word
          embedding as its ``input_len``).
      context_maxlen: context length in tokens (stored only).
      word_vocab_len: word vocabulary size.
      embedding_size: word vector size, also used as the number of units of
          every LSTM.
      embedding_matrix: initial weights of the word embedding.
      char_vocab_len: character vocabulary size.
      word_maxlen: number of characters per word.
      n_filters: number of filters of the character CNN.
      filter_size: width of the character CNN filters.
      char_embedding_size: size of each character vector.
      **kwargs: forwarded to ``Model`` (``name`` is already set to 'BIDAF').
  """

  def __init__(self, 
               question_maxlen, 
               context_maxlen, 
               word_vocab_len, 
               embedding_size, 
               embedding_matrix, 
               char_vocab_len = None,
               word_maxlen = None, 
               n_filters = None, 
               filter_size = None, 
               char_embedding_size = None, 
               **kwargs):
    
    
    super(BIDAF, self).__init__(name = 'BIDAF', **kwargs)

    self.question_maxlen = question_maxlen
    self.context_maxlen = context_maxlen
    # misspelled name kept as an alias so that existing code reading it keeps working
    self.contect_maxlen = context_maxlen
    self.word_vocab_len = word_vocab_len
    self.embedding_size = embedding_size
    self.embedding_matrix = embedding_matrix
    self.char_vocab_len = char_vocab_len
    self.char_embedding_size = char_embedding_size
    self.word_max_len = word_maxlen
    self.n_filters = n_filters
    self.filter_size = filter_size

    # trainable weight vector of the similarity function (applied to [h; u; h * u])
    self.similarity_weights = Dense(1, use_bias = False)

    # layers (each one is shared between the question and the context)
    self.word_embedding = WordEmbedding(self.word_vocab_len, self.embedding_size, self.question_maxlen, self.embedding_matrix)
    self.char_embedding = CharEmbedding(self.char_vocab_len, self.char_embedding_size, self.word_max_len)
    self.cnn = CharCNN(self.n_filters, self.filter_size)
    self.highway = HighwayNetwork(hidden_size = self.embedding_size + self.n_filters)
    self.contextual = ContextualEmbedding(self.embedding_size)
    self.modelling = Modelling(self.embedding_size)
    self.modelling_end = ModellingEnd(self.embedding_size)
    self.output_start = Start()
    # NOTE(review): 'ouput_end' is misspelled but deliberately kept - renaming a
    # sub-layer attribute may invalidate previously saved weights; confirm before renaming
    self.ouput_end = End()

  def call(self, inputs, training = True):
    """Compute the start and end distributions over the context positions.

    Args:
        inputs: 4-tuple ``(qw, cw, qc, cc)``: question word ids, context word
            ids, question character ids, context character ids.
        training: NOTE(review): defaults to True and is not forwarded
            explicitly to the sub-layers; pass ``training = False`` explicitly
            for inference - confirm whether the default is intended.

    Returns:
        Tuple ``(start, end)``, each of shape (bs, ctx_len).
    """
    qw, cw, qc, cc = inputs  # (bs, q_len), (bs, ctx_len), (bs, q_len, w_len), (bs, ctx_len, w_len)

    # embedding always non-trainable
    qw = self.word_embedding(qw) # (bs, q_len, emb)
    cw = self.word_embedding(cw) # (bs, ctx_len, emb)

    qc = self.char_embedding(qc) # (bs, q_len, w_len, char_emb)
    cc = self.char_embedding(cc) # (bs, ctx_len, w_len, char_emb)

    qc = self.cnn(qc) # (bs, q_len, n_filters)
    cc = self.cnn(cc) # (bs, ctx_len, n_filters)

    H = tf.concat([cw, cc], axis = 2) # (bs, ctx_len, emb + n_filters)
    U = tf.concat([qw, qc], axis = 2) # (bs, q_len, emb + n_filters)

    # highway
    H = self.highway(H) # (bs, ctx_len, emb + n_filters)
    U = self.highway(U) # (bs, q_len, emb + n_filters)

    # contextual embedding: forward and backward LSTM outputs are concatenated,
    # so the feature size becomes d = 2 * emb
    H = self.contextual(H) # (bs, ctx_len, d)
    U = self.contextual(U) # (bs, q_len, d)

    # similarity matrix: every context position is paired with every question position
    expand_h = tf.concat([[1, 1], [tf.shape(U)[1]], [1]], axis = 0) # [1, 1, q_len, 1]
    expand_u = tf.concat([[1], [tf.shape(H)[1]], [1, 1]], axis = 0) # [1, ctx_len, 1, 1]

    h = tf.tile(tf.expand_dims(H, axis = 2), expand_h) # (bs, ctx_len, q_len, d)
    u = tf.tile(tf.expand_dims(U, axis = 1), expand_u) # (bs, ctx_len, q_len, d)
    h_u = h * u # (bs, ctx_len, q_len, d)

    alpha = tf.concat([h, u, h_u], axis = -1) # (bs, ctx_len, q_len, 3 * d)
    
    similarity_matrix = self.similarity_weights(alpha) # (bs, ctx_len, q_len, 1)
    similarity_matrix = tf.squeeze(similarity_matrix, 3) # (bs, ctx_len, q_len)

    # context to query attention: for each context word, a weighted sum of the question words
    attention_weights = tf.nn.softmax(similarity_matrix, axis = -1) # (bs, ctx_len, q_len)
    C2Q = batch_dot(attention_weights, U) # (bs, ctx_len, d)

    # query to context attention: one weighted sum of the context words, repeated for every position
    attention_weights = tf.nn.softmax(tf.math.reduce_max(similarity_matrix, axis = 2), axis = -1) # (bs, ctx_len)
    attention_weights = tf.expand_dims(attention_weights, axis = 1) # (bs, 1, ctx_len)
    Q2C = batch_dot(attention_weights, H) # (bs, 1, d)
    Q2C = tf.tile(Q2C, [1, tf.shape(H)[1], 1]) # (bs, ctx_len, d)

    # query aware representation
    G = tf.concat([H, C2Q, (H * C2Q), (H * Q2C)], axis = 2) # (bs, ctx_len, 4 * d)

    # modelling
    M = self.modelling(G) # (bs, ctx_len, d)

    # output: G concatenated with a further BiLSTM pass over M
    M2 = self.modelling_end([G,M]) # (bs, ctx_len, 5 * d)

    # start prediction
    start = self.output_start(tf.concat([G, M], axis = 2)) # (bs, ctx_len)

    # end prediction
    end = self.ouput_end(M2) # (bs, ctx_len)

    return start, end

In [None]:
# instantiate the model from the hyper-parameters and the pre-computed word embedding matrix
bidaf_model = BIDAF(
    question_maxlen = QUESTION_MAXLEN,
    context_maxlen = CONTEXT_MAXLEN,
    word_vocab_len = WORD_VOCAB_LEN,
    embedding_size = EMBEDDING_SIZE,
    embedding_matrix = embedding_matrix,
    char_vocab_len = CHAR_VOCAB_LEN,
    word_maxlen = WORD_MAXLEN,
    n_filters = N_FILTERS,
    filter_size = FILTER_SIZE,
    char_embedding_size = CHAR_EMBEDDING_SIZE,
)

In [None]:
# the same loss object is applied separately to the start and to the end predictions
# (presumably the labels are one-hot vectors over the context positions - TODO confirm)
loss_function = tf.keras.losses.CategoricalCrossentropy(reduction = 'auto')
# module-level optimizer, used by train_step to apply the gradients
optimizer = tf.keras.optimizers.Nadam(learning_rate = LR)

In [None]:
# https://udai.gitbook.io/practical-ml/nn/training-and-debugging-of-nn <- useful blog about machine learning / deep learning
# steps to be performed in each training step
@tf.function
def train_step(model, input_vector, output_vector, loss_fn):
    """Run one optimisation step on a single batch.

    Args:
        model: model called as ``model(input_vector, training = True)``; it
            must return the pair (start predictions, end predictions).
        input_vector: batch inputs, passed to the model unchanged.
        output_vector: pair (start labels, end labels).
        loss_fn: callable ``loss_fn(y_true, y_pred)`` applied separately to
            the start and to the end predictions.

    Returns:
        Tuple ``(loss_start, loss_end, output_predicted, gradients)``.

    Note:
        The gradients are applied with the module-level ``optimizer``.
    """
    with tf.GradientTape() as tape:
        # forward propagation
        output_predicted = model(input_vector, training = True)
        # loss: the supplied loss_fn is used (it was previously ignored in
        # favour of the global loss_function)
        loss_start = loss_fn(output_vector[0], output_predicted[0])
        loss_end = loss_fn(output_vector[1], output_predicted[1])
        loss_final = loss_start + loss_end
    # getting gradients of the summed loss
    gradients = tape.gradient(loss_final, model.trainable_variables)
    # applying gradients
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_start, loss_end, output_predicted, gradients

In [None]:
# https://udai.gitbook.io/practical-ml/nn/training-and-debugging-of-nn
# steps to be performed in each validation step
@tf.function
def val_step(model, input_vector, output_vector, loss_fn):
    """Evaluate the model on a single validation batch (no weight update).

    Args:
        model: model called as ``model(input_vector, training = False)``; it
            must return the pair (start predictions, end predictions).
        input_vector: batch inputs, passed to the model unchanged.
        output_vector: pair (start labels, end labels).
        loss_fn: callable ``loss_fn(y_true, y_pred)`` applied separately to
            the start and to the end predictions.

    Returns:
        Tuple ``(loss_start, loss_end, output_predicted)``.
    """
    # getting output of validation data
    output_predicted = model(input_vector, training = False)
    # loss calculation: the supplied loss_fn is used (it was previously
    # ignored in favour of the global loss_function)
    loss_start = loss_fn(output_vector[0], output_predicted[0])
    loss_end = loss_fn(output_vector[1], output_predicted[1])
    return loss_start, loss_end, output_predicted

In [None]:
def f1_score(y_true, y_pred):    # taken from old keras source code
    """Return the F1 score of ``y_pred`` against ``y_true`` over the whole batch.

    Values are clipped to [0, 1] and rounded before being counted, so a
    prediction only counts as positive once it rounds to 1.
    """
    def _count(values):
        # number of entries that round to 1 after clipping to [0, 1]
        return K.sum(K.round(K.clip(values, 0, 1)))

    true_positives = _count(y_true * y_pred)
    possible_positives = _count(y_true)
    predicted_positives = _count(y_pred)

    # epsilon keeps the divisions defined when a count is zero
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())

    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [None]:
# Metric objects accumulated over the batches of one epoch; the training loop
# resets them at the start of every epoch. One object exists per
# (train / val) x (start / end) combination.

# running mean of the per-batch losses
train_start_loss = tf.keras.metrics.Mean(name = 'train_start_loss')
train_end_loss = tf.keras.metrics.Mean(name = 'train_end_loss')
val_start_loss = tf.keras.metrics.Mean(name = 'val_start_loss')
val_end_loss = tf.keras.metrics.Mean(name = 'val_end_loss')
# running mean of the per-batch F1 scores computed by f1_score
train_start_f1 = tf.keras.metrics.Mean(name = 'train_start_f1')
train_end_f1 = tf.keras.metrics.Mean(name = 'train_end_f1')
val_start_f1 = tf.keras.metrics.Mean(name = 'val_start_f1')
val_end_f1 = tf.keras.metrics.Mean(name = 'val_end_f1')
# accuracy of the predicted position against the label
train_start_acc = tf.keras.metrics.CategoricalAccuracy(name = 'train_start_acc')
train_end_acc = tf.keras.metrics.CategoricalAccuracy(name = 'train_end_acc')
val_start_acc = tf.keras.metrics.CategoricalAccuracy(name = 'val_start_acc')
val_end_acc = tf.keras.metrics.CategoricalAccuracy(name = 'val_end_acc')

In [None]:
# best summed validation loss seen so far, used to decide when to checkpoint;
# starting from infinity guarantees that the first epoch is always saved
best_loss = float('inf')

In [None]:
# Training driver: for every epoch, train on all training batches, evaluate on
# all validation batches, print a summary line and checkpoint the weights
# whenever the summed validation loss improves.
for epoch in range(EPOCHS):

    # start every epoch with clean running losses / metrics
    for tracker in (
        train_start_loss, train_end_loss, val_start_loss, val_end_loss,
        train_start_f1, train_end_f1, val_start_f1, val_end_f1,
        train_start_acc, train_end_acc, val_start_acc, val_end_acc,
    ):
        tracker.reset_states()

    # ---- training pass, batch by batch ----
    for batch_inputs, batch_labels in tqdm(iterable = train_dataset, total = len(train_dataset)):
        batch_start_loss, batch_end_loss, batch_pred, _grads = train_step(bidaf_model, batch_inputs, batch_labels, loss_function)
        true_start, true_end = batch_labels[0], batch_labels[1]
        pred_start, pred_end = batch_pred[0], batch_pred[1]

        # accumulate the batch losses
        train_start_loss(batch_start_loss)
        train_end_loss(batch_end_loss)

        # accumulate the batch F1 scores and accuracies
        train_start_f1(f1_score(true_start, pred_start))
        train_end_f1(f1_score(true_end, pred_end))
        train_start_acc(true_start, pred_start)
        train_end_acc(true_end, pred_end)

    # ---- validation pass ----
    for val_inputs, val_labels in valid_dataset:
        batch_start_loss, batch_end_loss, batch_pred = val_step(bidaf_model, val_inputs, val_labels, loss_function)
        true_start, true_end = val_labels[0], val_labels[1]
        pred_start, pred_end = batch_pred[0], batch_pred[1]

        val_start_loss(batch_start_loss)
        val_end_loss(batch_end_loss)

        val_start_f1(f1_score(true_start, pred_start))
        val_end_f1(f1_score(true_end, pred_end))
        val_start_acc(true_start, pred_start)
        val_end_acc(true_end, pred_end)

    # ---- epoch summary ----
    template = '''Epoch {}, Train Start Loss: {:0.6f}, Train Start Acc : {:0.5f}, Start F1 Score: {:0.5f}, Train End Loss: {:0.6f}, Train End Acc : {:0.5f}, End F1 Score: {:0.5f},
    Val Start Loss: {:0.6f}, Val Start Acc : {:0.5f}, Val Start F1 Score: {:0.5f}, Val End Loss: {:0.6f}, Val End Acc : {:0.5f}, Val End F1 Score: {:0.5f}'''

    # same order as the placeholders of the template
    report_order = (
        train_start_loss, train_start_acc, train_start_f1,
        train_end_loss, train_end_acc, train_end_f1,
        val_start_loss, val_start_acc, val_start_f1,
        val_end_loss, val_end_acc, val_end_f1,
    )
    print(template.format(epoch + 1, *[tracker.result() for tracker in report_order]))

    # ---- checkpoint on improvement of the summed validation loss ----
    val_total_loss = val_start_loss.result() + val_end_loss.result()
    if val_total_loss < best_loss:
        print('Saving weights...')
        bidaf_model.save_weights('drive/MyDrive/NLP/BIDAF/utils/models/weights/bidaf_weights')
        print('\n Done !')
        best_loss = val_total_loss

100%|██████████| 6961/6961 [44:48<00:00,  2.59it/s]


Epoch 1, Train Start Loss: 3.713853, Train Start Acc : 0.21268, Start F1 Score: 0.09921, Train End Loss: 3.516613, Train End Acc : 0.23350, End F1 Score: 0.11051,
    Val Start Loss: 2.201488, Val Start Acc : 0.44312, Val Start F1 Score: 0.28122, Val End Loss: 1.988928, Val End Acc : 0.47941, Val End F1 Score: 0.32824
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [44:16<00:00,  2.62it/s]


Epoch 2, Train Start Loss: 2.672784, Train Start Acc : 0.42808, Start F1 Score: 0.37917, Train End Loss: 2.524039, Train End Acc : 0.45798, End F1 Score: 0.42111,
    Val Start Loss: 1.754525, Val Start Acc : 0.53219, Val Start F1 Score: 0.45609, Val End Loss: 1.582747, Val End Acc : 0.57153, Val End F1 Score: 0.50670
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [44:25<00:00,  2.61it/s]


Epoch 3, Train Start Loss: 2.402298, Train Start Acc : 0.48576, Start F1 Score: 0.46137, Train End Loss: 2.250612, Train End Acc : 0.51869, End F1 Score: 0.50846,
    Val Start Loss: 1.659768, Val Start Acc : 0.55321, Val Start F1 Score: 0.51884, Val End Loss: 1.506444, Val End Acc : 0.58600, Val End F1 Score: 0.55114
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [44:18<00:00,  2.62it/s]


Epoch 4, Train Start Loss: 2.203818, Train Start Acc : 0.52497, Start F1 Score: 0.51792, Train End Loss: 2.074208, Train End Acc : 0.55607, End F1 Score: 0.56182,
    Val Start Loss: 1.655275, Val Start Acc : 0.55401, Val Start F1 Score: 0.53658, Val End Loss: 1.502723, Val End Acc : 0.59065, Val End F1 Score: 0.57162
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [44:25<00:00,  2.61it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 5, Train Start Loss: 2.055331, Train Start Acc : 0.55568, Start F1 Score: 0.55691, Train End Loss: 1.939746, Train End Acc : 0.58337, End F1 Score: 0.60004,
    Val Start Loss: 1.702703, Val Start Acc : 0.55619, Val Start F1 Score: 0.54585, Val End Loss: 1.561018, Val End Acc : 0.59255, Val End F1 Score: 0.58075


100%|██████████| 6961/6961 [44:28<00:00,  2.61it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 6, Train Start Loss: 1.918788, Train Start Acc : 0.58545, Start F1 Score: 0.59501, Train End Loss: 1.813458, Train End Acc : 0.60949, End F1 Score: 0.63124,
    Val Start Loss: 1.731502, Val Start Acc : 0.55321, Val Start F1 Score: 0.55223, Val End Loss: 1.629714, Val End Acc : 0.58950, Val End F1 Score: 0.59012


100%|██████████| 6961/6961 [44:25<00:00,  2.61it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 7, Train Start Loss: 1.824023, Train Start Acc : 0.60426, Start F1 Score: 0.61978, Train End Loss: 1.720178, Train End Acc : 0.62920, End F1 Score: 0.65666,
    Val Start Loss: 1.750128, Val Start Acc : 0.54965, Val Start F1 Score: 0.54725, Val End Loss: 1.641849, Val End Acc : 0.59203, Val End F1 Score: 0.59088


100%|██████████| 6961/6961 [44:30<00:00,  2.61it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 8, Train Start Loss: 1.731592, Train Start Acc : 0.62444, Start F1 Score: 0.64413, Train End Loss: 1.646126, Train End Acc : 0.64408, End F1 Score: 0.67681,
    Val Start Loss: 1.887377, Val Start Acc : 0.54574, Val Start F1 Score: 0.55190, Val End Loss: 1.752178, Val End Acc : 0.58428, Val End F1 Score: 0.59076


100%|██████████| 6961/6961 [44:26<00:00,  2.61it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 9, Train Start Loss: 1.646621, Train Start Acc : 0.64293, Start F1 Score: 0.66637, Train End Loss: 1.578775, Train End Acc : 0.65967, End F1 Score: 0.69311,
    Val Start Loss: 1.933540, Val Start Acc : 0.53948, Val Start F1 Score: 0.54766, Val End Loss: 1.834540, Val End Acc : 0.57905, Val End F1 Score: 0.58306


100%|██████████| 6961/6961 [44:29<00:00,  2.61it/s]


Epoch 10, Train Start Loss: 1.597884, Train Start Acc : 0.65529, Start F1 Score: 0.68230, Train End Loss: 1.528513, Train End Acc : 0.66967, End F1 Score: 0.70666,
    Val Start Loss: 1.975476, Val Start Acc : 0.54396, Val Start F1 Score: 0.54932, Val End Loss: 1.815104, Val End Acc : 0.58163, Val End F1 Score: 0.58247


In [None]:
def _print_tokens(token_ids):
    """Print the words of ``token_ids`` on one line, stopping at the first padding id (0)."""
    for token_id in token_ids:
        if token_id == 0:
            break
        print(tokenizer.index_word[token_id], end = ' ')


def print_predictions(batch):
    """Print the question, the context and the predicted answer of one random sample.

    Args:
        batch: index of the batch to read from ``valid_dataset``; one sample
            of that batch is drawn at random.
    """
    sequences, _labels = valid_dataset[batch]

    # draw inside the actual batch: a batch may hold fewer than BATCH_SIZE samples
    idx = np.random.randint(len(sequences[0]))

    qw = sequences[0][idx]
    cw = sequences[1][idx]
    qc = sequences[2][idx]
    cc = sequences[3][idx]

    print('Question:')
    _print_tokens(qw)

    print('\nContext:')
    _print_tokens(cw)

    print('\nPredicted Answer:')
    # the model expects a leading batch axis
    _qw = qw.reshape(1, qw.shape[0])
    _cw = cw.reshape(1, cw.shape[0])
    _qc = np.expand_dims(qc, axis = 0)
    _cc = np.expand_dims(cc, axis = 0)
    start_probs, end_probs = bidaf_model.predict((_qw, _cw, _qc, _cc))
    start = int(start_probs.argmax())
    end = int(end_probs.argmax())

    # the two heads are independent, so the end may come before the start:
    # swap them (the previous two-statement "swap" overwrote start and lost it)
    if start > end:
        start, end = end, start

    # inclusive span; printing stops early if the span runs into padding
    _print_tokens(cw[start:end + 1])
    print('\n')

In [None]:
# indices of the validation batches to inspect; print_predictions draws one
# random sample from each of them
data_points = [8,15,52,152,332]
for i in data_points:
  print_predictions(i)

Question:
how many consecutive years was american idol the top rated show ? 
Context:
seasonal rankings ( based on average total viewers per episode ) of american idol . it holds the distinction of having the longest winning streak in the nielsen annual television ratings ; it became the highest-rated of all television programs in the united states overall for an unprecedented seven consecutive years , or eight consecutive ( and total ) years when either its performance or result show was ranked number one overall . 
Predicted Answer:
seven consecutive 

Question:
who owned the rights to oswald ? 
Context:
universal owned the rights to the `` oswald the lucky rabbit '' character , although walt disney and ub iwerks had created oswald , and their films had enjoyed a successful theatrical run . after charles mintz had unsuccessfully demanded that disney accept a lower fee for producing the property , mintz produced the films with his own group of animators . instead , disney and iwerks c

In [None]:
# class BIDAF():

#   """
#   the BIDAF model
#   """

#   def __init__(self, model, path_tokenizer, path_char_tokenizer = None, char_level = True):

#     self.QUESTION_MAXLEN = 20
#     self.CONTEXT_MAXLEN = 300
#     self.WORD_MAXLEN = 15
#     self.char_level = char_level
#     #self.path_model = path_model
#     self.path_tokenizer = path_tokenizer
#     self.path_char_tokenizer = path_char_tokenizer
#     self.model = model
#     with open(self.path_tokenizer, 'rb') as handle:
#       self.tokenizer = pickle.load(handle)
#     if self.char_level:
#       with open(self.path_char_tokenizer, 'rb') as handle:
#         self.char_tokenizer = pickle.load(handle)
  
#   def _get_tokens(self):

#     self.question = self.tokenizer.texts_to_sequences([self.question])
#     self.context = self.tokenizer.texts_to_sequences([self.context])

#   def _get_padded_sequences(self):

#     self.question = tf.keras.preprocessing.sequence.pad_sequences(self.question, maxlen = self.QUESTION_MAXLEN, padding = 'post')
#     self.context = tf.keras.preprocessing.sequence.pad_sequences(self.context, maxlen = self.CONTEXT_MAXLEN, padding = 'post')

#   def predict(self, question, context):

#     self._q = question
#     self._c = context

#     self.question = question
#     self.context = context
#     self._get_tokens()
#     self._get_padded_sequences()

#     if self.char_level:
#       self.__get_tokens()
#       self.__get_padded_sequences()
#       start, end = self.model.predict([self.question, self.context, self.question_char, self.context_char])
#     else:
#       start, end = self.model.predict([self.question, self.context])

#     for i in range(start.argmax(), end.argmax() + 1):
#       print(self.tokenizer.index_word[self.context[0][i]], end = ' ')

#   def __get_tokens(self):

#     self._question_char = []
#     self._context_char = []

#     for question, context in zip(self._q, self._c):
#       _q = self.char_tokenizer.texts_to_sequences(question)
#       _c = self.char_tokenizer.texts_to_sequences(context)
#       self._question_char.append(_q)
#       self._context_char.append(_c)

#   def __get_padded_sequences(self):

#     # pad question at the character level
#     v = tf.keras.preprocessing.sequence.pad_sequences(self._question_char, padding = 'post', maxlen = self.WORD_MAXLEN)
#     to_add = self.QUESTION_MAXLEN - v.shape[0]
#     add = np.zeros((to_add, WORD_MAXLEN))
#     arr = np.vstack([v,add])
#     self.question_char = arr

#     # pad context at the character level
#     v = tf.keras.preprocessing.sequence.pad_sequences(self._context_char, padding = 'post', maxlen = self.WORD_MAXLEN)
#     to_add = self.CONTEXT_MAXLEN - v.shape[0]
#     add = np.zeros((to_add, WORD_LEN))
#     arr = np.vstack([v,add])
#     self.context_char = arr

In [None]:
# bidaf = BIDAF(
#     model = model,
#     path_tokenizer = 'drive/MyDrive/NLP/data/tokenizer.pickle',
#     path_char_tokenizer = 'drive/MyDrive/NLP/data/char_tokenizer.pickle',
#     char_level = True
# )

In [None]:
# question = 'In what country is Normandy located?'
# context = "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ('Norman' comes from 'Norseman') raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."

In [None]:
# bidaf.predict(question, context)

**FURTHER WORK**:
* try GRU layers instead of LSTM layers (GRUs are usually faster)