<a href="https://colab.research.google.com/github/Erhtric/NeuralQuestionGenerationNLP/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is the main file: its purpouse is to collect all the code coming from the coding pipeline.

In [1]:
# See: https://www.tensorflow.org/text/guide/tokenizers
# and: https://www.tensorflow.org/text/guide/subwords_tokenizer
# !pip install -q "tensorflow-text"

In [2]:
import pandas as pd
import numpy as np
import json
import sklearn
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import re
import os
import typing
from typing import Any, Tuple
import gensim
import gensim.downloader as gloader
from gensim.models.keyedvectors import KeyedVectors
from itertools import chain
from tqdm import tqdm

import tensorflow as tf
# import tensorflow_text as text
from tensorflow import keras
from keras.layers import Layer, Embedding, LSTM, Dense, LSTMCell, Bidirectional, Input, AdditiveAttention

import nltk
from nltk import punkt, pos_tag, ne_chunk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

RANDOM_SEED = 13

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Commands to prepare the folder to accomodate data.

In [4]:
%cd /content/drive/MyDrive/NLP/Project/Testing folder/Eric
%pwd
%mkdir data

# disable chained assignments to avoid annoying warning
pd.options.mode.chained_assignment = None 

/content/drive/.shortcut-targets-by-id/1cVw6eUwM-dRL9BhqtXULyOqeXDrYkwmH/NLP/Project/Testing folder/Eric
mkdir: cannot create directory ‘data’: File exists


In [5]:
from base.base_model import BaseModel

# 1. Data handling and Pre-processing


Things to do:
1. Add to each sentence $x$ a start of sequence `<SOS>` tag and end of sequence `<EOS>` tag,
2. Clean the sentences by removing special chars,
3. Perform other preprocessing steps,
4. Create a **vocabulary** with a word-to-index and index-to-word mappings by using a **tokenizer**, 
5. Extract the sentences that contain an aswer and use them as input features,
6. Pad each context to maximum length.

In [6]:
JSON_PATH ='./data/training_set.json'
SAVE_PATH = './data/squadv2.pkl'

class SQuAD:
  def __init__(self):
    self.random_seed = None
    self.squad_df = None
    self.preproc_squad_df = None
    self.tokenizer = None
    self.og_n_samples = 18896
    self.BUFFER_SIZE = 0
    self.BATCH_SIZE = 0

  def call(self,
           num_examples, 
           BUFFER_SIZE, 
           BATCH_SIZE, 
           random_seed,
           num_words=None,
           tokenized=True,
           tensor_type=True):
    """The call() method loads the SQuAD dataset, preprocess it and optionally it returns 
    it tokenized. Moreover it also perform a 3-way split.

    Args:
        num_examples (int): _description_
        num_words (int): the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept. 
        BUFFER_SIZE (int): _description_
        BATCH_SIZE (int): _description_
        tokenized (boolean): specifies if the context and question data should be both tokenized

    Returns:
        pd.DataFrame or tf.Data.Dataset: training dataset
        pd.DataFrame or tf.Data.Dataset: validation dataset
        pd.DataFrame or tf.Data.Dataset: testing dataset
        tf.keras.preprocessing.text.Tokenizer: fitted tokenizer object for the SQuAD dataset
    """
    self.random_seed = random_seed
    self.BUFFER_SIZE = BUFFER_SIZE
    self.BATCH_SIZE = BATCH_SIZE

    # Load dataset from file
    self.load_dataset(num_examples)
    # Extract answer
    self.extract_answer()
    # Preprocess context and question
    self.preprocess()

    # Add POS, NER, etc.
    
    # Perform splitting
    X_train, y_train, X_val, y_val, X_test, y_test = self.split_train_val(self.preproc_squad_df)
    
    # Initialize Tokenizer
    self.tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', 
                                                           oov_token='<unk>',
                                                           num_words=num_words)

    if tokenized:
      X_train_tokenized, _ = self.__tokenize_context(X_train)
      y_train_tokenized, word_to_index_train = self.__tokenize_question(y_train)

      X_val_tokenized, _ = self.__tokenize_context(X_val)
      y_val_tokenized, word_to_index_val = self.__tokenize_question(y_val)

      X_test_tokenized, _ = self.__tokenize_context(X_test)
      y_test_tokenized, word_to_index_test = self.__tokenize_question(y_test)
      if tensor_type:
        # Returns tf.Data.Dataset objects (tokenized)
        train_dataset = self.to_tensor(X_train_tokenized, y_train_tokenized, BUFFER_SIZE, BATCH_SIZE)
        val_dataset = self.to_tensor(X_val_tokenized, y_val_tokenized,  BUFFER_SIZE, BATCH_SIZE)
        test_dataset = self.to_tensor(X_test_tokenized, y_test_tokenized, BUFFER_SIZE, BATCH_SIZE)
        return train_dataset, val_dataset, test_dataset, word_to_index_train, word_to_index_val, word_to_index_test
      else:
        # Returns pd.DataFrame objects (tokenized)
        return X_train_tokenized, y_train_tokenized, X_val_tokenized, y_val_tokenized, X_test_tokenized, y_test_tokenized
    else:
      return X_train, y_train, X_val, y_val, X_test, y_test

  def load_dataset(self, num_examples):
    """
    Extract the dataset from the json file. Already grouped by title.

    :param path: [Optional] specifies the local path where the training_set.json file is located

    :return
        - the extracted dataset in a dataframe format
    """
    if os.path.exists(SAVE_PATH):
      print('File already exists! Loading from .pkl...\n')
      self.squad_df = pd.read_pickle(SAVE_PATH)
      self.squad_df = self.squad_df[:num_examples]
    else:
      print('Loading from .json...\n')
      with open(JSON_PATH) as f:
          data = json.load(f)

      df_array = []
      for current_subject in data['data']:
          title = current_subject['title']

          for current_context in current_subject['paragraphs']:
              context = current_context['context']

              for current_question in current_context['qas']:
                  question = current_question['question']
                  id = current_question['id']

              for answer_text in current_question['answers']:
                    answer = answer_text['text']
                    answer_start = answer_text['answer_start']
                    record = { "id": id,
                                "title": title,
                                "context": context,
                                "question": question,
                                "answer_start": answer_start,
                                "answer": answer
                                }

              df_array.append(record)
      
      # Save file
      pd.to_pickle(pd.DataFrame(df_array), SAVE_PATH)
      self.squad_df = pd.DataFrame(df_array)[:num_examples]

  def preprocess(self):
    df = self.squad_df.copy()

    # Pre-processing context
    context = list(df.context)
    preproc_context = []

    for c in context:
      c = self.__preprocess_sentence(c, question=False)
      preproc_context.append(c)
    
    df.context = preproc_context

    # Pre-processing questions
    question = list(df.question)
    preproc_question = []

    for q in question:
      q = self.__preprocess_sentence(q, question=True)
      preproc_question.append(q)
    
    df.question = preproc_question

    # Remove features that are not useful
    df = df.drop(['id'], axis=1)
    self.preproc_squad_df = df

  def __preprocess_sentence(self, sen, question):
    # Creating a space between a word and the punctuation following it
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    sen = re.sub(r"([?.!,¿])", r" \1 ", sen)
    sen = re.sub(r'[" "]+', " ", sen)

    # Replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sen = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sen)

    sen = sen.strip()

    # Adding a start and an end token to the sentence so that the model know when to 
    # start and stop predicting.
    # if not question: sen = '<SOS> ' + sen + ' <EOS>'
    sen = '<SOS> ' + sen + ' <EOS>'
    return sen

  def __answer_start_end(self, df):
    """
    Creates a list of starting indexes and ending indexes for the answers.

    :param df: the target Dataframe

    :return: a dataframe containing the start and the end indexes foreach answer (ending index is excluded).

    """
    start_idx = df.answer_start
    end_idx = [start + len(list(answer)) for start, answer in zip(list(start_idx), list(df.answer))]
    return pd.DataFrame(list(zip(start_idx, end_idx)), columns=['start', 'end'])

  def split_train_val(self, df, train_size=0.8):
    """
    This method splits the dataframe in training and test sets, or eventually, in training, validation and test sets.

    Args
        :param df: the target Dataframe
        :param random_seed: random seed used in the splits
        :param train_size: represents the absolute number of train samples
        :param val: boolean for choosing between a 3-way split or 2-way one.

    Returns:
        - Data and labels for training, validation and test sets if val is True 
        - Data and labels for training and test sets if val is False 

    """
    # Maybe we have also to return the index for the starting answer
    X = df.drop(['answer_start', 'question', 'answer'], axis=1).copy()
    idx = self.__answer_start_end(df)
    X['start'] = idx['start']
    X['end'] = idx['end']
    y = df['question']

    # In the first step we will split the data in training and remaining dataset
    splitter = GroupShuffleSplit(train_size=train_size, n_splits=2, random_state=self.random_seed)
    split = splitter.split(X, groups=X['title'])
    train_idx, rem_idx = next(split)

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_rem = X.iloc[rem_idx]
    y_rem = y.iloc[rem_idx]


    # Val and test test accounts for 10% of the total data. Both 5%.
    splitter = GroupShuffleSplit(train_size=train_size, n_splits=2, random_state=self.random_seed)
    split = splitter.split(X_rem, groups=X_rem['title'])
    val_idx, test_idx = next(split)

    X_val = X_rem.iloc[val_idx]
    y_val = y_rem.iloc[val_idx]

    X_test = X_rem.iloc[test_idx]
    y_test = y_rem.iloc[test_idx]

    return X_train, y_train, X_val, y_val, X_test, y_test

  def __tokenize_context(self, X):
    context = X.context
    self.tokenizer.fit_on_texts(context)
    context_tf = self.tokenizer.texts_to_sequences(context)
    context_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(context_tf, padding='post')

    for i, _ in enumerate(context):
      X['context'].iloc[i] = context_tf_pad[i]

    return X, self.tokenizer.word_index

  def __tokenize_question(self, y):
    question = y
    self.tokenizer.fit_on_texts(question)
    question_tf = self.tokenizer.texts_to_sequences(question)
    question_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(question_tf, padding='post')

    for i, _ in enumerate(question):
      y.iloc[i] = question_tf_pad[i]
    
    # Add the padding
    self.tokenizer.word_index['<pad>'] = 0
    self.tokenizer.index_word[0] = '<pad>'

    return y, self.tokenizer.word_index

  def extract_answer(self):
    df = self.squad_df.copy()
    start_end = self.__answer_start_end(df)
    context = list(df.context)
    
    selected_sentences = []
    for i, par in enumerate(context):
      sentences = sent_tokenize(par)
      start = start_end.iloc[i].start
      end = start_end.iloc[i].end      
      right_sentence = ""
      context_characters = 0

      for j, sen in enumerate(sentences):
        sen += ' '
        context_characters += len(sen)
        # If the answer is completely in the current sentence
        if(start < context_characters and end <= context_characters):
          right_sentence = sen
          selected_sentences.append(right_sentence)
          break
        # the answer is in both the current and the next sentence
        if(start < context_characters and end > context_characters):
          right_sentence = sen + sentences[j+1]
          selected_sentences.append(right_sentence)
          break 

    self.squad_df.context = selected_sentences

  def to_tensor(self, X, y, BUFFER_SIZE, BATCH_SIZE):
    X = X.context.copy()
    y = y.copy()

    # Reference:- https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(list(X), tf.int32), 
         tf.cast(list(y), tf.int32)))
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

    return dataset

By calling the `SQuAD` constructor we create a dataset handling object which will be useful for future operations.

In [7]:
dataset_creator = SQuAD()

## 1.1 Preprocessed split

In [8]:
# Preprocessed dataset without tokenizing
# %%time
# X_train, y_train, X_val, y_val, X_test, y_test = dataset_creator.call(num_examples=dataset_creator.og_n_samples+1,
#                                                                       num_words=None,
#                                                                       BUFFER_SIZE=32000,
#                                                                       BATCH_SIZE=64,
#                                                                       random_seed=RANDOM_SEED,
#                                                                       tokenized=False)

# print(f'Set target: {X_train.columns.values}')

# print(f'Train set samples: {X_train.shape[0]}')
# print(f'Validation set samples: {X_val.shape[0]}')
# print(f'Test set samples: {X_test.shape[0]}')

## 1.2 Tokenized split

### 1.2.1 Tensor Ready

In [9]:
# Preprocessed dataset without tokenizing
%%time
train_data, val_data, test_data, word_to_idx_train, word_to_idx_val, word_to_idx_test = dataset_creator.call(num_examples=dataset_creator.og_n_samples+1,
                                                       num_words=None,
                                                       BUFFER_SIZE=32000,
                                                       BATCH_SIZE=64,
                                                       random_seed=RANDOM_SEED,
                                                       tokenized=True)

max_length_context = train_data.element_spec[0].shape[1]
max_length_question = train_data.element_spec[1].shape[1]

print(f'Sentences max lenght: {max_length_context}')
print(f'Questions max lenght: {max_length_question}')

File already exists! Loading from .pkl...

Sentences max lenght: 371
Questions max lenght: 40
CPU times: user 24.3 s, sys: 1.03 s, total: 25.3 s
Wall time: 24.1 s


In [10]:
# for element in train_data:
#   print(element)

### 1.2.2 Standard

In [11]:
# Preprocessed dataset without tokenizing
%%time
X_train, y_train, X_val, y_val, X_test, y_test = dataset_creator.call(num_examples=dataset_creator.og_n_samples+1,
                     BUFFER_SIZE=32000,
                     BATCH_SIZE=64,
                     random_seed=RANDOM_SEED,
                     tokenized=True,
                     tensor_type=False)

print(f'\nSet target: {X_train.columns.values}')

print(f'Train set samples: {X_train.shape[0]}')
print(f'Validation set samples: {X_val.shape[0]}')
print(f'Test set samples: {X_test.shape[0]}')

File already exists! Loading from .pkl...


Set target: ['title' 'context' 'start' 'end']
Train set samples: 15319
Validation set samples: 2845
Test set samples: 732
CPU times: user 23.8 s, sys: 932 ms, total: 24.8 s
Wall time: 24.2 s


## 1.3 Original SQuAD dataset

In [12]:
# Original dataset
squad_df = dataset_creator.squad_df
print(f'[Info] SQuAD target: {list(squad_df.columns.values)}')
print(f'[Info] Shape: {squad_df.shape}')

[Info] SQuAD target: ['id', 'title', 'context', 'question', 'answer_start', 'answer']
[Info] Shape: (18896, 6)


# 2. Embeddings

## 2.1 GloVe initialization

In [13]:
class GloVe:
  def __init__(self, embedding_dimension):
    self.embedding_dimension = embedding_dimension

    try:
      self.embedding_model = KeyedVectors.load(f'./data/glove_model_{self.embedding_dimension}')
    except FileNotFoundError:
      print('[Warning] Model not found in local folder, please wait...')
      self.embedding_model = self.load_glove()
      self.embedding_model.save(f'./data/glove_model_{self.embedding_dimension}')  
      print('Download finished. Model loaded!')

  def load_glove(self):
    """
    Loads a pre-trained GloVe embedding model via gensim library.

    We have a matrix that associate words to a vector of a user-defined dimension.
    """

    download_path = "glove-wiki-gigaword-{}".format(self.embedding_dimension)

    try:
      emb_model = gloader.load(download_path)
    except ValueError as e:
      print("Generic error when loading GloVe")
      print("Check embedding dimension")
      raise e

    emb_model = gloader.load(download_path)
    return emb_model

  def build_embedding_matrix(self, word_to_idx, vocab_size: int) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param vocab_size: size of the vocabulary

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the 
        dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((vocab_size, self.embedding_dimension), dtype=np.float32)
    oov_count = 0
    oov_words = []

    # For each word which is not present in the vocabulary we assign a random vector, otherwise we take the GloVe embedding
    for word, idx in tqdm(word_to_idx.items()):
      try:
        embedding_vector = self.embedding_model[word]
      except (KeyError, TypeError):
        oov_count += 1
        oov_words.append(word)
        # embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
        embedding_vector = np.random.uniform(low=-0.05, 
                                             high=0.05, 
                                             size=self.embedding_dimension)

      embedding_matrix[idx] = embedding_vector
    
    print(f'\n[Debug] {oov_count} OOV words found!')
    return embedding_matrix, oov_words

In [14]:
embedding_handler = GloVe(embedding_dimension=100)
embedding_model = embedding_handler.embedding_model
vocab_size = len(word_to_idx_val)
embedding_matrix, oov_words = embedding_handler.build_embedding_matrix(word_to_idx_val, len(word_to_idx_val))

100%|██████████| 40442/40442 [00:00<00:00, 110833.01it/s]


[Debug] 4143 OOV words found!





# 3. Model Definition

## 3.1 Encoder
We will use a bidirectional LSTM to encode the sentence,
$$
\begin{align*}
\overrightarrow{b_t} &= \overrightarrow{\text{LSTM}_2}(x_t, \overrightarrow{b_{t-1}})\\
\overleftarrow{b_t} &= \overleftarrow{\text{LSTM}_2}(x_t, \overleftarrow{b_{t+1}})\\
\end{align*}
$$
where $\overrightarrow{b_t}$ is the hidden state at time step $t$ for the forward pass LSTM and $\overleftarrow{b_t}$ for the backward pass.

In [15]:
enc_units = 256
example_context_batch, example_question_batch = next(iter(train_data))

In [59]:
class BahdanauAttention(Layer):
  # Reference:- https://www.tensorflow.org/text/tutorials/nmt_with_attention
  def __init__(self, units):
    super().__init__()
    self.W1 = Dense(units, use_bias=False)
    self.W2 = Dense(units, use_bias=False)

    self.attention = AdditiveAttention()

  def call(self, query, value, mask):
    """
    This layer takes 3 inputs:
      - the query; this will be generated by the decoder, later,
      - the value: the output of the encoder,
      - the mask: to exclude the padding, i.e., context_batch != 0.
    """
    #W1@ht
    w1_query = self.W1(query)
    #W2@hs
    w2_key = self.W2(value)

    query_mask = tf.ones(tf.shape(query)[:-1], dtype=bool)
    value_mask = mask

    context_vector, attention_weights = self.attention(
        inputs = [w1_query, value, w2_key],
        mask = [query_mask, value_mask],
        return_attention_scores = True,
    )
    
    return context_vector, attention_weights

In [31]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dimension, enc_units, batch_size, max_length_context, **kwargs):
    super(Encoder, self).__init__(**kwargs)
    self.batch_size = batch_size
    self.enc_units = enc_units
    self.max_length_context = max_length_context
    self.embedding_dimension = embedding_dimension

    # Layer definition
    self.embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dimension,
                               input_length=max_length_context,
                               embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                               trainable=False,
                               mask_zero=False,
                               name='Embedding layer')
        
    # The LSTM forward pass
    self.forward_lstm_layer = LSTM(self.enc_units//2,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform',
                                  name='F1 layer')
    
    # The LSTM backward pass
    self.backward_lstm_layer = LSTM(self.enc_units//2,
                                  return_sequences=True,
                                  return_state=True,
                                  go_backwards=True,
                                  recurrent_initializer='glorot_uniform',
                                  name='B1 layer')

    # The Bidirectional wrapper
    self.bidirectional_lstm = Bidirectional(self.forward_lstm_layer, 
                                            backward_layer=self.backward_lstm_layer, 
                                            # input_shape=(max_length, embedding_dimension),\
                                            name='Encoder__LSTM', 
                                            merge_mode='concat')
          
  def call(self, inputs, hidden):
    # shape = (batch_size, max_length_context)
    x = self.embedding(inputs)

    # shape = (batch_size, max_length_context, embedding_dimension)
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = self.bidirectional_lstm(x, initial_state=hidden)
    
    # shape = (batch_size, units)
    h_concat = tf.concat([forward_h, backward_h], axis=1)
    c_concat = tf.concat([forward_c, backward_c], axis=1)
    return encoder_outputs, [h_concat, c_concat]

  def initialize_hidden_state(self):
    # Reference :- https://keras.io/api/layers/recurrent_layers/bidirectional/ || Call arguments
    return [tf.zeros((self.batch_size, self.enc_units//2)), 
            tf.zeros((self.batch_size, self.enc_units//2)),
            tf.zeros((self.batch_size, self.enc_units//2)), 
            tf.zeros((self.batch_size, self.enc_units//2))]

In [32]:
## Test Encoder Stack
context_encoder = Encoder(
    vocab_size, 
    embedding_handler.embedding_dimension, 
    enc_units, 
    dataset_creator.BATCH_SIZE, 
    max_length_context)

sample_hidden = context_encoder.initialize_hidden_state()
encoder_outputs, [h_concat, c_concat] = context_encoder(example_context_batch, sample_hidden)

## 3.2 Decoder

In [40]:
# Container classes
# Reference :- https://www.tensorflow.org/text/tutorials/nmt_with_attention
class DecoderInput(typing.NamedTuple):
  new_tokens: Any
  enc_output: Any
  mask: Any

class DecoderOutput(typing.NamedTuple):
  logits: Any
  attention_weights: Any

dec_units = 256

In [60]:
class Decoder(tf.keras.Model):
  def __init__(self, 
               vocab_size, 
               embedding_dimension, 
               dec_units, 
               batch_size, 
               max_length_question,
               **kwargs):
    
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.dec_units = dec_units
    self.max_length_question = max_length_question
    self.embedding_dimension = embedding_dimension

    # Layer definition
    # Embedding for the questions
    self.embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dimension,
                               input_length=max_length_question,
                               embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                               trainable=False,  #?
                               mask_zero=False,
                               name='Embedding layer')
    
    # The LSTM layer
    self.lstm_layer = LSTM(self.dec_units,
                          return_sequences=True,
                          return_state=True,
                          recurrent_initializer='glorot_uniform',
                          name='Decoding layer')

    # The RNN output will be the query for the attention layer.
    self.attention = BahdanauAttention(self.dec_units)

    # Parameters to be learned
    self.Wt = Dense(self.dec_units, activation=tf.math.tanh, use_bias=False)
    self.Ws = Dense(self.dec_units, activation=tf.nn.softmax, use_bias=False)

    # For the word probabilities
    self.fc = tf.keras.layers.Dense(self.vocab_size)
    
  def call(self, 
            inputs: DecoderInput, 
            state=None) -> Tuple[DecoderOutput, tf.Tensor]:
      
    # Lookup the embeddings for the questions
    x = self.embedding(inputs.new_tokens)

    # Process one step with the RNN
    cell_output, _, cell_state = self.lstm_layer(x, initial_state=state)

    # Use the RNN output as the query for the attention over the encoder output.
    context_vector, attention_weights = self.attention(
        query=cell_output, 
        value=inputs.enc_output, 
        mask=inputs.mask)

    # Join the context_vector and rnn_output [ct; ht] shape: (batch t, value_units + query_units)
    cell_output_and_context_vector = tf.concat([cell_output, context_vector], axis=-1)

    # at = tanh(Wt@[ht, ct])
    attention_vector = self.Wt(cell_output_and_context_vector)

    # logits = softmax(Ws@at)
    logits = self.Ws(attention_vector)
    logits = self.fc(logits)

    return DecoderOutput(logits, attention_weights), cell_state

In [61]:
## Test Decoder Stack
decoder = Decoder(
    vocab_size, 
    embedding_handler.embedding_dimension, 
    dec_units, 
    dataset_creator.BATCH_SIZE, 
    max_length_question
)

In [62]:
# Convert the target sequence, and collect the "[START]" tokens
start_tag_index = word_to_idx_test['<sos>']
first_token = tf.constant([[4]] * dataset_creator.BATCH_SIZE)

In [74]:
decoder_result, decoder_state = decoder(
    inputs = DecoderInput(first_token, 
                          encoder_outputs,
                          mask=(example_context_batch != 0)),
    state = [h_concat, c_concat]
)

print(f'Logits shape: (batch_size, t, output_vocab_size) {decoder_result.logits.shape}')
print(f'State shape: (batch_size, dec_units) {decoder_state.shape}')

Logits shape: (batch_size, t, output_vocab_size) (64, 1, 40442)
State shape: (batch_size, dec_units) (64, 256)


In [81]:
sampled_token = tf.random.categorical(decoder_result.logits[:, 0, :], num_samples=1)
vocab = np.array(list(word_to_idx_val.keys()))

In [83]:
first_word = vocab[sampled_token.numpy()]
first_word[:5]