In [1]:
# Detect whether the notebook runs on Google Colab by probing for its package.
try:
  import google.colab  # noqa: F401 -- imported only to test availability
  IN_COLAB = True
except ImportError:
  # Only a missing package means "not on Colab". A bare `except:` would also
  # swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
  IN_COLAB = False

%load_ext autoreload
%autoreload 2

import os
import json
import sys
sys.path.append(".")
sys.path.append("..")

import spacy
# Download the spacy model first: python -m spacy download en_core_web_sm

import tensorflow as tf
import numpy as np
import pandas as pd
# disable chained assignments to avoid annoying warning
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import datetime
import tensorflow as tf
from tensorflow import keras
from tqdm.auto import tqdm

from typing import Any, Tuple, List, NamedTuple

# Dataloader & Configs
from src.data.data_generator import SQuAD, Dataset
from configs.config import *

# Embeddings
from src.utils.embeddings import GloVe

# Model
from src.models.layers import Encoder, Decoder, CustomMasking
from src.models.loss import MaskedLoss
from src.models.trainers.metrics import Perplexity, MaskedAccuracy
from src.models.callbacks import BatchLogs, CustomLearningRateScheduler, GetEpochNumber

In [2]:
# Load the six train/val/test splits with tokenization and tensor conversion
# switched off and POS computation switched on.
dataset_creator = SQuAD()
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = dataset_creator(
    dataset_config,
    path_config,
    tokenized=False,
    tensor_type=False,
    compute_pos=True,
)

File already exists! Loading from .pkl...

Dir path ./data/squadv1.1.pkl


In [16]:
# Load the six train/val/test splits again, this time tokenized, still without
# tensor conversion and without POS computation.
dataset_creator = SQuAD()
(
    X_train_tokenized, y_train_tokenized,
    X_val_tokenized, y_val_tokenized,
    X_test_tokenized, y_test_tokenized,
) = dataset_creator(
    dataset_config,
    path_config,
    tokenized=True,
    tensor_type=False,
    compute_pos=False,
)

File already exists! Loading from .pkl...

Dir path ./data/squadv1.1.pkl


In [3]:
# Build the tokenized dataset with POS computation enabled; with these flags
# the creator hands back the dataset object plus one vocabulary per side.
dataset_creator = SQuAD()
dataset, word_to_idx_context, word_to_idx_question = dataset_creator(
    dataset_config,
    path_config,
    tokenized=True,
    compute_pos=True,
)

# Maximum sequence lengths as reported by the dataset creator.
max_length_context = dataset_creator.max_length_context
max_length_question = dataset_creator.max_length_question

# Record the second dimension of the first two train elements in the model
# configuration (presumably the padded context/question lengths -- confirm).
model_config["max_length_context"] = dataset.train.element_spec[0].shape[1]
model_config["max_length_question"] = dataset.train.element_spec[1].shape[1]

# NOTE(review): "lenght" is a typo for "length"; kept verbatim here so the
# printed text still matches the output recorded in the notebook.
print(f'Sentences max lenght: {max_length_context}')
print(f'Questions max lenght: {max_length_question}')

File already exists! Loading from .pkl...

Dir path ./data/squadv1.1.pkl
Computing POS tags for the train set...


Computing POS tags for the context...: 100%|██████████| 100/100 [00:03<00:00, 31.40seq/s]
Computing POS tags for the question...: 100%|██████████| 100/100 [00:01<00:00, 70.39seq/s]


Computing POS tags for the val set...


Computing POS tags for the context...: 100%|██████████| 100/100 [00:02<00:00, 36.22seq/s]
Computing POS tags for the question...: 100%|██████████| 100/100 [00:01<00:00, 67.39seq/s]


Computing POS tags for the test set...


Computing POS tags for the context...: 100%|██████████| 100/100 [00:02<00:00, 38.61seq/s]
Computing POS tags for the question...: 100%|██████████| 100/100 [00:01<00:00, 75.89seq/s]


Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.
Sentences max lenght: 102
Questions max lenght: 27


In [None]:
from src.models.trainers.trainer import Trainer

# Utility function in order to build the compiled model
def build_model(
    model_config,
    embedding_matrix_context,
    embedding_matrix_question,
    compile_info,
):
  """
  Instantiate a ``Trainer`` and compile it.

  Both configurations are printed before the model is created.

  :param model_config: configuration passed positionally to ``Trainer``
  :param embedding_matrix_context: forwarded to ``Trainer`` under the same keyword
  :param embedding_matrix_question: forwarded to ``Trainer`` under the same keyword
  :param compile_info: dictionary unpacked as keyword arguments of ``compile()``

  :return
      the compiled ``Trainer`` instance
  """
  # Echo the two configurations, model first, compile settings second.
  for banner, params in (
      ("Model Configuration \nParameters: {}", model_config),
      ("Compile \nParameters: {}", compile_info),
  ):
    print(banner.format(params))

  trainer = Trainer(
      model_config,
      embedding_matrix_context=embedding_matrix_context,
      embedding_matrix_question=embedding_matrix_question,
  )
  trainer.compile(**compile_info)
  return trainer

def train_model(model,
                dataset: NamedTuple,
                training_info):
    """
    Training routine for the Keras model.

    :param model: compiled model exposing a Keras-style ``fit()``
    :param dataset: the split dataset; ``dataset.train`` is the training input
        and ``dataset.val`` the default validation data
    :param training_info: dictionary of keyword arguments forwarded to
        ``model.fit()``. It may override the ``validation_data`` and
        ``use_multiprocessing`` defaults set here.

    :return
        history: the object returned by ``model.fit()``
        model: the same model instance, after training
    """
    print("Start training \nParameters: {}".format(training_info))

    # Defaults first, caller settings last: a key present in `training_info`
    # now overrides the default instead of raising
    # "TypeError: got multiple values for keyword argument".
    # NOTE(review): newer Keras releases reportedly no longer accept
    # `use_multiprocessing` in fit() -- confirm against the installed version.
    fit_kwargs = {
        "validation_data": dataset.val,
        "use_multiprocessing": True,
        **training_info,
    }
    history = model.fit(dataset.train, **fit_kwargs)

    print("Training completed")
    return history, model

NameError: name 'decoder_logits' is not defined

In [None]:
# Those should go into the configuration file

# --- Callbacks ---------------------------------------------------------------
# All callbacks are constructed here; only those listed under
# training_info['callbacks'] below are actually passed to fit().
batch_loss = BatchLogs('batch_loss')
perplexity = BatchLogs('perplexity')
accuracy = BatchLogs('accuracy')
lr_scheduler = CustomLearningRateScheduler()
epoch_counter = GetEpochNumber()
# `log_dir` is not defined in the visible cells -- presumably it comes from the
# star import of configs.config; confirm.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)
# Stop after 3 epochs without a lower 'val_perplexity' and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_perplexity',
    patience=3,
    mode='min',
    restore_best_weights=True,
)

# --- Arguments forwarded to model.fit() --------------------------------------
training_info = {
    'verbose': 1,
    'epochs': 20,
    'batch_size': dataset_config['batch_size'],
    'callbacks': [
        batch_loss,
        perplexity,
        accuracy,
        # lr_scheduler,
        # tensorboard_callback,
        # epoch_counter,
        early_stopping,
    ],
}

# --- Model hyper-parameters --------------------------------------------------
# Rebinds `model_config`, replacing whatever object held that name before.
model_config = {
    'batch_size': dataset_config['batch_size'],
    'enc_units': 256,
    'dec_units': 256,
    'max_length_context': dataset_creator.max_length_context,
    'max_length_question': dataset_creator.max_length_question,
    'dropout_rate': 0.3,
    'regularizer': 1e-2,
}

# --- Arguments forwarded to model.compile() ----------------------------------
compile_info = {
    'loss': MaskedLoss(),
    'optimizer': keras.optimizers.Adam(learning_rate=1e-5),
}

In [None]:
# Build and compile the question-generation model, then train it.
# NOTE(review): the two embedding matrices are not defined in the visible
# cells -- confirm they are created before this cell on a fresh kernel.
qg_model = build_model(
    model_config=model_config,
    embedding_matrix_context=embedding_matrix_context,
    embedding_matrix_question=embedding_matrix_question,
    compile_info=compile_info,
)
history, qg_model = train_model(qg_model, dataset, training_info)