# Assignment 2 execution

In [None]:
%pip install pandas numpy matplotlib transformers==4.25.1  dataset tensorflow_addons

## Data loading

### Dataset download

In [None]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    """Progress bar whose `update_to` method is passed as the `urlretrieve` report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the position `b * bsize`.

        Args:
            b: first factor of the current position (defaults to 1).
            bsize: second factor of the current position (defaults to 1).
            tsize: when not None, it becomes the total of the bar.
        """
        if tsize is not None:
            self.total = tsize
        position = b * bsize
        # `update` takes an increment, so subtract what the bar already shows
        self.update(position - self.n)
        
def download_url(url, output_path):
    """Download `url` into `output_path`, showing a progress bar.

    Args:
        url: the address of the resource to download.
        output_path: the path of the file to write.
    """
    # The last segment of the url is used as label of the bar
    label = url.split('/')[-1]
    progress_bar = DownloadProgressBar(unit='B', unit_scale=True,
                                       miniters=1, desc=label)
    with progress_bar as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):
    """Download a CoQA split into `data_path/<suffix>.json` unless it is already there.

    Args:
        data_path: the folder where the file is stored (created if missing).
        url_path: the url of the split to download.
        suffix: the name of the split, used as file name (without extension).
    """
    # exist_ok avoids failing when the folder appears between a check and the creation
    os.makedirs(data_path, exist_ok=True)

    file_path = os.path.join(data_path, f'{suffix}.json')
    if os.path.exists(file_path):
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    # Download to a temporary name and rename only on success: an interrupted
    # download must not leave a truncated file that later runs would take for good
    partial_path = file_path + '.part'
    try:
        download_url(url=url_path, output_path=partial_path)
        os.replace(partial_path, file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download completed!")

In [None]:
data_folder = 'Dataset'

In [None]:
# Train & Validation data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path=data_folder, url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path=data_folder, url_path=test_url, suffix='test')

### Dataset loading

In [None]:
import numpy as np
import pandas as pd
import json
from os import path
from matplotlib import pyplot as plt

In [None]:
def load_coqa_dataset(filename:str) -> pd.DataFrame:
    """Load a CoQA JSON file from `data_folder` into a flat DataFrame.

    Every question of every story becomes one row; the rows whose answer
    text is 'unknown' are skipped.

    Args:
        filename: the name of the JSON file inside `data_folder`.

    Returns:
        A DataFrame with the categorical columns "source" and "p" (the story
        text) and the text columns "q" (question), "a" (answer) and "span".
    """
    # JSON is UTF-8 encoded: do not rely on the default encoding of the platform
    with open(path.join(data_folder, filename), encoding="utf-8") as file_obj:
        data_arr = json.load(file_obj)["data"]
    print(f'{len(data_arr)} stories / {len(data_arr[0]["questions"])} questions in the first row')
    
    # Prepare the Categorical DTypes

    storyDType = pd.CategoricalDtype(pd.unique([story["story"] for story in data_arr]))
    print(f"{storyDType.categories.size} distinct stories")

    sourceDType = pd.CategoricalDtype(pd.unique([story["source"] for story in data_arr]))
    print(f"{sourceDType.categories.size} distinct sources: {sourceDType.categories}")
    
    # Transform into a numpy matrix (denormalization and categorical factorization)
    # Mixing integers and text makes numpy store every cell as text, hence the
    # conversion of the codes back to integers when the DataFrame is built

    data_arr = np.array([
        [
            sourceDType.categories.get_loc(story["source"]), # Sources factorization
            storyDType.categories.get_loc(story["story"]), # Stories factorization
            story["questions"][question_index]["input_text"],
            story["answers"][question_index]["input_text"],
            story["answers"][question_index]["span_text"],
            #story["questions"][question_index]["turn_id"],
        ]
        for story in data_arr
        for question_index in range(len(story["questions"]))
        if story["answers"][question_index]["input_text"] != 'unknown'
    ])
    print(f'{data_arr.shape} question-answer pairs x columns')
    print(f'First row: {data_arr[0]}')
    
    # Transform into a DataFrame
    
    # https://marcobonzanini.com/2021/09/15/tips-for-saving-memory-with-pandas/
    # https://pandas.pydata.org/docs/user_guide/categorical.html
    return pd.DataFrame({
        "source": pd.Series(pd.Categorical.from_codes(data_arr[:,0].astype(np.int16), dtype=sourceDType)),
        "p": pd.Series(pd.Categorical.from_codes(data_arr[:,1].astype(np.int16), dtype=storyDType)),
        "q": data_arr[:,2],
        "a": data_arr[:,3],
        "span": data_arr[:,4],
    })

In [None]:
def append_shifted_question_to_coqa_dataset(df:pd.DataFrame, shift:int) -> "tuple[pd.DataFrame, int]":
    """Prepend to the history of each row the question/answer found `shift` rows above.

    The history is extended only where the row `shift` positions above has
    the same value in the "p" column; the other rows keep their history.
    `df` is modified in place and must already have a "history" column.

    Args:
        df: the DataFrame with the columns "p", "q", "a" and "history".
        shift: how many rows above to look; must be greater than zero.

    Returns:
        The same DataFrame and the number of rows whose history was extended.

    Raises:
        ValueError: if `shift` is not greater than zero.
    """
    if shift <= 0:
        raise ValueError("shift must be greater than zero")
    history_mask = df["p"] == df["p"].shift(shift)
    # "<question> <answer>. " goes before the existing history, so older turns come first
    history_series = df["q"].shift(shift).astype(str) + ' ' + df["a"].shift(shift) + '. ' + df["history"]
    df["history"] = np.where(history_mask, history_series, df["history"])
    return df, int(history_mask.sum())

In [None]:
def transform_coqa_dataset_with_history(df:pd.DataFrame, max_history_depth:int=3) -> pd.DataFrame:
    """Add a "history" column built from up to `max_history_depth` previous rows.

    Args:
        df: the DataFrame to extend; the "history" column is (re)initialized to "".
        max_history_depth: the maximum number of previous rows to look at.

    Returns:
        The DataFrame with the "history" column filled in.
    """
    df["history"] = ""
    depth = 1
    while depth <= max_history_depth:
        df, extended_rows = append_shifted_question_to_coqa_dataset(df, depth)
        print(depth, extended_rows)
        # No row was extended at this depth: nothing more to do
        if extended_rows == 0:
            break
        depth += 1
    return df

In [None]:
train_df = load_coqa_dataset("train.json")
train_df.count()

In [None]:
# Work on an explicit copy: the transformation writes a new column in place and
# head() returns a slice of train_df (pandas warns when writing into a slice)
df2 = transform_coqa_dataset_with_history(train_df.head(50).copy())
df2.to_csv("x.csv")
df2

In [None]:
pd.unique(train_df["p"]).size

In [None]:
pd.unique(train_df["span"]).size

In [None]:
pd.unique(train_df["source"]).size

In [None]:
train_df.head()

In [None]:
train_df.memory_usage(deep=True)

In [None]:
test_df = load_coqa_dataset("test.json")
test_df.count()

## Data Pre-Processing

### Check unanswerable questions in the Train Dataset

In [None]:
idx = (train_df.a == 'unknown')
unanswerable = train_df[idx]
unanswerable.q.count()

All unanswerable questions in the Train Dataset have already been removed: they are filtered out while the dataset is loaded.

## Exploratory Data Analysis

In [None]:
train_df["p"][42]

In [None]:
train_df["q"][42]

In [None]:
train_df["a"][42]

In [None]:
train_df["span"][42]

In [None]:
train_df["source"][42]

### Distribution statistics

Sources:

In [None]:
train_df["source"].hist()

Occurrences of 25 most popular stories:

In [None]:
story_counts = train_df["p"].cat.codes.value_counts(sort=True)
story_counts[:25].plot(kind="bar", figsize=(15,5))

Occurrences of 25 least popular stories:

In [None]:
# [-25:] keeps the last 25 entries ([-25:-1] dropped the very last one)
story_counts[-25:].plot(kind="bar", figsize=(15,5))

Histogram of story popularities:

In [None]:
story_counts.hist(log=True,bins=75,figsize=(15,5))

### Removing rows with outlier story lengths to save memory

In [None]:
train_df.count()

LOGARITHMIC histogram of story length:

In [None]:
story_lengths = train_df["p"].str.len()
story_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
p_length_limit = story_lengths.quantile(0.999)
p_length_limit

In [None]:
p_length_mask = story_lengths < p_length_limit
p_length_mask.value_counts()

In [None]:
train_df = train_df[p_length_mask]
train_df.count()

### Removing rows with outlier question/answer/span lengths to save memory

LOGARITHMIC histogram of question length:

In [None]:
question_lengths = train_df["q"].str.len()
question_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
q_length_limit = question_lengths.quantile(0.999)
q_length_limit

LOGARITHMIC histogram of answer length:

In [None]:
answer_lengths = train_df["a"].str.len()
answer_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
a_length_limit = answer_lengths.quantile(0.999)
a_length_limit

In [None]:
span_lengths = train_df["span"].str.len()
span_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
span_length_limit = span_lengths.quantile(0.999)
span_length_limit

In [None]:
bad_length_mask = (question_lengths > q_length_limit) | (answer_lengths > a_length_limit) | (span_lengths > span_length_limit)
bad_length_mask.value_counts()

In [None]:
excluded_stories = train_df["p"][bad_length_mask].unique()
len(excluded_stories)

In [None]:
excluded_mask = ~train_df["p"].isin(excluded_stories)
excluded_mask.value_counts()

In [None]:
train_df = train_df[excluded_mask]
train_df.count()

## Train-Validation-Test split

In [None]:
train_df = train_df.reset_index()

In [None]:
total_rows = len(train_df)
total_rows

In [None]:
ideal_split_index = int(total_rows * 0.8)
ideal_split_index

In [None]:
train_df[ ideal_split_index-3 : ideal_split_index+1 ]

In [None]:
# Row positions 0 .. total_rows-1 (np.linspace(0, N, N) yields non-integer steps)
before_split_mask = pd.Series(np.arange(total_rows)) < ideal_split_index
before_split_mask.value_counts()

In [None]:
split_story = train_df["p"][ideal_split_index - 1]
split_story_mask = train_df["p"] == split_story
split_story_mask.value_counts()

In [None]:
train_mask = before_split_mask | split_story_mask
train_mask.value_counts()

In [None]:
val_df = train_df[~train_mask]
train_df = train_df[train_mask]
len(val_df)

In [None]:
train_df.memory_usage()

In [None]:
val_df.memory_usage()

In [None]:
test_df.memory_usage()

## Utilities

In [None]:
import gc
import torch
def free_some_memory():
    """Run the garbage collector and, when CUDA is available, release the cached GPU memory."""
    # Collect first, so the memory of the objects just freed can be released too
    gc.collect()
    # The CUDA calls need an available device: skip them on CPU-only machines
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_accumulated_memory_stats()

In [None]:
#del excluded_stories, split_story, before_split_mask, split_story_mask, train_mask
#free_some_memory()

In [None]:
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from typing import List, Dict, Callable
import random
import tensorflow_addons as tfa
from tqdm import tqdm
from timeit import default_timer as timer
from transformers import TFAutoModel, AutoTokenizer

In [None]:
# Enable memory growth on every GPU seen by TensorFlow
# NOTE(review): presumably done to leave GPU memory to PyTorch - confirm
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # The setting could not be applied: report it and go on
        print(e)

### Reproducibility

In [None]:
def fix_seed(seed: int) -> None:
    """Seed the random generators used in the notebook and make cuDNN deterministic.

    Args:
        seed: the seed given to `random`, numpy and torch (CPU and CUDA).
    """
    # Python and numpy generators
    random.seed(seed)
    np.random.seed(seed)

    # Torch generators
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed):
        seed_torch(seed)

    # cuDNN: no auto-tuning, deterministic algorithms only
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

#fix_seed(seed=7)

### PyTorch Dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CreateDataset(Dataset):
    """Dataset serving, one sample at a time, the columns of a tokenizer encoding."""

    def __init__(self, encodings):
        # Mapping column name -> per-sample values; it must also expose `input_ids`
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # One tensor per column, taken from the idx-th sample
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

### Train Functionalities


In [None]:
# Train one epoch
def train(model: torch.nn.Module,
          train_loader:torch.utils.data.DataLoader,
          device: torch.device,          
          optimizer: torch.optim.Optimizer,
          epoch: int) -> float:
    """Trains a neural network for one epoch.

    Args:
        model: the model to train; called with input ids, attention mask and
            labels, it must return the loss as first element.
        train_loader: the data loader containing the training data.
        device: the device to use to train the model.        
        optimizer: the optimizer to use to train the model.
        epoch: the number of the current epoch (shown in the progress bar).

    Returns:
        the mean of the loss values of the batches of the epoch.
    """    
    loss_score = []
    loop = tqdm(train_loader)
    # Label the bar before iterating, so it is shown from the first batch
    loop.set_description(f'Epoch {epoch}')
    #set training mode
    model.train()
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Only the loss is needed; the second output is ignored
        loss, _ = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                        )[:2]
        loss_score.append(loss.item())
        loss.backward()
        optimizer.step()

    loss_train = np.mean(loss_score)
    return loss_train

# Validate one epoch
def validate(model: torch.nn.Module,
             data_loader: torch.utils.data.DataLoader,
             device: torch.device) -> float:
    """Computes the mean loss of the model on a data loader, without training it.

    Args:
        model: the model to evaluate.
        data_loader: the data loader containing the validation or test data.
        device: the device to use to evaluate the model.

    Returns:
        the mean of the loss values of the batches.
    """
    # Evaluation mode
    net = model.eval()
    batch_losses = []
    # No gradients are needed to compute the loss
    with torch.no_grad():
        for batch in data_loader:
            tensors = {name: batch[name].to(device)
                       for name in ('input_ids', 'attention_mask', 'labels')}
            result = net(tensors['input_ids'],
                         attention_mask=tensors['attention_mask'],
                         labels=tensors['labels'])
            loss, _ = result[:2]
            batch_losses.append(loss.item())
    return np.mean(batch_losses)

In [None]:
def training_loop(num_epochs: int,
                  optimizer: torch.optim.Optimizer, 
                  model: torch.nn.Module, 
                  loader_train: torch.utils.data.DataLoader, 
                  loader_val: torch.utils.data.DataLoader, 
                  verbose: bool=True) -> Dict:
    """Executes the training loop.
    
        Args:
            num_epochs: the number of epochs.
            optimizer: the optimizer to use.
            model: the model to train.
            loader_train: the data loader containing the training data.
            loader_val: the data loader containing the validation data.
            verbose: if true print the value of loss.

        Returns:  
            A dictionary with the statistics computed during the train:
            'loss_values': the values for the train loss for each epoch.
            'val_loss_values': the values for the validation loss for each epoch.
            'time': the time of execution in seconds for the entire loop.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    loop_start = timer()
    losses_values = []
    val_losses_values = []
    for epoch in range(1, num_epochs + 1):
        time_start = timer()
        loss_train = train(model, loader_train, device, 
                                           optimizer, epoch)
        loss_val = validate(model, loader_val, device)
        time_end = timer()
        losses_values.append(loss_train)
        val_losses_values.append(loss_val)
        if verbose:
            # Read the learning rate from the optimizer instead of relying on
            # a global variable defined somewhere else in the notebook
            lr = optimizer.param_groups[0]['lr']
            print(f'Epoch: {epoch} '
                  f' Lr: {lr:.8f} '
                  f' Loss: Train = [{loss_train:.4f}] - Val = [{loss_val:.4f}] '
                  f' Time one epoch (s): {(time_end - time_start):.4f} ')
    loop_end = timer()
    time_loop = loop_end - loop_start
    if verbose:
        print(f'Time for {num_epochs} epochs (s): {(time_loop):.3f}') 
        
    return {'loss_values': losses_values,
            'val_loss_values': val_losses_values,
            'time': time_loop}

### Evaluation

Instead of importing the whole allennlp library, we decided to copy from its source code only the functions necessary for our task.

In [None]:
#utility functions taken from the allennlp library for computing the F1-score
import collections
import re
import string
from typing import Callable, Sequence, TypeVar, Tuple

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized `s` ([] when `s` is falsy)."""
    return normalize_answer(s).split() if s else []

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # 1. lower case
    lowered = s.lower()
    # 2. drop the punctuation characters
    punctuation = set(string.punctuation)
    unpunctuated = "".join(filter(lambda ch: ch not in punctuation, lowered))
    # 3. replace the articles with a blank
    without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
    # 4. collapse the runs of whitespace
    return " ".join(without_articles.split())

def compute_f1(a_pred: str, a_gold: str) -> float:
    """Token-level F1 between a predicted and a gold answer, after normalization.

    Args:
        a_pred: the predicted answer.
        a_gold: the gold answer.

    Returns:
        The F1-score of the tokens in common; when either answer has no
        tokens, 1.0 if both have none and 0.0 otherwise.
    """
    predicted = get_tokens(a_pred)
    expected = get_tokens(a_gold)
    if not predicted or not expected:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return float(predicted == expected)
    # Multiset intersection: each token counts as many times as it appears in both
    shared = collections.Counter(predicted) & collections.Counter(expected)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(predicted)
    recall = overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer

def generate_answers(model: torch.nn.Module,
                    df_input: pd.DataFrame,
                    max_length: int,
                    tokenizer: any) -> pd.DataFrame:

    '''
    Given the model and the input, returns a dataframe containing the generated answers and relative F1-score
    
    Args:
        model: Torch model used to generate answers
        df_input: dataframe containing the input to the model (columns 'q', 'p' and 'a')
        max_length: max length applied in the tokenization
        tokenizer: generic tokenizer

    Returns:
        A dataframe with the columns 'generated', 'answers' and 'score', one row per row of df_input
    '''

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    input_values = tokenizer(list(df_input['q']),list(df_input['p']), 
                                padding=True, 
                                truncation=True, 
                                max_length = max_length)
                                
    input_ids, input_attention_mask = input_values['input_ids'], input_values['attention_mask']
    list_generated = []
    model.to(device)
    # Set the model in evaluation mode
    model.eval()
    # One sample at a time: each list of ids becomes a batch of size 1
    for sample_ids, sample_mask in zip(input_ids, input_attention_mask):
        generated = model.generate(input_ids=torch.tensor([sample_ids]).to(device),
                                   # The inputs are padded: pass the mask computed by the tokenizer
                                   attention_mask=torch.tensor([sample_mask]).to(device),
                                   max_length=20,
                                   repetition_penalty=5.,
                                   min_length=1,
                                   no_repeat_ngram_size=3,
                                   early_stopping=True,
                                   decoder_start_token_id = model.config.decoder_start_token_id,
                                   num_beams=2,
                                   )
        generated = tokenizer.batch_decode(generated, skip_special_tokens=True)
        list_generated.append(generated)

    # Create a dataframe and insert the real answers
    df_generated = pd.DataFrame(list_generated, columns = ['generated'])
    # Assign by position: df_input may have an index different from 0..n-1
    df_generated['answers'] = list(df_input['a'])

    # Generate and insert the F1-score
    score = [compute_f1(a_pred, a_gold)
             for a_pred, a_gold in zip(df_generated['generated'], df_generated['answers'])]
    df_generated['score'] = score

    # Print average and len
    average_score = np.mean(score)
    total = len(df_generated[df_generated['score'] != 0])
    
    print(f'Average_score: {average_score}')
    print(f'Length: {total} / {len(df_generated)}')

    # The callers index the result, so it has to be returned
    return df_generated

## Model definition

### Question generation $f_\theta(P, Q)$ with text passage $P$ and question $Q$

### BERT2BERT Bert-Tiny

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer
from tqdm import tqdm

# The same checkpoint is used for both the encoder and the decoder
model_name = 'prajjwal1/bert-tiny'
# tie_encoder_decoder to share weights and halve the number of parameters
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name,
                                                                        tie_encoder_decoder=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# set special tokens: decoding starts from CLS, SEP is the end-of-sequence token
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# set decoding params
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.repetition_penalty = 5.0
model.config.num_beams = 2
# the top-level config takes the vocabulary size of the encoder
model.config.vocab_size = model.config.encoder.vocab_size

In [None]:
#fix seed 
fix_seed(42)
#take a subset from the training set (never more rows than the available ones)
max_train = len(train_df)
t_start = 0
t_end = min(80000, max_train)
#take a subset from the validation set (never more rows than the available ones)
max_val = len(val_df)
v_start = 0
v_end = min(20000, max_val)
print(f'Training set shape: {(t_end-t_start)}\nValidation set shape: {(v_end-v_start)}')

In [None]:
def encodings(tokenizer,encoder_max_length, decoder_max_length, questions, contexts, answers):
    """Tokenize the question/context pairs and the answers used as labels.

    Args:
        tokenizer: the tokenizer to call; it must expose `pad_token_id`.
        encoder_max_length: max length of the tokenized question/context pairs.
        decoder_max_length: max length of the tokenized answers.
        questions: the list of questions.
        contexts: the list of contexts, truncated when a pair is too long.
        answers: the list of answers.

    Returns:
        The encoding of the question/context pairs, without 'token_type_ids'
        and with the additional 'labels' entry.
    """
    model_inputs = tokenizer(questions, contexts,
                             padding=True,
                             truncation='only_second',
                             max_length=encoder_max_length,
                             )
    label_values = tokenizer(answers,
                             padding=True,
                             truncation=True,
                             max_length=decoder_max_length,
                             )
    #Tokens with indices set to ``-100`` are ignored (masked) during training, the loss is only computed for the tokens with labels
    pad_token_id = tokenizer.pad_token_id
    masked_labels = [[-100 if token == pad_token_id else token for token in label_ids]
                     for label_ids in label_values['input_ids']]

    #add the labels to the batch encodings dictionary, then this will be used to create a pytorch dataset
    #not every tokenizer returns token_type_ids: drop them only when present
    model_inputs.pop('token_type_ids', None)
    model_inputs.update({'labels': masked_labels})
    return model_inputs

In [None]:
encoder_max_length = 499
decoder_max_length = 25
train_encodings = encodings(tokenizer, encoder_max_length, decoder_max_length,
                            list(train_df[t_start:t_end]['q']), 
                            list(train_df[t_start:t_end]['p']),
                            list(train_df[t_start:t_end]['a']))
val_encodings = encodings(tokenizer, encoder_max_length, decoder_max_length,
                            list(val_df[v_start:v_end]['q']), 
                            list(val_df[v_start:v_end]['p']),
                            list(val_df[v_start:v_end]['a']))

In [None]:
def execute(model: torch.nn.Module,
            starting_lr: float, 
            num_epochs: int, 
            data_loader_train: torch.utils.data.DataLoader,
            data_loader_val: torch.utils.data.DataLoader) -> None:
    """Creates an AdamW optimizer for the model and executes the training loop.

    Args:
        model: the network to train.
        starting_lr: the starting learning rate.
        num_epochs: the number of epochs.
        data_loader_train: the data loader with training data.
        data_loader_val: the data loader with validation data.
    """
    #Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=starting_lr)
    print('Start training.')
    # The statistics returned by the training loop are not used here
    training_loop(num_epochs, optimizer, model,
                  data_loader_train, data_loader_val)
    print('Training complete.')


In [None]:
#parameters
batch_size = 16
num_epochs = 3
lr = 4e-4

In [None]:
#create training dataset
train_dataset = CreateDataset(train_encodings)
#create training dataloader
#the rows are ordered by story: shuffle them so a batch is not made of
#consecutive questions about the same passage
train_ld = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     )
#create validation dataset
val_dataset = CreateDataset(val_encodings)
#create validation dataloader (the order does not matter for the mean loss)
val_ld = torch.utils.data.DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     )
#execute
execute(model, lr, num_epochs, train_ld, val_ld)

In [None]:
del train_encodings, val_encodings
free_some_memory()

#### Generation

Encode test set using the tokenizer defined before.

In [None]:
df_generated = generate_answers(model=model, 
                                df_input=test_df,
                                max_length = 499,
                                tokenizer=tokenizer)

In [None]:
pd.set_option('display.max_rows', None)
correct_answers = df_generated[df_generated['score'] != 0].reset_index(drop=True)
correct_answers.head(200)

In [None]:
# del tokenizer, input_values, input_ids, input_attention_mask, model, l, x, correct
del tokenizer, model, df_generated, correct_answers

free_some_memory()

### BERT2BERT Distilroberta-base

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer
from tqdm import tqdm


# The same checkpoint is used for both the encoder and the decoder
model_name = 'distilroberta-base'

# tie_encoder_decoder to share weights and halve the number of parameters
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name, tie_encoder_decoder=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# set special tokens: decoding starts from CLS, SEP is the end-of-sequence token
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# set decoding params
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.repetition_penalty = 5.0
model.config.num_beams = 2
# the top-level config takes the vocabulary size of the encoder
model.config.vocab_size = model.config.encoder.vocab_size


In [None]:
#fix seed 
fix_seed(42)
#take a subset from the training set (never more rows than the available ones)
max_train = len(train_df)
t_start = 0
t_end = min(60000, max_train)
#take a subset from the validation set (never more rows than the available ones)
max_val = len(val_df)
v_start = 0
v_end = min(20000, max_val)
print(f'Training set shape: {(t_end-t_start)}\nValidation set shape: {(v_end-v_start)}')

In [None]:
def encodings(tokenizer,encoder_max_length, decoder_max_length, questions, contexts, answers):
    """Tokenize the question/context pairs and the answers used as labels.

    Args:
        tokenizer: the tokenizer to call; it must expose `pad_token_id`.
        encoder_max_length: max length of the tokenized question/context pairs.
        decoder_max_length: max length of the tokenized answers.
        questions: the list of questions.
        contexts: the list of contexts, truncated when a pair is too long.
        answers: the list of answers.

    Returns:
        The encoding of the question/context pairs with the additional 'labels' entry.
    """
    # Distinct local names: the result must not shadow this function, nor the
    # comprehension variable the list it iterates over
    model_inputs = tokenizer(questions, contexts,
                             padding=True,
                             truncation='only_second',
                             max_length=encoder_max_length,
                             )
    label_values = tokenizer(answers,
                             padding=True,
                             truncation=True,
                             max_length=decoder_max_length,
                             )
    #Tokens with indices set to ``-100`` are ignored (masked) during training, the loss is only computed for the tokens with labels
    pad_token_id = tokenizer.pad_token_id
    masked_labels = [[-100 if token == pad_token_id else token for token in label_ids]
                     for label_ids in label_values['input_ids']]

    #add the masked_labels to the batch encodings dictionary, then this will be used to create a pytorch dataset
    model_inputs.update({'labels': masked_labels})
    return model_inputs

In [None]:
encoder_max_length = 512
decoder_max_length = 25
train_encodings = encodings(tokenizer, encoder_max_length, decoder_max_length,
                            list(train_df[t_start:t_end]['q']), 
                            list(train_df[t_start:t_end]['p']),
                            list(train_df[t_start:t_end]['a']))
val_encodings = encodings(tokenizer, encoder_max_length, decoder_max_length,
                            list(val_df[v_start:v_end]['q']), 
                            list(val_df[v_start:v_end]['p']),
                            list(val_df[v_start:v_end]['a']))

In [None]:
def execute(model: torch.nn.Module,
            starting_lr: float, 
            num_epochs: int, 
            data_loader_train: torch.utils.data.DataLoader,
            data_loader_val: torch.utils.data.DataLoader) -> None:
    """Creates an AdamW optimizer for the model and executes the training loop.

    Args:
        model: the network to train.
        starting_lr: the starting learning rate.
        num_epochs: the number of epochs.
        data_loader_train: the data loader with training data.
        data_loader_val: the data loader with validation data.
    """
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=starting_lr)
    print('Start training.')
    # The statistics returned by the training loop are not used here
    training_loop(num_epochs, optimizer, model, data_loader_train, data_loader_val)
    print('Training complete.')

In [None]:
#parameters
batch_size = 16
num_epochs = 3
#also try with lr = 4e-4
lr = 4e-5

In [None]:
#create training dataset
train_dataset = CreateDataset(train_encodings)
#create training dataloader
#the rows are ordered by story: shuffle them so a batch is not made of
#consecutive questions about the same passage
train_ld = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     )
#create validation dataset
val_dataset = CreateDataset(val_encodings)
#create validation dataloader (the order does not matter for the mean loss)
val_ld = torch.utils.data.DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     )
#execute
execute(model, lr, num_epochs, train_ld, val_ld)

In [None]:
del train_encodings, val_encodings
free_some_memory()

#### Generation

Encode test set using the tokenizer defined before.

In [None]:
df_generated = generate_answers(model=model, 
                                df_input=test_df,
                                max_length = 512,
                                tokenizer=tokenizer)

In [None]:
pd.set_option('display.max_rows', None)
correct_answers = df_generated[df_generated['score'] != 0]
correct_answers = correct_answers.reset_index(drop=True)
correct_answers.head(200)

### Question generation $f_\theta(P, Q, H)$ with text passage $P$, question $Q$ and dialogue history $H$

In [None]:
# TODO

## Train and evaluate $f_\theta(P, Q)$ and $f_\theta(P, Q, H)$

In [None]:
# TODO

## Conclusions

In [None]:
# TODO