[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Danysan1/ai-unibo-nlp-project/blob/main/a2/execution.ipynb)
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/Danysan1/ai-unibo-nlp-project/blob/main/a2/execution.ipynb)

# Assignment 2 execution

In [None]:
%pip install pandas numpy matplotlib transformers==4.25.1  dataset tensorflow_addons

## Data loading

### Dataset download

In [None]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    """tqdm progress bar exposing a callback usable as a ``urlretrieve`` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Move the bar to the absolute position ``b * bsize``.

        :param b: number of blocks transferred so far
        :param bsize: size of one block
        :param tsize: total size; when not None it overwrites ``self.total``
        """
        if tsize is not None:
            self.total = tsize
        # tqdm.update() expects an increment, so subtract what is already shown.
        transferred = b * bsize
        self.update(transferred - self.n)
        
def download_url(url, output_path):
    """
    Fetch ``url`` into ``output_path`` while showing a byte-based progress bar.

    :param url: address of the resource to download
    :param output_path: local file name the resource is written to
    """
    # The bar is labelled with the last path component of the URL.
    file_label = url.split('/')[-1]
    progress_bar = DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=file_label)
    with progress_bar as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):
    """
    Download one CoQA split into ``data_path`` unless it is already present.

    The file is stored as ``<data_path>/<suffix>.json``. Nothing is downloaded
    when that file already exists.

    :param data_path: destination folder (created when missing)
    :param url_path: URL of the JSON file to download
    :param suffix: split name, used as the file name without extension
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_path, exist_ok=True)

    target_path = os.path.join(data_path, f'{suffix}.json')
    if os.path.exists(target_path):
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    # Download under a temporary name and rename only on success: an interrupted
    # transfer must not leave a truncated file that later runs mistake for a
    # complete dataset.
    partial_path = f'{target_path}.part'
    try:
        download_url(url=url_path, output_path=partial_path)
    except BaseException:
        # Also covers KeyboardInterrupt; the error is re-raised after cleanup.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    os.replace(partial_path, target_path)
    print("Download completed!")

In [None]:
data_folder = 'Dataset'

In [None]:
# Train & Validation data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path=data_folder, url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path=data_folder, url_path=test_url, suffix='test')

### Dataset loading

In [None]:
import numpy as np
import pandas as pd
import json
from os import path
from matplotlib import pyplot as plt

In [None]:
def loadDataset(filename, folder=None):
    """
    Load a CoQA JSON split into a flat DataFrame with one row per question-answer pair.

    Pairs whose answer ``input_text`` is exactly ``'unknown'`` are skipped.

    :param filename: name of the JSON file to read
    :param folder: folder containing the file; defaults to the module-level ``data_folder``

    :return
        df: DataFrame with columns ``source`` and ``p`` (categorical: story source
            and story text) and ``q``, ``a``, ``span`` (question text, answer
            text, answer span text)
    """
    if folder is None:
        folder = data_folder
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path.join(folder, filename), encoding="utf-8") as file_obj:
        stories = json.load(file_obj)["data"]
    first_row_questions = len(stories[0]["questions"]) if stories else 0
    print(f'{len(stories)} stories / {first_row_questions} questions in the first row')

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    storyDType = pd.CategoricalDtype(list(dict.fromkeys(story["story"] for story in stories)))
    print(f"{storyDType.categories.size} distinct stories")

    sourceDType = pd.CategoricalDtype(list(dict.fromkeys(story["source"] for story in stories)))
    print(f"{sourceDType.categories.size} distinct sources: {sourceDType.categories}")

    # Value -> category code lookups (position inside the categories index).
    source_codes = {name: code for code, name in enumerate(sourceDType.categories)}
    story_codes = {text: code for code, text in enumerate(storyDType.categories)}

    # Columns are collected separately: a single mixed int/str numpy array would
    # be coerced to one fixed-width unicode dtype, sized by the longest string.
    source_col, story_col, question_col, answer_col, span_col = [], [], [], [], []
    for story in stories:
        source_code = source_codes[story["source"]]  # Sources factorization
        story_code = story_codes[story["story"]]  # Stories factorization
        for question_index, question in enumerate(story["questions"]):
            answer = story["answers"][question_index]
            if answer["input_text"] == 'unknown':
                continue
            source_col.append(source_code)
            story_col.append(story_code)
            question_col.append(question["input_text"])
            answer_col.append(answer["input_text"])
            span_col.append(answer["span_text"])

    print(f'{(len(question_col), 5)} question-answer pairs x columns')
    if question_col:
        first_row = [source_col[0], story_col[0], question_col[0], answer_col[0], span_col[0]]
        print(f'First row: {first_row}')

    # https://marcobonzanini.com/2021/09/15/tips-for-saving-memory-with-pandas/
    # https://pandas.pydata.org/docs/user_guide/categorical.html
    # int32 codes: int16 would overflow beyond 32767 distinct categories.
    df = pd.DataFrame({
        "source": pd.Categorical.from_codes(np.asarray(source_col, dtype=np.int32), dtype=sourceDType),
        "p": pd.Categorical.from_codes(np.asarray(story_col, dtype=np.int32), dtype=storyDType),
        "q": question_col,
        "a": answer_col,
        "span": span_col,
    })

    return df

In [None]:
train_df = loadDataset("train.json")
train_df.count()

In [None]:
pd.unique(train_df["p"]).size

In [None]:
pd.unique(train_df["span"]).size

In [None]:
pd.unique(train_df["source"]).size

In [None]:
train_df.head()

In [None]:
train_df.memory_usage(deep=True)

In [None]:
test_df = loadDataset("test.json")
test_df.count()

## Data Pre-Processing

### Check unanswerable questions in the Train Dataset

In [None]:
idx = (train_df.a == 'unknown')
unanswerable = train_df[idx]
unanswerable.q.count()

All unanswerable questions in the Train Dataset have already been removed (they are filtered out while loading the dataset).

## Exploratory Data Analysis

In [None]:
train_df["p"][42]

In [None]:
train_df["q"][42]

In [None]:
train_df["a"][42]

In [None]:
train_df["span"][42]

In [None]:
train_df["source"][42]

### Distribution statistics

Sources:

In [None]:
train_df["source"].hist()

Occurrences of the 25 most popular stories:

In [None]:
story_counts = train_df["p"].cat.codes.value_counts(sort=True)
story_counts[:25].plot(kind="bar", figsize=(15,5))

Occurrences of the 25 least popular stories:

In [None]:
# Open-ended positional slice: a "-1" stop would drop the last (least popular)
# story and plot only 24 bars.
story_counts.iloc[-25:].plot(kind="bar", figsize=(15,5))

Histogram of story popularities:

In [None]:
story_counts.hist(log=True,bins=75,figsize=(15,5))

### Removing rows with outlier story lengths to save memory

In [None]:
train_df.count()

LOGARITHMIC histogram of story length:

In [None]:
story_lengths = train_df["p"].str.len()
story_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
p_length_limit = story_lengths.quantile(0.999)
p_length_limit

In [None]:
p_length_mask = story_lengths < p_length_limit
p_length_mask.value_counts()

In [None]:
train_df = train_df[p_length_mask]
train_df.count()

### Removing rows with outlier question/answer/span lengths to save memory

LOGARITHMIC histogram of question length:

In [None]:
question_lengths = train_df["q"].str.len()
question_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
q_length_limit = question_lengths.quantile(0.999)
q_length_limit

LOGARITHMIC histogram of answer length:

In [None]:
answer_lengths = train_df["a"].str.len()
answer_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
a_length_limit = answer_lengths.quantile(0.999)
a_length_limit

In [None]:
span_lengths = train_df["span"].str.len()
span_lengths.hist(log=True,bins=75,figsize=(15,5))

In [None]:
span_length_limit = span_lengths.quantile(0.999)
span_length_limit

In [None]:
bad_length_mask = (question_lengths > q_length_limit) | (answer_lengths > a_length_limit) | (span_lengths > span_length_limit)
bad_length_mask.value_counts()

In [None]:
excluded_stories = train_df["p"][bad_length_mask].unique()
len(excluded_stories)

In [None]:
excluded_mask = ~train_df["p"].isin(excluded_stories)
excluded_mask.value_counts()

In [None]:
train_df = train_df[excluded_mask]
train_df.count()

## Train-Validation-Test split

In [None]:
# drop=True: renumber rows 0..n-1 without keeping the old index as an extra
# "index" column in the train/validation frames.
train_df = train_df.reset_index(drop=True)

In [None]:
total_rows = len(train_df)
total_rows

In [None]:
ideal_split_index = int(total_rows * 0.8)
ideal_split_index

In [None]:
train_df[ ideal_split_index-3 : ideal_split_index+1 ]

In [None]:
# Integer row positions 0..total_rows-1; np.linspace(0, total_rows, total_rows)
# produced non-integer positions and matched this only by accident.
before_split_mask = pd.Series(np.arange(total_rows)) < ideal_split_index
before_split_mask.value_counts()

In [None]:
split_story = train_df["p"][ideal_split_index - 1]
split_story_mask = train_df["p"] == split_story
split_story_mask.value_counts()

In [None]:
train_mask = before_split_mask | split_story_mask
train_mask.value_counts()

In [None]:
val_df = train_df[~train_mask]
train_df = train_df[train_mask]
len(val_df)

In [None]:
train_df.memory_usage()

In [None]:
val_df.memory_usage()

In [None]:
test_df.memory_usage()

## Model definition

### Utilities

In [None]:
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from typing import List, Dict, Callable
import random

In [None]:
def predict_data(model: keras.Model,
                x: np.ndarray,
                prediction_info: Dict):
    """
    Run ``model.predict`` on a set of examples, logging what is being done

    :param model: Keras built and possibly trained model
    :param x: input set of examples in np.ndarray format
    :param prediction_info: keyword arguments forwarded to model.predict()

    :return
        the value returned by model.predict()
    """
    sample_count = x.shape[0]
    print(f'Starting prediction: \n{prediction_info}')
    print(f'Predicting on {sample_count} samples')
    return model.predict(x, **prediction_info)

In [None]:
def compute_f1(model: keras.Model, 
             x: np.ndarray, 
             y: np.ndarray):
    """
    Macro-averaged F1 score of the model predictions on x against the labels y

    :param model: Keras built and possibly trained model
    :param x: data in np.ndarray format
    :param y: ground-truth labels in np.ndarray format

    :return
        score: f1_macro_score
    """
    # Model outputs for every sample of x
    class_scores = predict_data(model=model,
                                x=x,
                                prediction_info={'batch_size': 64, 'verbose': 1})

    # The predicted class of a sample is the index of its highest score
    predicted_classes = np.argmax(class_scores, axis=1)

    return f1_score(y, predicted_classes, average ='macro')

In [None]:
def set_reproducibility(seed):
    """
    Seed the Python, NumPy and TensorFlow random generators and turn on
    TensorFlow op determinism.

    :param seed: seed value passed to every generator
    """
    for seed_everything in (random.seed, np.random.seed, tf.random.set_seed):
        seed_everything(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    tf.config.experimental.enable_op_determinism()

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm import tqdm
from transformers import TFAutoModel, AutoTokenizer

# Enable memory growth on every GPU TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: the error is only reported, execution continues.
        # NOTE(review): presumably raised when the GPUs were already initialized — confirm against the TF docs.
        print(e)

### Answer generation $f_\theta(P, Q)$ from text passage $P$ and question $Q$

### Seq2Seq LSTM

In [None]:
class MyTrainer(object):
    """
    Simple wrapper class that trains and runs an encoder/decoder pair

    compute_loss -> cross-entropy where positions with target id 0 contribute zero
    train_op -> uses tf.GradientTape to compute the loss and its gradients
    batch_fit -> receives a batch and performs forward-backward passes (gradient included)
    generate -> greedy decoding for at most max_length steps
    translate -> converts generated token ids back to text

    generate and translate read the module-level ``tokenizer``.
    """

    def __init__(self, encoder, decoder, max_length):
        """
        :param encoder: model called with {'input_ids', 'hidden_state'} that returns
            (outputs, hidden state, cell state) and exposes ``encoder_units``
        :param decoder: model exposing ``attention``, ``build_initial_state``,
            ``batch_size``, ``embedding``, ``wrapped_decoder_cell`` and ``generation_dense``
        :param max_length: maximum number of decoding iterations used by generate
        """
        self.encoder = encoder
        self.decoder = decoder
        self.max_length = max_length
        # reduction='none' keeps the per-position losses so they can be masked.
        self.ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-03)

    @tf.function
    def compute_loss(self, logits, target):
        """
        :param logits: unnormalized predictions
        :param target: integer token ids; id 0 marks positions to ignore

        :return
            mean over all positions of the masked per-position loss
        """
        loss = self.ce(y_true=target, y_pred=logits)
        mask = tf.logical_not(tf.math.equal(target, 0))
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        # Masked positions still count in the denominator of the mean.
        return tf.reduce_mean(loss)

    @tf.function
    def train_op(self, inputs):
        """
        Forward pass and gradient computation for one batch

        :param inputs: dict with 'encoder_input_ids', 'encoder_state' and 'decoder_target'

        :return
            loss, gradients w.r.t. the encoder and decoder trainable variables
        """
        with tf.GradientTape() as tape:
            encoder_output, encoder_h, encoder_s = self.encoder({'input_ids': inputs['encoder_input_ids'],
                                                                 'hidden_state': inputs['encoder_state']})

            decoder_input = inputs['decoder_target'][:, :-1]  # ignore <end>
            real_target = inputs['decoder_target'][:, 1:]  # ignore <start>

            # Use the decoder owned by this trainer, not a module-level `decoder`
            # that may be missing or may have been re-created in the meantime.
            self.decoder.attention.setup_memory(encoder_output)

            decoder_initial_state = self.decoder.build_initial_state(self.decoder.batch_size,
                                                                     [encoder_h, encoder_s])
            predicted = self.decoder({'input_ids': decoder_input,
                                      'initial_state': decoder_initial_state}).rnn_output

            loss = self.compute_loss(logits=predicted, target=real_target)

        grads = tape.gradient(loss, self.encoder.trainable_variables + self.decoder.trainable_variables)
        return loss, grads

    @tf.function
    def batch_fit(self, inputs):
        """
        One optimization step on a batch

        :param inputs: same dict accepted by train_op

        :return
            loss of the batch, computed before the update
        """
        loss, grads = self.train_op(inputs=inputs)
        # Same variable order as in train_op, so gradients and variables line up.
        self.optimizer.apply_gradients(zip(grads, self.encoder.trainable_variables + self.decoder.trainable_variables))
        return loss

    @tf.function
    def generate(self, input_ids):
        """
        Greedy decoding of a batch of encoder inputs

        :param input_ids: encoder token ids; the first dimension is the batch size

        :return
            output of the tfa BasicDecoder (translate reads its ``sample_id``)
        """
        batch_size = input_ids.shape[0]
        # Zero initial hidden and cell states for the encoder LSTM.
        encoder_initial_state = [tf.zeros((batch_size, self.encoder.encoder_units)),
                                 tf.zeros((batch_size, self.encoder.encoder_units))]
        encoder_output, encoder_h, encoder_s = self.encoder({
            'input_ids': input_ids,
            'hidden_state': encoder_initial_state
        })

        start_tokens = tf.fill([batch_size], tokenizer.word_index['<start>'])
        end_token = tokenizer.word_index['<end>']

        # Inference reuses the trained cell and output layer with a greedy sampler.
        greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()
        decoder_instance = tfa.seq2seq.BasicDecoder(cell=self.decoder.wrapped_decoder_cell,
                                                    sampler=greedy_sampler,
                                                    output_layer=self.decoder.generation_dense,
                                                    maximum_iterations=self.max_length)
        self.decoder.attention.setup_memory(encoder_output)

        decoder_initial_state = self.decoder.build_initial_state(batch_size, [encoder_h, encoder_s])
        decoder_embedding_matrix = self.decoder.embedding.variables[0]
        outputs, _, _ = decoder_instance(decoder_embedding_matrix,
                                         start_tokens=start_tokens,
                                         end_token=end_token,
                                         initial_state=decoder_initial_state)
        return outputs

    def translate(self, generated):
        """
        :param generated: value returned by generate

        :return
            list of texts decoded from the generated ids
        """
        return tokenizer.sequences_to_texts(generated.sample_id.numpy())


class Encoder(tf.keras.Model):
    """Token embedding followed by a single LSTM that returns its sequence and final states."""

    def __init__(self, vocab_size, embedding_dim, encoder_units):
        """
        :param vocab_size: input dimension of the embedding layer
        :param embedding_dim: output dimension of the embedding layer
        :param encoder_units: number of units of the LSTM
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encoder_units = encoder_units

        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
        )
        self.encoder_lstm = tf.keras.layers.LSTM(
            self.encoder_units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: dict with 'input_ids' (token ids) and 'hidden_state'
            (initial state passed to the LSTM)

        :return
            LSTM sequence output, final hidden state, final cell state
        """
        embedded_tokens = self.embedding(inputs['input_ids'])
        sequence_output, final_hidden, final_cell = self.encoder_lstm(
            embedded_tokens,
            initial_state=inputs['hidden_state'],
        )
        return sequence_output, final_hidden, final_cell

    def initialize(self, batch_size):
        """
        :param batch_size: number of rows of each state tensor

        :return
            list of two zero tensors of shape (batch_size, encoder_units)
        """
        state_shape = (batch_size, self.encoder_units)
        return [tf.zeros(state_shape) for _ in range(2)]


class Decoder(tf.keras.Model):
    """
    LSTM cell with Bahdanau attention, driven by a tfa BasicDecoder with a training sampler

    The batch size and the maximum sequence length are fixed at construction:
    they define the sequence lengths handed to the attention mechanism and to
    the decoder.
    """

    def __init__(self, vocab_size, max_sequence_length, embedding_dim, decoder_units, batch_size):
        """
        :param vocab_size: size of the embedding input and of the output projection
        :param max_sequence_length: length used for every sequence of the batch
        :param embedding_dim: output dimension of the embedding layer
        :param decoder_units: units of the LSTM cell, of the attention and of the attention layer
        :param batch_size: number of sequences per batch
        """
        super().__init__()

        self.max_sequence_length = max_sequence_length
        self.batch_size = batch_size

        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
        )
        self.decoder_lstm_cell = tf.keras.layers.LSTMCell(self.decoder_units)

        # The memory is attached later through attention.setup_memory(...).
        self.attention = tfa.seq2seq.BahdanauAttention(
            units=self.decoder_units,
            memory=None,
            memory_sequence_length=[max_sequence_length] * self.batch_size,
        )

        self.wrapped_decoder_cell = tfa.seq2seq.AttentionWrapper(
            self.decoder_lstm_cell,
            self.attention,
            attention_layer_size=self.decoder_units,
        )

        self.generation_dense = tf.keras.layers.Dense(vocab_size)
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        self.decoder = tfa.seq2seq.BasicDecoder(
            self.wrapped_decoder_cell,
            sampler=self.sampler,
            output_layer=self.generation_dense,
        )

    def build_initial_state(self, batch_size, encoder_state):
        """
        :param batch_size: batch size of the state to create
        :param encoder_state: value stored as the cell state of the initial state

        :return
            initial state of the attention-wrapped cell with its cell state replaced
        """
        zero_state = self.wrapped_decoder_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
        return zero_state.clone(cell_state=encoder_state)

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: dict with 'input_ids' (decoder token ids) and 'initial_state'

        :return
            first element of the BasicDecoder result (callers read its ``rnn_output``)
        """
        embedded_tokens = self.embedding(inputs['input_ids'])
        # Every sequence is declared one step shorter than max_sequence_length.
        step_counts = [self.max_sequence_length - 1] * self.batch_size
        decoder_output, _, _ = self.decoder(
            embedded_tokens,
            initial_state=inputs['initial_state'],
            sequence_length=step_counts,
        )
        return decoder_output

In [None]:
# Sample
input_sample = [
    "hello there how is it going",
    "this assignment is hellish"
]
output_sample = [
    "<start> it is going well <end>",
    "<start> I agree <end>"
]

batch_size = len(input_sample)

tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<UNK>')
tokenizer.fit_on_texts(input_sample + output_sample)

vocab_size = len(tokenizer.word_index) + 1

encoded_input_sample = tokenizer.texts_to_sequences(input_sample)
max_input_length = max([len(item) for item in encoded_input_sample])

encoded_output_sample = tokenizer.texts_to_sequences(output_sample)
max_output_length = max([len(item) for item in encoded_output_sample])

max_sequence_length = max(max_input_length, max_output_length)

encoded_input_sample = tf.keras.preprocessing.sequence.pad_sequences(encoded_input_sample,
                                                                        padding='post',
                                                                        maxlen=max_sequence_length)
encoded_output_sample = tf.keras.preprocessing.sequence.pad_sequences(encoded_output_sample,
                                                                        padding='post',
                                                                        maxlen=max_sequence_length)

# Test encoder
encoder = Encoder(vocab_size=vocab_size,
                    embedding_dim=50,
                    encoder_units=16)

sample_hidden = encoder.initialize(batch_size=batch_size)
encoder_sample_batch = {
    'input_ids': tf.convert_to_tensor(encoded_input_sample, dtype=tf.int32),
    'hidden_state': sample_hidden
}

sample_output, sample_h, sample_c = encoder(inputs=encoder_sample_batch)
print(f'{sample_output.shape} -- {sample_h.shape} -- {sample_c.shape}')

# Test decoder
decoder = Decoder(vocab_size=vocab_size,
                    embedding_dim=50,
                    decoder_units=16,
                    batch_size=batch_size,
                    max_sequence_length=max_sequence_length)
decoder.attention.setup_memory(sample_output)
initial_state = decoder.build_initial_state(batch_size, [sample_h, sample_c])

decoder_sample_batch = {
    'input_ids': tf.convert_to_tensor(encoded_output_sample, tf.int32),
    'initial_state': initial_state
}
sample_decoder_outputs = decoder(decoder_sample_batch).rnn_output
print(f'{sample_decoder_outputs.shape}')

In [None]:
# Training
trainer = MyTrainer(encoder=encoder,
                    decoder=decoder,
                    max_length=max_sequence_length)

In [None]:
epochs = 100
for epoch in tqdm(range(epochs)):
    encoder_hidden_state = encoder.initialize(batch_size=batch_size)
    batch = {
        'encoder_input_ids': encoded_input_sample,
        'encoder_state': encoder_hidden_state,
        'decoder_target': encoded_output_sample
    }
    loss = trainer.batch_fit(batch)
    print(f'Loss - {loss}')

    generated = trainer.generate(input_ids=encoded_input_sample)
    translated = trainer.translate(generated)
    print(f'Translated - {translated}')

In [None]:
#TODO

### Seq2Seq Bert-Tiny

In [None]:
class MyTrainer(object):
    """
    Simple wrapper class that trains and runs an encoder/decoder pair

    compute_loss -> cross-entropy where positions with target id 0 contribute zero
    train_op -> uses tf.GradientTape to compute the loss and its gradients
    batch_fit -> receives a batch and performs forward-backward passes (gradient included)
    generate -> greedy decoding for at most max_length steps
    translate -> converts generated token ids back to text

    generate and translate read the module-level ``output_tokenizer``.
    """

    def __init__(self, encoder, decoder, max_length):
        """
        :param encoder: model called with {'input_ids', 'attention_mask'} that returns
            (outputs, hidden state, cell state)
        :param decoder: model exposing ``attention``, ``build_initial_state``,
            ``batch_size``, ``embedding``, ``wrapped_decoder_cell`` and ``generation_dense``
        :param max_length: maximum number of decoding iterations used by generate
        """
        self.encoder = encoder
        self.decoder = decoder
        self.max_length = max_length
        # reduction='none' keeps the per-position losses so they can be masked.
        self.ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-03)

    @tf.function
    def compute_loss(self, logits, target):
        """
        :param logits: unnormalized predictions
        :param target: integer token ids; id 0 marks positions to ignore

        :return
            mean over all positions of the masked per-position loss
        """
        loss = self.ce(y_true=target, y_pred=logits)
        mask = tf.logical_not(tf.math.equal(target, 0))
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        # Masked positions still count in the denominator of the mean.
        return tf.reduce_mean(loss)

    @tf.function
    def train_op(self, inputs):
        """
        Forward pass and gradient computation for one batch

        :param inputs: dict with 'encoder_input_ids', 'encoder_attention_mask'
            and 'decoder_target'

        :return
            loss, gradients w.r.t. the encoder and decoder trainable variables
        """
        with tf.GradientTape() as tape:
            encoder_output, encoder_h, encoder_s = self.encoder({'input_ids': inputs['encoder_input_ids'],
                                                                 'attention_mask': inputs['encoder_attention_mask']})

            decoder_input = inputs['decoder_target'][:, :-1]  # ignore <end>
            real_target = inputs['decoder_target'][:, 1:]  # ignore <start>

            # Use the decoder owned by this trainer, not a module-level `decoder`
            # that may be missing or may have been re-created in the meantime.
            self.decoder.attention.setup_memory(encoder_output)

            decoder_initial_state = self.decoder.build_initial_state(self.decoder.batch_size,
                                                                     [encoder_h, encoder_s])
            predicted = self.decoder({'input_ids': decoder_input,
                                      'initial_state': decoder_initial_state}).rnn_output

            loss = self.compute_loss(logits=predicted, target=real_target)

        grads = tape.gradient(loss, self.encoder.trainable_variables + self.decoder.trainable_variables)
        return loss, grads

    @tf.function
    def batch_fit(self, inputs):
        """
        One optimization step on a batch

        :param inputs: same dict accepted by train_op

        :return
            loss of the batch, computed before the update
        """
        loss, grads = self.train_op(inputs=inputs)
        # Same variable order as in train_op, so gradients and variables line up.
        self.optimizer.apply_gradients(zip(grads, self.encoder.trainable_variables + self.decoder.trainable_variables))
        return loss

    # @tf.function
    def generate(self, input_ids, attention_mask=None):
        """
        Greedy decoding of a batch of encoder inputs

        :param input_ids: encoder token ids; the first dimension is the batch size
        :param attention_mask: attention mask forwarded to the encoder (may be None)

        :return
            output of the tfa BasicDecoder (translate reads its ``sample_id``)
        """
        batch_size = input_ids.shape[0]
        encoder_output, encoder_h, encoder_s = self.encoder({
            'input_ids': input_ids,
            'attention_mask': attention_mask
        })

        start_tokens = tf.fill([batch_size], output_tokenizer.word_index['<start>'])
        end_token = output_tokenizer.word_index['<end>']

        # Inference reuses the trained cell and output layer with a greedy sampler.
        greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()
        decoder_instance = tfa.seq2seq.BasicDecoder(cell=self.decoder.wrapped_decoder_cell,
                                                    sampler=greedy_sampler,
                                                    output_layer=self.decoder.generation_dense,
                                                    maximum_iterations=self.max_length)
        self.decoder.attention.setup_memory(encoder_output)

        decoder_initial_state = self.decoder.build_initial_state(batch_size, [encoder_h, encoder_s])
        decoder_embedding_matrix = self.decoder.embedding.variables[0]
        outputs, _, _ = decoder_instance(decoder_embedding_matrix,
                                         start_tokens=start_tokens,
                                         end_token=end_token,
                                         initial_state=decoder_initial_state)
        return outputs

    def translate(self, generated):
        """
        :param generated: value returned by generate

        :return
            list of texts decoded from the generated ids
        """
        return output_tokenizer.sequences_to_texts(generated.sample_id.numpy())


class Encoder(tf.keras.Model):
    """Pretrained transformer encoder whose pooled output is projected to the decoder size."""

    def __init__(self, model_name, decoder_units):
        """
        :param model_name: identifier given to TFAutoModel.from_pretrained (PyTorch weights)
        :param decoder_units: output size of the dense layer applied to the pooled output
        """
        super().__init__()
        self.model = TFAutoModel.from_pretrained(model_name, from_pt=True)
        self.reducer = tf.keras.layers.Dense(decoder_units)

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: value passed unchanged to the wrapped transformer model

        :return
            element 0 of the transformer output, followed by the reduced pooled
            output twice (it serves as both hidden and cell state for the decoder)
        """
        transformer_output = self.model(inputs)
        token_outputs = transformer_output[0]
        # Element 1 is treated as the pooled output and shrunk to decoder_units.
        reduced_pooled = self.reducer(transformer_output[1])
        return token_outputs, reduced_pooled, reduced_pooled


class Decoder(tf.keras.Model):
    """
    LSTM cell with Bahdanau attention, driven by a tfa BasicDecoder with a training sampler

    The batch size and the maximum sequence length are fixed at construction:
    they define the sequence lengths handed to the attention mechanism and to
    the decoder.
    """

    def __init__(self, vocab_size, max_sequence_length, embedding_dim, decoder_units, batch_size):
        """
        :param vocab_size: size of the embedding input and of the output projection
        :param max_sequence_length: length used for every sequence of the batch
        :param embedding_dim: output dimension of the embedding layer
        :param decoder_units: units of the LSTM cell, of the attention and of the attention layer
        :param batch_size: number of sequences per batch
        """
        super(Decoder, self).__init__()

        self.max_sequence_length = max_sequence_length
        self.batch_size = batch_size

        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                   output_dim=embedding_dim)
        self.decoder_lstm_cell = tf.keras.layers.LSTMCell(self.decoder_units)

        # The memory is attached later through attention.setup_memory(...);
        # every memory sequence is declared max_sequence_length long.
        self.attention = tfa.seq2seq.BahdanauAttention(units=self.decoder_units,
                                                       memory=None,
                                                       memory_sequence_length=self.batch_size * [max_sequence_length])

        self.wrapped_decoder_cell = tfa.seq2seq.AttentionWrapper(self.decoder_lstm_cell,
                                                                 self.attention,
                                                                 attention_layer_size=self.decoder_units)

        # Projection to vocabulary size, used as the decoder output layer.
        self.generation_dense = tf.keras.layers.Dense(vocab_size)
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        self.decoder = tfa.seq2seq.BasicDecoder(self.wrapped_decoder_cell,
                                                sampler=self.sampler,
                                                output_layer=self.generation_dense)

    def build_initial_state(self, batch_size, encoder_state):
        """
        :param batch_size: batch size of the state to create
        :param encoder_state: value stored as the cell state of the initial state

        :return
            initial state of the attention-wrapped cell with its cell state replaced
        """
        initial_state = self.wrapped_decoder_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
        initial_state = initial_state.clone(cell_state=encoder_state)
        return initial_state

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: dict with 'input_ids' (decoder token ids) and 'initial_state'

        :return
            first element of the BasicDecoder result (callers read its ``rnn_output``)
        """
        input_ids = inputs['input_ids']
        input_emb = self.embedding(input_ids)
        # Every sequence is declared one step shorter than max_sequence_length.
        decoder_output, _, _ = self.decoder(input_emb,
                                            initial_state=inputs['initial_state'],
                                            sequence_length=self.batch_size * [self.max_sequence_length - 1])
        return decoder_output


In [None]:
from transformers import BertForQuestionAnswering, AutoTokenizer, AutoConfig

model_name = 'prajjwal1/bert-tiny'

#config = AutoConfig.from_pretrained(model_name)
#model = BertForQuestionAnswering.from_pretrained(model_name, config=config)
input_tokenizer = AutoTokenizer.from_pretrained(model_name)

The next block of code shows how a question-context pair is encoded: the question forms the first part of the encoding and the context the second. Two special tokens are used: the [CLS] token at the start of the encoding, and the [SEP] token, which appears both between the question and the context and at the end of the encoding.

In this case the context is the *span*, to provide a better example that explains the encoding.

In [None]:
line = 42

encoded_question = input_tokenizer(train_df['q'][line], return_tensors='tf', padding=True)
print(train_df['q'][line])

encoded_span = input_tokenizer(train_df['span'][line], return_tensors='tf', padding=True)
print(train_df['span'][line])

encoded_qs = input_tokenizer(train_df['q'][line], train_df['span'][line], return_tensors='tf', padding=True)

print('= '*40)
for idx, tok in zip(encoded_qs.input_ids.numpy()[0], input_tokenizer.convert_ids_to_tokens(encoded_qs.input_ids[0])):
    print("{}\t{}".format(idx, tok))

Let's encode only a part of the dataset, as sequences of the form [CLS] question [SEP] passage [SEP]; encoding the whole dataset would make training very slow.

In [None]:
max_length = 512  # The maximum length of a feature (question and context)
doc_stride = (
    128  # The authorized overlap between two part of the context when splitting
)
sentences = 20
sample = 10

In [None]:
# Input
# Positional selection of the first `sentences` rows (independent of index labels).
qs = train_df['q'].iloc[:sentences] # questions
cs = train_df['p'].iloc[:sentences] # contexts

batch_size = len(qs)

# Exactly one encoded row per (question, context) pair: contexts that do not fit
# in max_length are truncated. Overflowing tokens are NOT returned as extra rows,
# otherwise the encoder batch could hold more rows than `batch_size` and than
# the decoder targets built from the answers.
encoded_inputs = input_tokenizer(
    qs.values.tolist(),
    cs.values.tolist(),
    #train_df['q'].values.tolist(),
    #train_df['p'].values.tolist(),
    truncation="only_second",
    max_length=max_length,
    return_offsets_mapping=True,
    padding="max_length",
    return_tensors='tf'
)

input_ids, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
max_input_length = input_ids.shape[-1]

In [None]:
print("max_input_length:", max_input_length)
print("encoded_inputs shape =", encoded_inputs['input_ids'].shape)

The 'token_type_ids' encode whether each token id belongs to the question (=0) or to the context (=1). The attention mask indicates whether a position holds real input (=1) or padding (=0).

Prepare also the expected outputs, for the training (this code follows the example given by the tutors, but I'm not convinced that this is the proper formatting for a QA Bert model).

In [None]:
# Output
outputs = "<start> " + train_df['a'][range(sentences)] + " <end>"

output_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<UNK>')
output_tokenizer.fit_on_texts(outputs)

output_vocab_size = len(output_tokenizer.word_index) + 1

encoded_output = output_tokenizer.texts_to_sequences(outputs)
print(encoded_output[sample])
max_output_length = max([len(item) for item in encoded_output])

In [None]:
max_sequence_length = max(max_input_length, max_output_length)

print("max_output_length: {}".format(max_output_length))
print("max_sequence_length: {}".format(max_sequence_length))

In [None]:
encoded_output = tf.keras.preprocessing.sequence.pad_sequences(encoded_output,
                                                                        padding='post',
                                                                        maxlen=max_sequence_length)
print(encoded_output[sample])

In [None]:
# Test encoder
encoder = Encoder(model_name=model_name,
                    decoder_units=16)
encoder_output, encoder_h, encoder_s = encoder({'input_ids': input_ids,
                                                'attention_mask': attention_mask})
print(f'{encoder_output.shape} - {encoder_h.shape} - {encoder_s.shape}')

In [None]:
# Test decoder
decoder = Decoder(vocab_size=output_vocab_size,
                    embedding_dim=50,
                    decoder_units=16,
                    batch_size=batch_size,
                    max_sequence_length=max_sequence_length)
decoder.attention.setup_memory(encoder_output)
initial_state = decoder.build_initial_state(batch_size, [encoder_h, encoder_s])

decoder_batch = {
    'input_ids': tf.convert_to_tensor(encoded_output, tf.int32),
    'initial_state': initial_state
}
decoder_outputs = decoder(decoder_batch).rnn_output
print(f'{decoder_outputs.shape}')

In [None]:
# Training
trainer = MyTrainer(encoder=encoder,
                    decoder=decoder,
                    max_length=max_sequence_length)

In [None]:
epochs = 3
for epoch in tqdm(range(epochs)):
    batch = {
        'encoder_input_ids': input_ids,
        'encoder_attention_mask': attention_mask,
        'decoder_target': encoded_output
    }
    loss = trainer.batch_fit(batch)
    print(f'Loss - {loss}')

    generated = trainer.generate(input_ids=input_ids,
                                    attention_mask=attention_mask)
    translated = trainer.translate(generated)
    print(f'Translated - {translated}')

An example of a question answered by the pretrained (*original*) model.

In [None]:
from transformers import TFBertForQuestionAnswering, pipeline

model = TFBertForQuestionAnswering.from_pretrained(model_name, from_pt=True)

# Hand the model loaded above to the pipeline instead of letting it load the
# same checkpoint a second time by name. The tokenizer must be given explicitly
# when the model is passed as an object.
question_answerer = pipeline("question-answering", model=model, tokenizer=input_tokenizer)

outputs = question_answerer(question=train_df['q'][0], context=train_df['p'][0])

print("model outputs:", outputs)
print()
print("official results are (from train.json):") 
print("span_start: 151")
print("span_end: 179")
print("span_text: Formally established in 1475")
print("input_text: It was formally established in 1475")
#print("start scores: {}".format(start_scores))
#print("end scores: {}".format(end_scores))

### BERT2BERT Bert-Tiny

In [None]:
#take a subset from the training set
start = 0
end = 60_000
contexts = list(train_df['p'])
questions = list(train_df['q'])
answers = list(train_df['a'])
contexts = contexts[start:end]
questions = questions[start:end]
answers = answers[start:end]
len(contexts)

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer
from tqdm import tqdm


model_name = 'prajjwal1/bert-tiny'

# Encoder and decoder both start from the same checkpoint;
# tie_encoder_decoder shares their weights and halves the number of parameters.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    model_name,
    model_name,
    tie_encoder_decoder=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Special tokens: decoding starts with [CLS], ends at [SEP], and padding uses the tokenizer's pad id.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id

# The seq2seq config takes its vocabulary size from the encoder.
model.config.vocab_size = model.config.encoder.vocab_size

# Default decoding parameters.
model.config.num_beams = 2
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.repetition_penalty = 5.0
model.config.no_repeat_ngram_size = 3


In [None]:
# Encode (question, passage) pairs; 'only_second' means only the passage is ever truncated.
encodings = tokenizer(
    questions,
    contexts,
    padding=True,
    truncation='only_second',
    max_length=499,
)
input_ids = encodings['input_ids']
input_attention_mask = encodings['attention_mask']

# Encode the reference answers that serve as decoder targets.
label_values = tokenizer(answers, padding=True, truncation=True, max_length=25)
labels = label_values['input_ids']
labels_mask = label_values['attention_mask']

# Tokens with indices set to ``-100`` are ignored (masked) during training,
# so the loss is only computed for the real (non-padding) label tokens.
masked_labels = [
    [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in answer_ids]
    for answer_ids in labels
]
print(f'length of input_ids: {np.shape(input_ids)}')

In [None]:
# Show which fields the tokenizer returned.
encodings.keys()

In [None]:
# token_type_ids is not passed to the model by the training loop, so it is dropped here.
# A default is supplied so that re-running this cell (or using a tokenizer that
# returns no token_type_ids) does not raise KeyError.
encodings.pop('token_type_ids', None)
# Store the padding-masked answer ids as the training targets.
encodings['labels'] = masked_labels
encodings.keys()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):
    """Map-style dataset over a mapping of tokenized fields.

    Args:
        encodings: mapping from field name (it must contain ``'input_ids'``)
            to a sequence with one entry per example. Both tokenizer outputs
            and plain dictionaries are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, measured on the ``'input_ids'`` field."""
        # Key lookup instead of attribute access: it matches the mapping
        # interface used by __getitem__ and also works for plain dicts.
        return len(self.encodings['input_ids'])

In [None]:
# Training hyperparameters for the bert-tiny run
batch_size = 16  # examples per DataLoader batch
num_epochs = 3  # full passes over the training data
lr = 4e-4  # learning rate handed to the AdamW optimizer

In [None]:
from timeit import default_timer as timer
# Create the training dataset
train_dataset = CustomTextDataset(encodings)
# Create the training dataloader. Batches are reshuffled every epoch.
# NOTE(review): assumes train_df keeps the order of the source file, where neighbouring
# questions share a passage, so unshuffled batches would be strongly correlated - confirm.
train_ld = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     )

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
loop_start = timer()
for epoch in range(num_epochs):
    model.train()
    loss_score = []  # per-batch losses of this epoch
    # The description is constant within an epoch, so it is set once here.
    loop = tqdm(train_ld, desc=f'Epoch {epoch}')
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # With labels supplied, the first element of the model output is the loss.
        loss = model(input_ids,
                     attention_mask=attention_mask,
                     labels=labels
                     )[0]
        loss.backward()
        optim.step()

        # Read the scalar once and reuse it for both the log and the progress bar.
        batch_loss = loss.item()
        loss_score.append(batch_loss)
        loop.set_postfix(loss=batch_loss)
    average_loss = np.mean(loss_score)
    print(f"\nEpoch: {epoch}, average Loss: {average_loss}")
loop_end = timer()
time_loop = loop_end - loop_start
print(f'\nTime for {num_epochs} epochs (s): {(time_loop):.3f}')

In [None]:
# Free some memory
import gc
# Drop the references to the training inputs and the last batch tensors.
del encodings, input_ids, input_attention_mask, labels
# Collect first, so that memory released by the collector can be handed back below.
gc.collect()
# The CUDA calls are skipped on CPU-only machines, matching the device fallback used for training.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_accumulated_memory_stats()

#### Generation

Tokenize the test dataset.

In [None]:
# Encode every (question, passage) pair of the test split, padded to a common length.
input_values = tokenizer(
    list(test_df['q']),
    list(test_df['p']),
    padding=True,
    truncation=True,
    max_length=499,
)
input_ids = input_values['input_ids']
input_attention_mask = input_values['attention_mask']

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
l = []  # one single-element list of decoded text per test example
model.to(device)
# Set the model in evaluation mode
model.eval()
for example_ids, example_mask in zip(input_ids, input_attention_mask):
    # Wrapping the id list in another list gives the batch dimension of size 1.
    generated = model.generate(
        input_ids=torch.tensor([example_ids], device=device),
        # The inputs are padded, so the tokenizer's mask is passed explicitly
        # (it used to be computed and then discarded).
        attention_mask=torch.tensor([example_mask], device=device),
        max_length=20,
        repetition_penalty=5.,
        min_length=1,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,
        num_beams=2,
    )
    generated = tokenizer.batch_decode(generated, skip_special_tokens=True)
    l.append(generated)

In [None]:
# Every entry of l is a one-element list, so each becomes one row of the 'generated' column.
x = pd.DataFrame(l, columns = ['generated'])
# Attach questions and answers by position rather than by index label: the predictions
# were produced in the row order of test_df, whatever its index looks like.
x['questions'] = list(test_df['q'])
x['answers'] = list(test_df['a'])

In [None]:
#utility functions taken from the allennlp library for computing the F1-score
import collections
import re
import string
from typing import Callable, Sequence, TypeVar, Tuple

def get_tokens(s):
    """Return the normalized, whitespace-separated tokens of *s* (empty list for falsy input)."""
    return normalize_answer(s).split() if s else []

def normalize_answer(s):
    """Lower-case *s* and strip punctuation, English articles and surplus whitespace."""
    # Lower-case first, so the article pattern only has to match lower-case forms.
    lowered = s.lower()
    # Drop every ASCII punctuation character.
    punctuation = set(string.punctuation)
    unpunctuated = "".join(ch for ch in lowered if ch not in punctuation)
    # Replace standalone articles with a blank so the neighbouring words stay apart.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
    # Collapse whitespace runs and trim both ends.
    return " ".join(without_articles.split())

def compute_f1(a_pred: str, a_gold: str) -> float:
    """Return the token-level F1 score between a predicted and a gold answer.

    Both answers are tokenized with ``get_tokens``. When either side has no
    tokens the score is 1.0 if both are empty and 0.0 otherwise.
    """
    predicted = get_tokens(a_pred)
    expected = get_tokens(a_gold)
    if not predicted or not expected:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return float(predicted == expected)

    # Multiset intersection: each token counts as often as it occurs on both sides.
    shared = collections.Counter(predicted) & collections.Counter(expected)
    num_same = sum(shared.values())
    if num_same == 0:
        return 0.0

    precision = 1.0 * num_same / len(predicted)
    recall = 1.0 * num_same / len(expected)
    return (2 * precision * recall) / (precision + recall)

In [None]:
# F1 of every generated answer against its reference answer.
predictions = x['generated']
true_answers = x['answers']
score = [compute_f1(a_pred, a_gold) for a_pred, a_gold in zip(predictions, true_answers)]

average_score = np.mean(score)
print(f'average_score: {average_score}')

# Keep the per-row scores and count the rows with a non-zero F1.
x['score'] = score
total = len(x[x['score'] != 0])
print(f'length: {total} / {len(x)}')


In [None]:
# Lift the row display limit, then show the first rows that scored above zero.
pd.set_option('display.max_rows', None)
correct = x[x['score'] != 0]
correct = correct.reset_index(drop=True)
correct.head(200)

### BERT2BERT Distilroberta-base

In [None]:
# Use every row of the training set.
contexts, questions, answers = (list(train_df[column]) for column in ('p', 'q', 'a'))

In [None]:
# Keep only the training rows in [start, end).
start = 0
end = 60_000
contexts = list(train_df['p'])[start:end]
questions = list(train_df['q'])[start:end]
answers = list(train_df['a'])[start:end]
len(contexts)

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer
from tqdm import tqdm


model_name = 'distilroberta-base'

# Encoder and decoder both start from the same checkpoint;
# tie_encoder_decoder shares their weights and halves the number of parameters.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    model_name,
    model_name,
    tie_encoder_decoder=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Special tokens: decoding starts with the tokenizer's cls token, ends at its sep token,
# and padding uses its pad id.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id

# The seq2seq config takes its vocabulary size from the encoder.
model.config.vocab_size = model.config.encoder.vocab_size

# Default decoding parameters.
model.config.num_beams = 2
model.config.early_stopping = True
model.config.repetition_penalty = 5.0
model.config.no_repeat_ngram_size = 3


In [None]:
# Encode (question, passage) pairs, truncated to at most 512 tokens.
encodings = tokenizer(
    questions,
    contexts,
    padding=True,
    truncation=True,
    max_length=512,
)
input_ids = encodings['input_ids']
input_attention_mask = encodings['attention_mask']

# Encode the reference answers that serve as decoder targets.
label_values = tokenizer(answers, padding=True, truncation=True, max_length=25)
labels = label_values['input_ids']
labels_mask = label_values['attention_mask']

# Tokens with indices set to ``-100`` are ignored (masked) during training,
# so the loss is only computed for the real (non-padding) label tokens.
masked_labels = [
    [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in answer_ids]
    for answer_ids in labels
]
print(f'length of input_ids: {np.shape(input_ids)}')

In [None]:
# Show which fields the tokenizer returned.
encodings.keys()

In [None]:
# Store the padding-masked answer ids as the training targets.
encodings['labels'] = masked_labels
encodings.keys()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CreateDataset(Dataset):
    """Map-style dataset over a mapping of tokenized fields.

    Args:
        encodings: mapping from field name (it must contain ``'input_ids'``)
            to a sequence with one entry per example. Both tokenizer outputs
            and plain dictionaries are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, measured on the ``'input_ids'`` field."""
        # Key lookup instead of attribute access: it matches the mapping
        # interface used by __getitem__ and also works for plain dicts.
        return len(self.encodings['input_ids'])

In [None]:
# Training hyperparameters for the distilroberta-base run
batch_size = 12  # examples per DataLoader batch
num_epochs = 3  # full passes over the training data
# Learning rate handed to the AdamW optimizer; also try with lr = 4e-4
lr = 4e-5

In [None]:
from timeit import default_timer as timer
# Create the training dataset
train_dataset = CreateDataset(encodings)
# Create the training dataloader. Batches are reshuffled every epoch.
# NOTE(review): assumes train_df keeps the order of the source file, where neighbouring
# questions share a passage, so unshuffled batches would be strongly correlated - confirm.
train_ld = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     )

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
loop_start = timer()
for epoch in range(num_epochs):
    model.train()
    loss_score = []  # per-batch losses of this epoch
    # The description is constant within an epoch, so it is set once here.
    loop = tqdm(train_ld, desc=f'Epoch {epoch}')
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # With labels supplied, the first element of the model output is the loss.
        loss = model(input_ids,
                     attention_mask=attention_mask,
                     labels=labels
                     )[0]
        loss.backward()
        optim.step()

        # Read the scalar once and reuse it for both the log and the progress bar.
        batch_loss = loss.item()
        loss_score.append(batch_loss)
        loop.set_postfix(loss=batch_loss)
    average_loss = np.mean(loss_score)
    print(f"Epoch: {epoch}, average Loss: {average_loss}")
loop_end = timer()
time_loop = loop_end - loop_start
print(f'\nTime for {num_epochs} epochs (s): {(time_loop):.3f}')

In [None]:
# Free some memory
import gc
# Drop the references to the training inputs and the last batch tensors.
del encodings, input_ids, input_attention_mask, labels
# Collect first, so that memory released by the collector can be handed back below.
gc.collect()
# The CUDA calls are skipped on CPU-only machines, matching the device fallback used for training.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_accumulated_memory_stats()

#### Generation

Tokenize the test dataset.

In [None]:
# Encode every (question, passage) pair of the test split, padded to a common length.
input_values = tokenizer(
    list(test_df['q']),
    list(test_df['p']),
    padding=True,
    truncation=True,
    max_length=512,
)
input_ids = input_values['input_ids']
input_attention_mask = input_values['attention_mask']

In [None]:
#utility functions taken from the allennlp library for computing the F1-score
import collections
import re
import string
from typing import Callable, Sequence, TypeVar, Tuple

def get_tokens(s):
    """Return the normalized, whitespace-separated tokens of *s* (empty list for falsy input)."""
    return normalize_answer(s).split() if s else []

def normalize_answer(s):
    """Lower-case *s* and strip punctuation, English articles and surplus whitespace."""
    # Lower-case first, so the article pattern only has to match lower-case forms.
    lowered = s.lower()
    # Drop every ASCII punctuation character.
    punctuation = set(string.punctuation)
    unpunctuated = "".join(ch for ch in lowered if ch not in punctuation)
    # Replace standalone articles with a blank so the neighbouring words stay apart.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
    # Collapse whitespace runs and trim both ends.
    return " ".join(without_articles.split())

def compute_f1(a_pred: str, a_gold: str) -> float:
    """Return the token-level F1 score between a predicted and a gold answer.

    Both answers are tokenized with ``get_tokens``. When either side has no
    tokens the score is 1.0 if both are empty and 0.0 otherwise.
    """
    predicted = get_tokens(a_pred)
    expected = get_tokens(a_gold)
    if not predicted or not expected:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return float(predicted == expected)

    # Multiset intersection: each token counts as often as it occurs on both sides.
    shared = collections.Counter(predicted) & collections.Counter(expected)
    num_same = sum(shared.values())
    if num_same == 0:
        return 0.0

    precision = 1.0 * num_same / len(predicted)
    recall = 1.0 * num_same / len(expected)
    return (2 * precision * recall) / (precision + recall)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
l = []  # one single-element list of decoded text per test example
model.to(device)
# Set the model in evaluation mode
model.eval()
for example_ids, example_mask in zip(input_ids, input_attention_mask):
    # Wrapping the id list in another list gives the batch dimension of size 1.
    generated = model.generate(
        input_ids=torch.tensor([example_ids], device=device),
        # The inputs are padded, so the tokenizer's mask is passed explicitly
        # (it used to be computed and then discarded).
        attention_mask=torch.tensor([example_mask], device=device),
        max_length=20,
        repetition_penalty=5.,
        min_length=1,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,
        num_beams=2,
    )
    generated = tokenizer.batch_decode(generated, skip_special_tokens=True)
    l.append(generated)

In [None]:
# Every entry of l is a one-element list, so each becomes one row of the 'generated' column.
x = pd.DataFrame(l, columns = ['generated'])
# Attach the answers by position rather than by index label: the predictions
# were produced in the row order of test_df, whatever its index looks like.
x['answers'] = list(test_df['a'])

In [None]:
# F1 of every generated answer against its reference answer.
predictions = x['generated']
true_answers = x['answers']
score = [compute_f1(a_pred, a_gold) for a_pred, a_gold in zip(predictions, true_answers)]
average_score = np.mean(score)
print(f'average_score: {average_score}')

# Keep the per-row scores and count the rows with a non-zero F1.
x['score'] = score
total = len(x[x['score'] != 0])
print(f'length: {total} / {len(x)}')


In [None]:
# Lift the row display limit, then show the first rows that scored above zero.
pd.set_option('display.max_rows', None)
correct = x[x['score'] != 0].reset_index(drop=True)
correct.head(500)

### Question generation $f_\theta(P, Q, H)$ with text passage $P$, question $Q$ and dialogue history $H$

In [None]:
# TODO

## Train and evaluate $f_\theta(P, Q)$ and $f_\theta(P, Q, H)$

In [None]:
# TODO

## Conclusions

In [None]:
# TODO