<a href="https://colab.research.google.com/github/Danysan1/ai-unibo-nlp-project/blob/main/a2/execution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2 execution

In [71]:
%pip install pandas numpy matplotlib transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


## Data loading

### Dataset download

In [72]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    """tqdm progress bar whose `update_to` method is used as a `urlretrieve` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Move the bar to the absolute position reported by the download

        :param b: number of blocks transferred so far
        :param bsize: size of each block
        :param tsize: total size of the download, or None when it is not known
        """
        total_is_known = tsize is not None
        if total_is_known:
            self.total = tsize

        # update() takes an increment, so subtract what the bar already shows (self.n)
        transferred = b * bsize
        self.update(transferred - self.n)
        
def download_url(url, output_path):
    """
    Download `url` to `output_path` while showing a progress bar

    :param url: address of the resource to fetch
    :param output_path: local file path the resource is written to
    """
    # The bar is labelled with the last path component of the URL
    bar_label = url.split('/')[-1]
    progress_bar = DownloadProgressBar(unit='B', unit_scale=True,
                                       miniters=1, desc=bar_label)
    with progress_bar as bar:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=bar.update_to)

def download_data(data_path, url_path, suffix):
    """
    Download one CoQA data split into `data_path` unless it is already there

    The file is first written to `<suffix>.json.part` and only renamed to
    `<suffix>.json` once the transfer has finished, so an interrupted download
    is never mistaken for a complete one on the next run.

    :param data_path: directory that stores the dataset (created if missing)
    :param url_path: URL of the split to download
    :param suffix: name of the split; the file is saved as `<suffix>.json`
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(data_path, exist_ok=True)

    file_path = os.path.join(data_path, f'{suffix}.json')
    if os.path.exists(file_path):
        # Already downloaded: nothing to do
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    partial_path = file_path + '.part'
    try:
        download_url(url=url_path, output_path=partial_path)
        # Promote the finished download to its final name
        os.replace(partial_path, file_path)
    finally:
        # Only present here if the download or the rename failed
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download completed!")

In [73]:
# CoQA splits used by this notebook: the official dev file is stored as the test split
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

# Fetch the train split first, then the test split, into the 'Dataset' folder
for split_name, split_url in (('train', train_url), ('test', test_url)):
    download_data(data_path='Dataset', url_path=split_url, suffix=split_name)

### Dataset loading

In [74]:
import numpy as np
import pandas as pd

In [75]:
# TODO

## Exploratory Data Analysis

In [76]:
# TODO

## Train-Validation-Test split

In [77]:
# TODO

## Model definition

### Utilities

In [78]:
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from typing import List, Dict, Callable
import random

In [79]:
def predict_data(model: keras.Model,
                x: np.ndarray,
                prediction_info: Dict):
    """
    Run inference with a model over a set of examples

    :param model: Keras built and possibly trained model
    :param x: input examples in np.ndarray format (first axis indexes the samples)
    :param prediction_info: keyword arguments forwarded to model.predict()

    :return
        whatever model.predict() returns for x
    """
    print(f'Starting prediction: \n{prediction_info}')
    sample_count = x.shape[0]
    print(f'Predicting on {sample_count} samples')
    return model.predict(x, **prediction_info)

In [80]:
def compute_f1(model: keras.Model, 
             x: np.ndarray, 
             y: np.ndarray):
    """
    Macro-averaged F1 of the model predictions on x against the labels y

    :param model: Keras built and possibly trained model
    :param x: data in np.ndarray format
    :param y: ground-truth labels in np.ndarray format

    :return
        score: macro F1 score
    """
    # Raw model outputs for every sample of x
    class_scores = predict_data(model=model,
                                x=x,
                                prediction_info={'batch_size': 64, 'verbose': 1})

    # The predicted class is the index of the highest score along axis 1
    predicted_classes = np.argmax(class_scores, axis=1)

    return f1_score(y, predicted_classes, average='macro')

In [81]:
def set_reproducibility(seed):
    """
    Seed every random number generator used by the notebook

    :param seed: integer seed shared by Python `random`, NumPy and TensorFlow
    """
    # Same seed for the three generators, in the order Python -> NumPy -> TensorFlow
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

    # Environment flag asking TensorFlow for deterministic op implementations
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [82]:
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm import tqdm
from transformers import TFAutoModel, AutoTokenizer

# Turn on memory growth for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPU available the list is empty and the loop body never runs
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the failure and keep the notebook running
    print(e)

### Answer generation $f_\theta(P, Q)$ from text passage $P$ and question $Q$

In [83]:
class MyTrainer(object):
    """
    Simple wrapper class around an encoder-decoder pair

    compute_loss -> cross-entropy in which positions whose target id is 0 contribute zero
    train_op -> uses tf.GradientTape to compute the loss and its gradients
    batch_fit -> receives a batch and performs forward-backward passes (gradient included)
    generate -> greedy decoding for a batch of encoder inputs
    translate -> converts the generated token ids back to text
    """

    def __init__(self, encoder, decoder, max_length, tokenizer=None):
        """
        :param encoder: encoder model; must expose `initialize(batch_size)`
        :param decoder: decoder model exposing `attention`, `batch_size`, `build_initial_state`,
            `wrapped_decoder_cell`, `generation_dense` and `embedding`
        :param max_length: maximum number of decoding steps performed by generate()
        :param tokenizer: optional tokenizer exposing `word_index` and `sequences_to_texts`;
            when None, the module-level `tokenizer` is looked up at call time
        """
        self.encoder = encoder
        self.decoder = decoder
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-03)

    def _get_tokenizer(self):
        """Return the tokenizer given at construction, else the module-level `tokenizer`."""
        if self.tokenizer is not None:
            return self.tokenizer
        return tokenizer

    def _trainable_variables(self):
        """Encoder variables followed by decoder variables (same order for gradients and updates)."""
        return self.encoder.trainable_variables + self.decoder.trainable_variables

    @tf.function
    def compute_loss(self, logits, target):
        """
        Cross-entropy between `logits` and `target` with id-0 targets zeroed out

        :param logits: unnormalised scores predicted by the decoder
        :param target: integer token ids the decoder should predict

        :return
            scalar loss
        """
        loss = self.ce(y_true=target, y_pred=logits)
        # Positions whose target id is 0 are treated as padding and contribute no loss
        mask = tf.logical_not(tf.math.equal(target, 0))
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        # NOTE(review): the mean is taken over ALL positions, masked ones included, so the
        # value shrinks as the amount of padding grows - confirm this is intended
        return tf.reduce_mean(loss)

    @tf.function
    def train_op(self, inputs):
        """
        Forward pass with teacher forcing plus gradient computation

        :param inputs: dictionary with keys 'encoder_input_ids', 'encoder_state' and 'decoder_target'

        :return
            loss: scalar loss of the batch
            grads: gradients of the loss w.r.t. the encoder and decoder trainable variables
        """
        with tf.GradientTape() as tape:
            encoder_output, encoder_h, encoder_s = self.encoder({'input_ids': inputs['encoder_input_ids'],
                                                                 'hidden_state': inputs['encoder_state']})

            decoder_input = inputs['decoder_target'][:, :-1]  # drop the last position (ignore <end>)
            real_target = inputs['decoder_target'][:, 1:]  # drop the first position (ignore <start>)

            # Use the decoder owned by this trainer, not a notebook-level global
            self.decoder.attention.setup_memory(encoder_output)

            decoder_initial_state = self.decoder.build_initial_state(self.decoder.batch_size,
                                                                     [encoder_h, encoder_s])
            predicted = self.decoder({'input_ids': decoder_input,
                                      'initial_state': decoder_initial_state}).rnn_output

            loss = self.compute_loss(logits=predicted, target=real_target)

        grads = tape.gradient(loss, self._trainable_variables())
        return loss, grads

    @tf.function
    def batch_fit(self, inputs):
        """
        One optimisation step on a batch

        :param inputs: same dictionary accepted by train_op()

        :return
            loss: scalar loss of the batch, computed before the parameter update
        """
        loss, grads = self.train_op(inputs=inputs)
        self.optimizer.apply_gradients(zip(grads, self._trainable_variables()))
        return loss

    @tf.function
    def generate(self, input_ids):
        """
        Greedy decoding for a batch of encoder inputs

        :param input_ids: encoder token ids; the first axis is the batch

        :return
            outputs: decoder outputs; translate() reads their `sample_id` field
        """
        batch_size = input_ids.shape[0]
        # All-zero LSTM state, built by the encoder's own helper
        encoder_initial_state = self.encoder.initialize(batch_size)
        encoder_output, encoder_h, encoder_s = self.encoder({
            'input_ids': input_ids,
            'hidden_state': encoder_initial_state
        })

        word_index = self._get_tokenizer().word_index
        start_tokens = tf.fill([batch_size], word_index['<start>'])
        end_token = word_index['<end>']

        # Decoding reuses the trained cell and output layer; only the sampler differs from training
        greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()
        decoder_instance = tfa.seq2seq.BasicDecoder(cell=self.decoder.wrapped_decoder_cell,
                                                    sampler=greedy_sampler,
                                                    output_layer=self.decoder.generation_dense,
                                                    maximum_iterations=self.max_length)
        self.decoder.attention.setup_memory(encoder_output)

        decoder_initial_state = self.decoder.build_initial_state(batch_size, [encoder_h, encoder_s])
        decoder_embedding_matrix = self.decoder.embedding.variables[0]
        outputs, _, _ = decoder_instance(decoder_embedding_matrix,
                                         start_tokens=start_tokens,
                                         end_token=end_token,
                                         initial_state=decoder_initial_state)
        return outputs

    def translate(self, generated):
        """
        Convert the output of generate() to text

        :param generated: value returned by generate()

        :return
            list with one decoded string per generated sequence
        """
        return self._get_tokenizer().sequences_to_texts(generated.sample_id.numpy())


class Encoder(tf.keras.Model):
    """Embedding layer followed by an LSTM that returns its full output sequence and final states."""

    def __init__(self, vocab_size, embedding_dim, encoder_units):
        """
        :param vocab_size: number of rows of the embedding matrix
        :param embedding_dim: size of each token embedding
        :param encoder_units: number of LSTM units
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encoder_units = encoder_units

        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.encoder_lstm = tf.keras.layers.LSTM(encoder_units, return_sequences=True, return_state=True)

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: dictionary with keys 'input_ids' (token ids) and
            'hidden_state' (initial LSTM state)

        :return
            the LSTM output sequence followed by the two final LSTM states
        """
        embedded_tokens = self.embedding(inputs['input_ids'])
        sequence_output, final_hidden, final_cell = self.encoder_lstm(embedded_tokens,
                                                                      initial_state=inputs['hidden_state'])
        return sequence_output, final_hidden, final_cell

    def initialize(self, batch_size):
        """Return two all-zero tensors of shape (batch_size, encoder_units) to start the LSTM from."""
        state_shape = (batch_size, self.encoder_units)
        return [tf.zeros(state_shape) for _ in range(2)]


class Decoder(tf.keras.Model):
    """LSTM cell wrapped with Bahdanau attention, decoded with a training sampler."""

    def __init__(self, vocab_size, max_sequence_length, embedding_dim, decoder_units, batch_size):
        """
        :param vocab_size: size of the embedding matrix and of the output layer
        :param max_sequence_length: length used to build the per-example sequence lengths
        :param embedding_dim: size of each token embedding
        :param decoder_units: units of the LSTM cell, of the attention mechanism and of the attention layer
        :param batch_size: fixed number of examples per batch (the sequence-length lists are built from it)
        """
        super().__init__()

        self.max_sequence_length = max_sequence_length
        self.batch_size = batch_size
        self.decoder_units = decoder_units

        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.decoder_lstm_cell = tf.keras.layers.LSTMCell(decoder_units)

        # Every example of the batch is declared to have max_sequence_length memory entries;
        # the memory itself is attached later through attention.setup_memory()
        memory_lengths = [max_sequence_length] * batch_size
        self.attention = tfa.seq2seq.BahdanauAttention(units=decoder_units,
                                                       memory=None,
                                                       memory_sequence_length=memory_lengths)

        self.wrapped_decoder_cell = tfa.seq2seq.AttentionWrapper(self.decoder_lstm_cell,
                                                                 self.attention,
                                                                 attention_layer_size=decoder_units)

        self.generation_dense = tf.keras.layers.Dense(vocab_size)
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        self.decoder = tfa.seq2seq.BasicDecoder(self.wrapped_decoder_cell,
                                                sampler=self.sampler,
                                                output_layer=self.generation_dense)

    def build_initial_state(self, batch_size, encoder_state):
        """
        Initial state of the attention-wrapped cell, with its cell state replaced by `encoder_state`

        :param batch_size: number of examples the state is built for
        :param encoder_state: state used as the `cell_state` of the wrapper state
        """
        blank_state = self.wrapped_decoder_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
        return blank_state.clone(cell_state=encoder_state)

    def call(self, inputs, training=False, **kwargs):
        """
        :param inputs: dictionary with keys 'input_ids' (decoder token ids) and
            'initial_state' (as returned by build_initial_state)

        :return
            the first element returned by the underlying BasicDecoder
        """
        embedded_targets = self.embedding(inputs['input_ids'])
        # Each example is decoded for max_sequence_length - 1 steps
        step_counts = [self.max_sequence_length - 1] * self.batch_size
        decoder_output, _, _ = self.decoder(embedded_targets,
                                            initial_state=inputs['initial_state'],
                                            sequence_length=step_counts)
        return decoder_output

In [84]:
# Sample: a two-sentence toy corpus used to smoke-test the Encoder and Decoder classes.
# The names defined here (tokenizer, encoder, decoder, batch_size, encoded_*_sample,
# max_sequence_length) are reused by the training cells below.
input_sample = [
    "hello there how is it going",
    "this assignment is hellish"
]
# Targets are wrapped in explicit <start> / <end> markers
output_sample = [
    "<start> it is going well <end>",
    "<start> I agree <end>"
]

# The whole toy corpus forms a single batch
batch_size = len(input_sample)

# filters='' disables character filtering; one vocabulary is shared by inputs and outputs
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<UNK>')
tokenizer.fit_on_texts(input_sample + output_sample)

# +1 leaves room for id 0, which is the value the compute_loss mask treats as "no target"
# (presumably the padding id written by pad_sequences below - TODO confirm)
vocab_size = len(tokenizer.word_index) + 1

encoded_input_sample = tokenizer.texts_to_sequences(input_sample)
max_input_length = max([len(item) for item in encoded_input_sample])

encoded_output_sample = tokenizer.texts_to_sequences(output_sample)
max_output_length = max([len(item) for item in encoded_output_sample])

# Inputs and outputs are padded to one common length
max_sequence_length = max(max_input_length, max_output_length)

# padding='post' appends the padding after the real tokens
encoded_input_sample = tf.keras.preprocessing.sequence.pad_sequences(encoded_input_sample,
                                                                        padding='post',
                                                                        maxlen=max_sequence_length)
encoded_output_sample = tf.keras.preprocessing.sequence.pad_sequences(encoded_output_sample,
                                                                        padding='post',
                                                                        maxlen=max_sequence_length)

# Test encoder: build it and run one forward pass on the toy batch
encoder = Encoder(vocab_size=vocab_size,
                    embedding_dim=50,
                    encoder_units=16)

# All-zero initial LSTM state
sample_hidden = encoder.initialize(batch_size=batch_size)
encoder_sample_batch = {
    'input_ids': tf.convert_to_tensor(encoded_input_sample, dtype=tf.int32),
    'hidden_state': sample_hidden
}

# Prints the shapes of the output sequence and of the two final states
sample_output, sample_h, sample_c = encoder(inputs=encoder_sample_batch)
print(f'{sample_output.shape} -- {sample_h.shape} -- {sample_c.shape}')

# Test decoder: build it, attach the encoder outputs as attention memory and run one pass
decoder = Decoder(vocab_size=vocab_size,
                    embedding_dim=50,
                    decoder_units=16,
                    batch_size=batch_size,
                    max_sequence_length=max_sequence_length)
decoder.attention.setup_memory(sample_output)
# The decoder starts from the final encoder states
initial_state = decoder.build_initial_state(batch_size, [sample_h, sample_c])

# The full padded targets are fed here, whereas MyTrainer.train_op feeds them without the last position
decoder_sample_batch = {
    'input_ids': tf.convert_to_tensor(encoded_output_sample, tf.int32),
    'initial_state': initial_state
}
# The decoder is built to unroll max_sequence_length - 1 steps per example
sample_decoder_outputs = decoder(decoder_sample_batch).rnn_output
print(f'{sample_decoder_outputs.shape}')

(2, 6, 16) -- (2, 16) -- (2, 16)
(2, 5, 16)


In [85]:
# Training: wrap the encoder and decoder built above; decoding is capped at max_sequence_length steps
trainer = MyTrainer(encoder=encoder, decoder=decoder, max_length=max_sequence_length)

In [86]:
epochs = 100
for epoch in tqdm(range(epochs)):
    # Every step starts the encoder from a fresh all-zero state
    encoder_hidden_state = encoder.initialize(batch_size=batch_size)
    batch = dict(encoder_input_ids=encoded_input_sample,
                 encoder_state=encoder_hidden_state,
                 decoder_target=encoded_output_sample)

    # One optimisation step on the single toy batch
    loss = trainer.batch_fit(batch)
    print(f'Loss - {loss}')

    # Greedy-decode the same inputs to watch the generations evolve
    generated = trainer.generate(input_ids=encoded_input_sample)
    translated = trainer.translate(generated)
    print(f'Translated - {translated}')

  0%|          | 0/100 [00:00<?, ?it/s]

Loss - 2.2167553901672363


  7%|▋         | 7/100 [00:02<00:27,  3.34it/s]

Translated - ['it it how it how <start>', 'i i i i i i']
Loss - 2.212101697921753
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.2074058055877686
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.202622652053833
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.197714328765869
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.192643165588379
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1873672008514404
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.181844711303711
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.176034450531006
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1698975563049316
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.163393497467041
Translated - ['i i i i i i', 'i i i i i i']


 19%|█▉        | 19/100 [00:03<00:07, 11.45it/s]

Loss - 2.1564807891845703
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1491177082061768
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1412577629089355
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1328539848327637
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.123854160308838
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.114203453063965
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.1038424968719482
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.0927093029022217
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.080737590789795
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.0678584575653076
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.0540032386779785
Translated - ['i i i i i i', 'i i i i i i']


 32%|███▏      | 32/100 [00:03<00:02, 23.58it/s]

Loss - 2.039102554321289
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.0230932235717773
Translated - ['i i i i i i', 'i i i i i i']
Loss - 2.0059216022491455
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.9875494241714478
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.9679648876190186
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.9471944570541382
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.9253171682357788
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.9024807214736938
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.878915786743164
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.8549442291259766
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.8309694528579712
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.8074371814727783
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.7847537994384766
Translated - ['i i i i i i', 'i i i i i i']


 46%|████▌     | 46/100 [00:03<00:01, 37.60it/s]

Loss - 1.7631763219833374
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.742719292640686
Translated - ['i i i i i i', 'i i i i i i']
Loss - 1.7231413125991821
Translated - ['i i i i i <end>', 'i i i i <end> <end>']
Loss - 1.7040239572525024
Translated - ['i i i i <end>', 'i i i <end> <end>']
Loss - 1.6849130392074585
Translated - ['i i i <end>', 'i i <end> <end>']
Loss - 1.6654428243637085
Translated - ['i i <end>', 'i i <end>']
Loss - 1.6454098224639893
Translated - ['i i <end>', 'i i <end>']
Loss - 1.624798059463501
Translated - ['going i <end>', 'i <end> <end>']
Loss - 1.6037698984146118
Translated - ['going i <end>', 'i <end> <end>']
Loss - 1.5826255083084106
Translated - ['going going <end>', 'i <end> <end>']
Loss - 1.5617433786392212
Translated - ['going going <end>', 'i <end> <end>']
Loss - 1.5414999723434448
Translated - ['going going <end>', 'i <end> <end>']
Loss - 1.5221741199493408
Translated - ['going going going <end>', 'i <end> <end> <end>']
Loss - 1.503851175308227

 61%|██████    | 61/100 [00:03<00:00, 51.18it/s]

Loss - 1.486371636390686
Translated - ['going going going <end>', '<end> <end> <end> <end>']
Loss - 1.469369888305664
Translated - ['going going going <end>', '<end> <end> <end> <end>']
Loss - 1.4524027109146118
Translated - ['going going going <end>', '<end> <end> <end> <end>']
Loss - 1.4350945949554443
Translated - ['going going going going <end>', '<end> <end> <end> <end> <end>']
Loss - 1.4172313213348389
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.3987889289855957
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.379907250404358
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.3608219623565674
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.3417596817016602
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.322819709777832
Translated - ['it going going <end>', '<end> <end> <end> <end>']
Loss - 1.3038990497589111
Translated - ['it going going <end>', '<end>

 76%|███████▌  | 76/100 [00:03<00:00, 59.66it/s]

Loss - 1.2242052555084229
Translated - ['it it going <end>', '<end> <end> <end> <end>']
Loss - 1.2034533023834229
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.1828190088272095
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.1623501777648926
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.1419302225112915
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.1214234828948975
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.1008204221725464
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.0802757740020752
Translated - ['it it going going <end>', 'i <end> <end> <end> <end>']
Loss - 1.0600214004516602
Translated - ['it it going <end>', 'i <end> <end> <end>']
Loss - 1.040226697921753
Translated - ['it it going <end>', 'i <end> <end> <end>']
Loss - 1.0209081172943115
Translated - ['it it going <end>', 'i <end> <en

 83%|████████▎ | 83/100 [00:04<00:00, 59.64it/s]

Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.9297483563423157
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.9126607179641724
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.8957479596138
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.8789197206497192
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.8622139096260071
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.8457285761833191
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.829515278339386
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.8135622143745422
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.7978771924972534
Translated - ['it it going <end>', 'i agree <end> <end>']
Loss - 0.7825426459312439
Translated - ['it is going going <end>', 'i agree <end> <end> <end>']
Loss - 0.7676534652709961
Translated - ['it is going going <end>', 'i agree <end> <end> <end>'

100%|██████████| 100/100 [00:04<00:00, 23.18it/s]

Translated - ['is it going going <end>', 'i agree <end> <end> <end>']
Loss - 0.7391657829284668
Translated - ['is it going well <end>', 'i agree <end> <end> <end>']
Loss - 0.725447416305542
Translated - ['is it going well <end>', 'i agree <end> <end> <end>']
Loss - 0.7120766043663025
Translated - ['is it going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6990617513656616
Translated - ['is it going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6863529086112976
Translated - ['is is going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6738934516906738
Translated - ['is is going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6616845726966858
Translated - ['is is going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6497541666030884
Translated - ['is is going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6380911469459534
Translated - ['is is going well <end>', 'i agree <end> <end> <end>']
Loss - 0.6266587972640991
Translated - ['is is going well <end>', 'i agree <




In [87]:
#TODO

### Answer generation $f_\theta(P, Q, H)$ from text passage $P$, question $Q$ and dialogue history $H$

In [88]:
# TODO

## Train and evaluate $f_\theta(P, Q)$ and $f_\theta(P, Q, H)$

In [89]:
# TODO

## Conclusions

In [90]:
# TODO