# Sentence Reconstruction

The purpose of this project is to take in input a sequence of words corresponding to a random permutation of a given english sentence, and reconstruct the original sentence.

The otuput can be either produced in a single shot, or through an iterative (autoregressive) loop generating a single token at a time.


CONSTRAINTS:
* No pretrained model can be used.
* The neural network models should have less the 20M parameters.
* No postprocessing should be done (e.g. no beamsearch)
* You cannot use additional training data.


BONUS PARAMETERS:

A bonus of 0-2 points will be attributed to incentivate the adoption of models with a low number of parameters.

In [69]:
#!pip install datasets
#!pip install --upgrade keras

In [70]:
import tensorflow as tf
import keras
from keras import ops
from keras import layers
from keras.layers import Embedding

from datasets import load_dataset
import string
import re

import numpy as np
import math

### Download the dataset and declare variables

In [71]:
VOCAB_SIZE = 10000
SEQ_LEN = 28
BATCH_SIZE = 256

In [72]:
ds = load_dataset('generics_kb', trust_remote_code=True)['train']
ds = ds.filter(lambda row: len(row["generic_sentence"].split(" ")) > 8 )

### Create the tokenizer and detokenizer
Define the tokens that are gonna be used by the tokenizer

In [73]:
#Define a class that contains all the token that we are gonna need
class Tokens:
    COMMA = '<comma>'
    START = '<start>'
    END = '<end>'

In [74]:
# Define a vectorized function to add the token to the oriinal string inside the dataset
add_token_vect = np.vectorize(
    lambda x: f'{Tokens.START} ' + x.replace(',', f' {Tokens.COMMA}') + f' {Tokens.END}')

# Apply the function to the 'generic_sentence' column of the DataFrame
corpus = add_token_vect(ds['generic_sentence'])

Define a custom processing function to eliminate special characters

In [75]:
def custom_preprocessing(text):
    """
    this function is gonna remove every special character that is not an `<>,` from the original sentences.
    text - text to be processed
    """
    chars = string.punctuation
    chars = chars.replace(",", "")
    chars = chars.replace("<", "")
    chars = chars.replace(">", "")
    # Remove punctuation
    text = tf.strings.regex_replace(text, '[%s]' % re.escape(chars), '')
    # Lowercase
    text = tf.strings.lower(text)
    # Remove punctuation
    return text

Create the tokenizer, and use the processing function to tokenize the input

In [76]:
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=custom_preprocessing,
    output_sequence_length=SEQ_LEN,
    output_mode='int',
    pad_to_max_tokens=True,
)

#adapt the tokenizer to the text of the ds
tokenizer.adapt(corpus)

vocab = tokenizer.get_vocabulary()

#visualize the first 10 tokens of the tokenizer
print(vocab[:10])

['', '[UNK]', '<start>', '<end>', 'the', 'of', 'and', '<comma>', 'is', 'to']


Defining the detokenizer

In [77]:
class TextDetokenizer:
    def __init__(self, vectorize_layer):
        self.vectorize_layer = vectorize_layer
        vocab = self.vectorize_layer.get_vocabulary()
        self.index_to_word = {index: word for index, word in enumerate(vocab)}

    def __detokenize_tokens(self, tokens):
        def check_token(t):
            if t == 2:
                s = "<start>"
            elif t == 3:
                s = "<end>"
            elif t == 7:
                s = "<comma>"
            else:
                s = self.index_to_word.get(t, '[UNK]')
            return s

        return ' '.join([check_token(token) for token in tokens if token != 0])

    def __call__(self, batch_tokens):
        return [self.__detokenize_tokens(tokens) for tokens in batch_tokens]

#instantiate the detokenizer
detokenizer = TextDetokenizer(tokenizer)

#tokenize the content of the whole dataset (corpus)
sentences = tokenizer( corpus ).numpy()

In [78]:
mask = np.sum( (sentences==1), axis=1) >= 1
original_data = np.delete( sentences, mask , axis=0)
original_data.shape

(241194, 28)

### Create a Data Generator for the Transformer Model

For the transformer architecture, we need a generator to provide the appropriate inputs and targets. The generator will output a tuple containing the input data and the target variable:

- encoder_input: the scrambled input sentence.
- decoder_input: the previous known sequence, starting with the <start> token.
- decoder_output: the original sentence before scrambling, representing the expected output.

In [79]:
class DataGenerator(keras.utils.PyDataset):
    def __init__(self, data, batch_size=32, shuffle=True, seed=42, **kwargs):
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.indexes = np.arange(len(self.data))

    def __len__(self):
        return math.ceil(len(self.data) / self.batch_size)

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        data_batch = np.array([self.data[k] for k in indexes])
        result = np.copy(data_batch)

        # shuffle the phrases inside the tags
        for i in range(data_batch.shape[0]):
            np.random.shuffle(data_batch[i, 1:data_batch[i].argmin() - 1])

        encoder_input = data_batch
        decoder_input = np.copy(result)
        decoder_output = np.copy(result)
        decoder_output = decoder_output[:, 1:]

        decoder_output = np.pad(decoder_output, [[0, 0], [0, 1]], mode='constant')

        return (encoder_input, decoder_input), decoder_output

Shuffle the data

In [99]:
# Make a random permutation of training and test set
np.random.seed(42)
# Shuffle the all data
shuffled_indices = np.random.permutation(len(original_data))
shuffled_data = original_data[shuffled_indices]

Create the train and test generators

In [81]:
#split the dataset
train_generator = DataGenerator(shuffled_data[:220000], batch_size=BATCH_SIZE)
test_generator = DataGenerator(shuffled_data[225000:], batch_size=BATCH_SIZE)


Exploring how the generator outputs look like

In [82]:
x, y = train_generator.__getitem__(1)

x_enc, x_dec = x

#detokenize
x_enc_inp_detok = detokenizer(x_enc)[0]
x_dec_inp_detok = detokenizer(x_dec)[0]
y_dec = detokenizer(y)[0]

# print the sentences
print("Encoder Input: ",(x_enc_inp_detok))
print("Dencoder Input: ",(x_dec_inp_detok))
print("Target: ", y_dec)

Encoder Input:  <start> cholesterol type a steroid is lipid that a of special called is <end>
Dencoder Input:  <start> cholesterol is a special type of lipid that is called a steroid <end>
Target:  cholesterol is a special type of lipid that is called a steroid <end>


# Metrics

Let s be the source string and p your prediction. The quality of the results will be measured according to the following metric:

1.  look for the longest substring w between s and p
2.  compute |w|/max(|s|,|p|)

If the match is exact, the score is 1.

When computing the score, you should NOT consider the start and end tokens.



The longest common substring can be computed with the SequenceMatcher function of difflib, that allows a simple definition of our metric.

In [83]:
from difflib import SequenceMatcher


def score(s, p):
    match = SequenceMatcher(None, s, p).find_longest_match()
    # print(match.size)
    return (match.size/max(len(p), len(s)))

Let's do an example.

In [84]:
original = "at first henry wanted to be friends with the king of france"
generated = "henry wanted to be friends with king of france at the first"

print("your score is ", score(original, generated))

your score is  0.5423728813559322


The score must be computed as an average of at least 3K random examples taken form the test set.

# What to deliver

You are supposed to deliver a single notebook, suitably commented.
The notebook should describe a single model, although you may briefly discuss additional attempts you did.

The notebook should contain a full trace of the training.
Weights should be made available on request.

You must also give a clear assesment of the performance of the model, computed with the metric that has been given to you.

# Good work!

# Proposed model: transformer

The model is composed of two general components:

- Encoder: Reads the input sequence (in this case, the scrambled words) and produces a fixed-dimensional vector representation.
- Decoder: Generates the output sequence (original sentence) from the representation provided by the Encoder.


## Why the transformer
The reason for choosing this arcitecture is its wide spread use in the industry applications, and its proven record of good results in NLP tasks. 



## Define the transformer model


### Defining helper functions

In [85]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = ops.shape(inputs)[-1]
        positions = ops.arange(0, length, 1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        else:
            return ops.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

### Defining the encoder:

In [86]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.dropout_1 = layers.Dropout(dropout)
        self.dropout_2 = layers.Dropout(dropout)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
        else:
            padding_mask = None

        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)
        attention_output = self.dropout_1(attention_output)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        proj_output = self.dropout_2(proj_output)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "dropout": self.dropout,
        })
        return config

### Defining the decoder:

In [87]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(latent_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.dropout_1 = layers.Dropout(dropout)
        self.dropout_2 = layers.Dropout(dropout)
        self.dropout_3 = layers.Dropout(dropout)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
            padding_mask = ops.minimum(padding_mask, causal_mask)
        else:
            padding_mask = None

        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        attention_output_1 = self.dropout_1(attention_output_1)
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(query=out_1, value=encoder_outputs, key=encoder_outputs, attention_mask=padding_mask)
        attention_output_2 = self.dropout_2(attention_output_2)
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        proj_output = self.dropout_3(proj_output)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = ops.arange(sequence_length)[:, None]
        j = ops.arange(sequence_length)
        mask = ops.cast(i >= j, dtype="int32")
        mask = ops.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = ops.concatenate(
            [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])],
            axis=0,
        )
        return ops.tile(mask, mult)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "latent_dim": self.latent_dim,
            "num_heads": self.num_heads,
            "dropout": self.dropout,
        })
        return config

In [88]:
# Define a function that instantiates the transformer
def instantiate_transformer(seq_len, vocab_size, embedding_dim, latent_dim, num_heads, dropout_rate, num_layers):
    
    """
    Declares a sequence-to-sequence transformer, with multiple encoder and decoder layers.
    """
    
    # Encoder inputs
    enc_inputs = tf.keras.layers.Input(shape=(seq_len,), dtype="int64", name="encoder_inputs")
    
    # Embedding and positional encoding for encoder
    enc_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(enc_inputs)
    encoder_output = enc_embedding

    # Encoder layers
    encoder_layers = []
    for i in range(num_layers):
        encoder_layer = TransformerEncoder(embed_dim=embedding_dim, dense_dim=latent_dim, num_heads=num_heads, dropout=dropout_rate, name=f"encoder_{i}")
        encoder_output = encoder_layer(encoder_output)
        encoder_layers.append(encoder_layer)

    # Decoder inputs
    dec_inputs = tf.keras.layers.Input(shape=(seq_len,), dtype="int64", name="decoder_inputs")
    
    # Embedding and positional encoding for decoder
    dec_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(dec_inputs)
    decoder_output = dec_embedding

    # Decoder layers
    decoder_layers = []
    for i in range(num_layers):
        decoder_layer = TransformerDecoder(embed_dim=embedding_dim, latent_dim=latent_dim, num_heads=num_heads, dropout=dropout_rate, name=f"decoder_{i}")
        decoder_output = decoder_layer(decoder_output, encoder_output)
        decoder_layers.append(decoder_layer)

    # Output layer
    outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(decoder_output)

    transformer = tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=outputs, name="transformer_model")
    transformer.summary()
    return transformer

# Training the model

To train the transformer model, we define custom loss and scheduling functions, configure the model with hyperparameters, and implement an early stopping mechanism to save the best model.

- Custom Loss Function: Computes the loss using SparseCategoricalCrossentropy and adds a penalty for incorrect word positions.
- Custom Accuracy Function: Measures the accuracy by counting correctly positioned words in the decoded sentence.
- Learning Rate Scheduler: The CustomSchedule custom learning rate scheduler is inspired by the "Attention is All You Need" paper, adjusting the learning rate during training. The optimizer also uses parameters from the same paper.

In [89]:
def loss_func(target, predictions):
    
    """
    Calculates the mean sparse categorical cross-entropy loss for each position in the target sequence. 
    Applies a mask to ignore invalid positions (zeros) in the target. 
    This ensures that the loss is computed only over valid (non-zero) positions.
    """
    
    valid_positions = tf.not_equal(target, 0)

    raw_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')(target, predictions)

    valid_positions = tf.cast(valid_positions, dtype=raw_loss.dtype)

    masked_loss = raw_loss * valid_positions

    mean_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(valid_positions)
    
    return mean_loss

In [90]:
#define a decoding function
vocab = tokenizer.get_vocabulary()
spa_index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = 27

def decode_sequence(input_sentence):
    
    """
    Generates a sequence of tokens from an input sentence using a trained model. 
    Predicts the next token iteratively and appends it to the decoded sentence.
    It stops when the end token is reached or the maximum sentence length is exceeded.
    """
    
    tokenized_input_sentence = tokenizer([input_sentence])
    decoded_sentence = Tokens.START
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = tokenizer([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = ops.convert_to_numpy(
            ops.argmax(predictions[0, i, :])
        ).item(0)

        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == Tokens.END:
            break
    return decoded_sentence

In [91]:
def clean_sentence(sentence):
    """
    Eliminates the "start", "end" and "comma" tokens from a sentence.
    """
    clean_sentence = sentence.replace(Tokens.START, "").replace(Tokens.END, "").replace(Tokens.COMMA, ",").strip()
    return clean_sentence

In [92]:
from tqdm import tqdm


sample_indices = np.random.choice(np.arange(230000, 235000), size=3840, replace=False)
data_gen = DataGenerator(shuffled_data[sample_indices], batch_size=BATCH_SIZE)

def evaluate_model_val(transformer, vocab, batch_size, seq_len, score_func):
    
    """
    Evaluates a transformer model on a validation dataset generated by data_gen. 
    Iterates through batches, performs predictions, and computes a score for each predicted sequence.
    The metric used is the proposed metric by the requirements of the assignment.
    Returns the score for the current version of the model.
    
    INPUTS:
    
    transfomer - the trained model
    vocab - the vocabulary list obtained from the tokenizer
    batch_size - numper of samples per batch
    seq_len - maximum length of a sequence
    score_func - metric proposed by the assignment.
    
    OUTPUT:
    
    avg_score - the score computed over all the batches.
    
    """
    
    scores = []

    for i in tqdm(range(len(data_gen)), desc="Evaluating batches"):
        input_data, _ = data_gen[i]
        enc_input, dec_input = input_data

        if enc_input.shape[0] < batch_size:
            continue

        output_ta = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        start_token = np.array([vocab.index(Tokens.START)], ndmin=1)
        output_ta = output_ta.write(0, tf.tile(start_token, [batch_size]))

        for j in range(seq_len):
            current_output = tf.transpose(output_ta.stack())
            pred = transformer.predict([enc_input, current_output], verbose=0)

            last_prediction = pred[:, -1:, :]
            next_token = tf.argmax(last_prediction, axis=-1)
            output_ta = output_ta.write(j + 1, next_token[:, 0])

            if tf.reduce_all(tf.reduce_any(tf.equal(next_token, vocab.index(Tokens.END)), axis=-1)):
                break

        final_output = tf.transpose(output_ta.stack()).numpy()
        predicted_sequences = detokenizer(final_output)
        actual_sequences = detokenizer(dec_input)

        for predicted, true_sentence in zip(predicted_sequences, actual_sequences):
            cleaned_pred = clean_sentence(predicted)
            clean_true_sentence = clean_sentence(true_sentence)
            scores.append(score_func(cleaned_pred, clean_true_sentence))

    avg_score = np.mean(scores)

    print("Average score: ", avg_score)

    return avg_score


In [93]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Implements the learning rate schedule as described in the "Attention Is All You Need" paper. 
    """
    
    def __init__(self, emb_dim, warmup_steps=4000):
        super().__init__()
        self.emb_dim = emb_dim
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        argument1 = step * tf.cast((self.warmup_steps ** -1.5), dtype=tf.float32)
        argument2 = tf.math.rsqrt(step)

        return tf.math.rsqrt(tf.cast(self.emb_dim, dtype=tf.float32)) * tf.math.minimum(argument1, argument2)

    def get_config(self):
        return {'emb_dim': self.emb_dim,'warmup_steps': self.warmup_steps}

## Training the model

In [95]:
#Hyperpameters and parameters of the model
EMBEDDING_DIM = 128
LATENT_DIM = 600
NUM_HEADS = 14
NUM_LAYERS = 5
DROPOUT = 0.15
EPOCHS = 35
PATIENCE = 5


#Initialize the transformer
transformer = instantiate_transformer(
    seq_len=SEQ_LEN, 
    vocab_size=VOCAB_SIZE, 
    embedding_dim=EMBEDDING_DIM, 
    latent_dim=LATENT_DIM, 
    num_heads=NUM_HEADS, 
    dropout_rate=DROPOUT, 
    num_layers=NUM_LAYERS
)


# Defining the optimizer and compiling the model
learning_rate = CustomSchedule(emb_dim=EMBEDDING_DIM, warmup_steps=4000)
optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9, weight_decay = 0.005)
transformer.compile(optimizer=optimizer, loss=loss_func, metrics=["accuracy"])


# Training the model
# Contains an early stopping mechanism that stops the training is the score does not improve after a specified number of epochs (PATIENCE).
# The score is compared every 4 epochs. this was done to speed up the training time, as previous experiments determined the necessity of a high number of epoch for the model to converge.

best_transformer = transformer
best_score = -1
patience_counter = 0

for epoch in range(EPOCHS):
    print(f"Epoch: {epoch}")
    transformer.fit(train_generator)

    if epoch % 4 == 0 and epoch != 0:
        current_score = evaluate_model_val(transformer, vocab, BATCH_SIZE, SEQ_LEN, score)
        
        if current_score > best_score:
            best_transformer = transformer
            best_score = current_score
            patience_counter = 0
            transformer.save('model.weights.h5')
            print(f"Model Updated and Saved. Current score: {best_score}")
        else:
            patience_counter += 1
        
        if patience_counter > PATIENCE:
            print("Early stopping due to no improvement in score.")
            break
            
# Save the final best model
best_transformer.save('model.weights.h5')

Epoch: 0


W0000 00:00:1718168530.020025     171 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m275/860[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m5:36[0m 575ms/step - accuracy: 0.0232 - loss: 8.9185

W0000 00:00:1718168687.837173     171 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 435ms/step - accuracy: 0.0564 - loss: 7.9669
Epoch: 1
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 369ms/step - accuracy: 0.2265 - loss: 3.7855
Epoch: 2
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 370ms/step - accuracy: 0.3122 - loss: 1.9657
Epoch: 3
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 370ms/step - accuracy: 0.3433 - loss: 1.2909
Epoch: 4
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 369ms/step - accuracy: 0.3586 - loss: 1.0295


Evaluating batches:   0%|          | 0/15 [00:00<?, ?it/s]W0000 00:00:1718170180.097195     170 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170184.428724     173 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170187.018038     173 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170189.629851     173 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170192.282157     170 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170194.984850     170 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170197.880596     171 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718170200.799736     171 graph_launch.cc:671] Fallback

Average score:  0.44471857626852523
Model Updated and Saved. Current score: 0.44471857626852523
Epoch: 5
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 368ms/step - accuracy: 0.3724 - loss: 0.8531
Epoch: 6
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.3859 - loss: 0.7029
Epoch: 7
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 368ms/step - accuracy: 0.3959 - loss: 0.6013
Epoch: 8
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 368ms/step - accuracy: 0.4048 - loss: 0.5185


Evaluating batches:  13%|█▎        | 2/15 [00:13<01:27,  6.74s/it]W0000 00:00:1718171629.104779     172 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718171632.143426     170 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718171635.714270     173 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
Evaluating batches: 100%|██████████| 15/15 [01:51<00:00,  7.45s/it]


Average score:  0.4921824103470553
Model Updated and Saved. Current score: 0.4921824103470553
Epoch: 9
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4119 - loss: 0.4582
Epoch: 10
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4171 - loss: 0.4113
Epoch: 11
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4230 - loss: 0.3654
Epoch: 12
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4273 - loss: 0.3291


Evaluating batches: 100%|██████████| 15/15 [01:35<00:00,  6.40s/it]


Average score:  0.5153522297199582
Model Updated and Saved. Current score: 0.5153522297199582
Epoch: 13
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4319 - loss: 0.2991
Epoch: 14
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4349 - loss: 0.2722
Epoch: 15
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4379 - loss: 0.2515
Epoch: 16
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 368ms/step - accuracy: 0.4408 - loss: 0.2299


Evaluating batches:  13%|█▎        | 2/15 [00:12<01:23,  6.46s/it]W0000 00:00:1718174369.893280     172 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718174373.273662     171 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
Evaluating batches: 100%|██████████| 15/15 [01:53<00:00,  7.59s/it]


Average score:  0.5278042561444067
Model Updated and Saved. Current score: 0.5278042561444067
Epoch: 17
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4434 - loss: 0.2123
Epoch: 18
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 368ms/step - accuracy: 0.4459 - loss: 0.1974
Epoch: 19
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4479 - loss: 0.1830
Epoch: 20
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4495 - loss: 0.1681


Evaluating batches: 100%|██████████| 15/15 [01:41<00:00,  6.75s/it]


Average score:  0.5287415463729048
Model Updated and Saved. Current score: 0.5287415463729048
Epoch: 21
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 368ms/step - accuracy: 0.4512 - loss: 0.1584
Epoch: 22
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4531 - loss: 0.1483
Epoch: 23
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 367ms/step - accuracy: 0.4544 - loss: 0.1383
Epoch: 24
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 366ms/step - accuracy: 0.4561 - loss: 0.1293


Evaluating batches: 100%|██████████| 15/15 [01:44<00:00,  6.98s/it]


Average score:  0.5319024272395863
Model Updated and Saved. Current score: 0.5319024272395863
Epoch: 25
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4568 - loss: 0.1231
Epoch: 26
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 367ms/step - accuracy: 0.4579 - loss: 0.1161
Epoch: 27
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4587 - loss: 0.1135
Epoch: 28
[1m860/860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 367ms/step - accuracy: 0.4599 - loss: 0.1046


Evaluating batches: 100%|██████████| 15/15 [01:37<00:00,  6.52s/it]


Average score:  0.5391739215830544
Model Updated and Saved. Current score: 0.5391739215830544
Epoch: 29
[1m152/860[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m4:22[0m 370ms/step - accuracy: 0.4617 - loss: 0.0915

KeyboardInterrupt: 

# Testing the model

The final model will be evaluated in this section.
The metric used is the metric proposed by the assignment.

We will use the evaluation function defined for the validation to test the model. In this instance, we will use a generator that iterates over the remaining data points(the unseen data in the test set).

In [104]:
from tqdm import tqdm

print(f"The test set has {shuffled_data.shape[0] - 235000} samples.")
data_gen = DataGenerator(shuffled_data[235000:], batch_size=BATCH_SIZE)
evaluate_model_val(best_transformer, vocab, BATCH_SIZE, SEQ_LEN, score)

The test set has 6194 samples


Evaluating batches: 100%|██████████| 25/25 [02:51<00:00,  6.85s/it]

Average score:  0.5376171068203628





0.5376171068203628

We can observe that the model has a performance of 53.7% on the metric provided by the assignment.

In the end, we will save the model weights, in order to be able to reproduce the test results if needed.

In [101]:
best_transformer.save('best_transformer.weights.h5')
transformer.save('transformer.weights.h5')