## Transformer

Based on (https://www.tensorflow.org/text/tutorials/transformer)

In [7]:
pip install "numpy<2.0.0"


Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install tensorflow_text

Note: you may need to restart the kernel to use updated packages.


In [9]:
import logging
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_text
import tensorflow_datasets as tfds
import tensorflow as tf
logging.getLogger('tensorflow').setLevel(logging.ERROR) # suppress warnings

In [10]:
# Load the dataset 'ted_hrlr_translate/pt_to_en' using TensorFlow Datasets.
# This dataset contains translations from Portuguese to English, and we load it with metadata and in a supervised format.
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)

# Split the loaded dataset into training and validation examples for the machine learning model.
# This helps in training the model on one set of data and validating its performance on another.
train_examples, val_examples = examples['train'], examples['validation']

# Define the model name that will be used for the translation task.
model_name = 'ted_hrlr_translate_pt_en_converter' # Define model_name here
model_path = 'ted_hrlr_translate_pt_en_converter_extracted/ted_hrlr_translate_pt_en_converter'

# Download the pre-trained model and tokenizer necessary for converting Portuguese text to English.
# The model is downloaded from TensorFlow's model repository and extracted for use.
tf.keras.utils.get_file(
    f'{model_name}.zip',  # The file name for the downloaded zip file.
    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',  # The URL to download the model from.
    cache_dir='.',  # The directory to cache the download. '.' means the current directory.
    cache_subdir='',  # The subdirectory under the cache directory to store the download.
    extract=True  # Indicates that the downloaded zip file should be extracted.
)

# The tokenizer converts text into numbers so the model can process it.
tokenizers = tf.saved_model.load(model_path)

In [11]:
# EXTRA

# Take a sample from the training set to see the input and output format
for pt, en in train_examples.take(1):
    print("Portuguese sentence (pt):", pt.numpy().decode('utf-8'))
    print("English sentence (en):", en.numpy().decode('utf-8'))


# Encode the example to see the tokenized format
encoded_pt = tokenizers.pt.tokenize([pt])  # Add batch dimension
encoded_en = tokenizers.en.tokenize([en])

print("Tokenized Portuguese input shape:", encoded_pt.shape)
print("Tokenized English target shape:", encoded_en.shape)

# If you want to see the actual token ids
print("Portuguese tokens:", encoded_pt)
print("English tokens:", encoded_en)

Portuguese sentence (pt): e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
English sentence (en): and when you improve searchability , you actually take away the one advantage of print , which is serendipity .


2025-05-08 16:02:14.812184: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-05-08 16:02:14.812756: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Tokenized Portuguese input shape: (1, None)
Tokenized English target shape: (1, None)
Portuguese tokens: <tf.RaggedTensor [[2, 44, 115, 6402, 148, 40, 887, 14, 3936, 40, 463, 2715, 94, 2047, 14,
  84, 44, 40, 117, 1328, 2721, 818, 539, 16, 3]]>
English tokens: <tf.RaggedTensor [[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308,
  74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]]>


In [12]:
# Iterate through the dataset to get a batch of examples. Here only the first batch of size 3.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    # Print the Portuguese examples for reference.
    print('> Examples in Portuguese:')
    print(pt_examples)

print("----------------------------------------------")

# Iterate through the English examples in the batch.
for en in en_examples.numpy():
    # Decode each English sentence from UTF-8 format and print.
    # This converts the byte format of text data into a string for readability.
    print(en.decode('utf-8'))

print("----------------------------------------------")

# Tokenize the English examples using the preloaded tokenizer.
# This converts the sentences into sequences of tokens that the model can understand.
encoded = tokenizers.en.tokenize(en_examples)

# Iterate through the tokenized data.
for row in encoded.to_list():
    # Print the tokenized representation of each sentence.
    # Each sentence is represented as a list of integers, where each integer corresponds to a specific token.
    print(row)

print("----------------------------------------------")

# Detokenize the previously tokenized sentences to convert them back into readable text.
round_trip = tokenizers.en.detokenize(encoded)

# Iterate through the detokenized sentences.
for line in round_trip.numpy():
    # Decode each sentence from UTF-8 format and print.
    # This step demonstrates that the tokenization process can be reversed, returning to the original text.
    print(line.decode('utf-8'))

> Examples in Portuguese:
tf.Tensor(
[b'e quando melhoramos a procura , tiramos a \xc3\xbanica vantagem da impress\xc3\xa3o , que \xc3\xa9 a serendipidade .'
 b'mas e se estes fatores fossem ativos ?'
 b'mas eles n\xc3\xa3o tinham a curiosidade de me testar .'], shape=(3,), dtype=string)
----------------------------------------------
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .
----------------------------------------------


2025-05-08 16:02:15.460727: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-05-08 16:02:15.462255: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]
[2, 87, 90, 107, 76, 129, 1852, 30, 3]
[2, 87, 83, 149, 50, 9, 56, 664, 85, 2512, 15, 3]
----------------------------------------------
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n ' t test for curiosity .


In [13]:
# Define a constant for the maximum number of tokens allowed in a sequence.
MAX_TOKENS = 128

def filter_max_tokens(pt, en):
    """
    Filter out pairs of sentences where the number of tokens in either the Portuguese or English text exceeds MAX_TOKENS.

    Parameters:
    - pt: Portuguese text samples.
    - en: English text samples.

    Returns:
    - A boolean indicating whether the sentence pair should be included based on the token count.
    """
    # Calculate the maximum number of tokens between the Portuguese and English samples in the batch.
    num_tokens = tf.maximum(tf.shape(pt)[1], tf.shape(en)[1])
    # Return True if the number of tokens is less than the maximum allowed, otherwise False.
    return num_tokens < MAX_TOKENS

def tokenize_pairs(pt, en):
    """
    Tokenize pairs of sentences (Portuguese and English), converting them from ragged tensors to dense tensors,
    padding with zeros where necessary to achieve uniform length.

    Parameters:
    - pt: Batch of Portuguese text samples.
    - en: Batch of English text samples.

    Returns:
    - Tuple of tokenized and padded Portuguese and English tensors.
    """
    # Tokenize the Portuguese sentences and convert to a dense tensor with padding.
    pt = tokenizers.pt.tokenize(pt).to_tensor()
    # Tokenize the English sentences and convert to a dense tensor with padding.
    en = tokenizers.en.tokenize(en).to_tensor()
    # Return the tokenized and padded text pairs.
    # adding zeros to make them same length
    return pt, en

# Define constants for dataset preparation.
BUFFER_SIZE = 20000  # Size of the buffer for shuffling the dataset.
BATCH_SIZE = 64  # Number of samples per batch.

def make_batches(ds):
    """
    Prepare batches of the dataset for training or validation.

    Parameters:
    - ds: The dataset to process, either training or validation.

    Returns:
    - The processed dataset in batches, ready for training or evaluation.
    """
    return (
        ds
        .cache()  # Cache the dataset to memory to improve performance.
        .shuffle(BUFFER_SIZE)  # Shuffle the dataset with a buffer size of BUFFER_SIZE.
        .batch(BATCH_SIZE)  # Combine consecutive elements of the dataset into batches.
        .map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)  # Tokenize pairs of sentences in parallel.
        .filter(filter_max_tokens)  # Filter out batches where any sentence pair exceeds the maximum token limit.
        .prefetch(tf.data.AUTOTUNE)  # Prefetch batches for faster access.
    )

# Prepare the training and validation datasets by applying the make_batches function.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

In [14]:
def get_angles(pos, i, d_model):
    """
    Computes the angle rates for the positional encoding.
    The formula ensures that each dimension of the embedding varies at a different wavelength (some change faster than others).

    Parameters:
    - pos: Position index.
    - i: Dimension index.
    - d_model: Depth of the model (number of dimensions).

    Returns:
    - The angle rates for positional encoding.
    """
    # Calculate the angles based on position and dimension index.
    # This formula helps in varying the wavelength across different dimensions.
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    """
    Generates a positional encoding matrix.

    Parameters:
    - position: The maximum position index.
    - d_model: The depth of the model (number of dimensions).

    Returns:
    - A positional encoding matrix of shape (1, position, d_model).
    """
    # Generate angles based on positions and dimensions.
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # Apply sine to even indices in the angles array (2i).
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # Apply cosine to odd indices in the angles array (2i+1).
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    # Expand the dimensions to fit the model requirements.
    pos_encoding = angle_rads[np.newaxis, ...]
    # Cast the encoding to TensorFlow float32 type.
    return tf.cast(pos_encoding, dtype=tf.float32)

def create_padding_mask(seq):
    """
    Creates a padding mask for sequences.
    This mask hides the padding tokens (i.e., zeros) so they don't affect the attention mechanism. It returns 1s where padding exists, and 0s elsewhere.
    Parameters:
    - seq: The sequence of tokens.

    Returns:
    - A padding mask for the sequence.
    """
    # Create a mask where every zero in the sequence is marked with a 1 (padding) and others with a 0.
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # Add extra dimensions to the mask so it can be added to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # Shape: (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    """
    Creates a look-ahead mask to mask future tokens in a sequence.
    Each token can only see previous ones (or itself), but not the next ones, ensuring proper autoregressive behavior.
    Parameters:
    - size: Size of the mask.

    Returns:
    - A look-ahead mask of shape (size, size).
    """
    # Create a mask where every entry that is in the lower triangle (including the diagonal) is 0, and everything else is 1.
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # Shape: (seq_len, seq_len)

In [15]:
#Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    """
    Calculates the attention weights and applies them to the value vectors.

    Parameters:
    - q (query): Tensor with shape (..., seq_len_q, depth)
    - k (key): Tensor with shape (..., seq_len_k, depth)
    - v (value): Tensor with shape (..., seq_len_v, depth_v)
    - mask: (Optional) Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k).

    Returns:
    - output: The result of applying attention weights to the value vectors.
    - attention_weights: The attention weights.
    """
    # Compute the dot product of the query and key tensors. Transpose the key tensor for proper alignment.
    # This gives us a similarity score between each query and key.
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # Shape: (..., seq_len_q, seq_len_k)

    # Scale the dot product by the square root of the depth of the key tensor.
    # This helps in preventing the softmax function from having extremely small gradients.
    dk = tf.cast(tf.shape(k)[-1], tf.float32)  # Get the depth of the keys.
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Apply the mask if provided. The mask is used to nullify the effect of padding or future information.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  # Use a large negative number to mask.

    # Apply softmax to get the attention weights. The softmax is applied on the key sequence dimension.
    # It shows how much attention each word pays to others.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # Shape: (..., seq_len_q, seq_len_k)

    # Apply the attention weights to the value tensor to get the output.
    output = tf.matmul(attention_weights, v)  # Shape: (..., seq_len_q, depth_v)

    return output, attention_weights

In [16]:
#Attention spaces
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads):
        """
        Initialize the MultiHeadAttention layer.

        Parameters:
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        """
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads  # Number of attention heads.
        self.d_model = d_model  # Dimensionality of the model's output space.

        # Ensure the model's dimension is divisible by the number of heads to evenly distribute dimensions to each head.
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads  # Dimensionality per attention head.

        # Define dense layers for the queries, keys, and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Final dense layer.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (num_heads, depth) and transpose the result.

        Parameters:
        - x: Input tensor.
        - batch_size: Size of the batch.

        Returns:
        - Tensor with shape (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """
        The logic for the multi-head attention layer's forward pass.

        Parameters:
        - v: Value tensor.
        - k: Key tensor.
        - q: Query tensor.
        - mask: Mask to be applied.

        Returns:
        - output: Output tensor.
        - attention_weights: Attention weights.
        """
        batch_size = tf.shape(q)[0]

        # Apply dense layers to queries, keys, and values.
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        # Split the dense outputs into multiple heads and transpose.
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # Perform scaled dot product attention.
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Transpose and reshape the attention output to match the input's dimensionality.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        # Apply the final dense layer.
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [17]:
def point_wise_feed_forward_network(d_model, dff):
    """
    Creates a point-wise feed forward network. This consists of two dense layers with a ReLU activation
    in between, which is used within each transformer block.

    Parameters:
    - d_model: The dimensionality of the input and output of the transformer model.
    - dff: The dimensionality of the inner layer, typically much larger than d_model to allow
           the model to combine features in the data in a high-dimensional space before projecting
           back down to d_model dimensions.

    Returns:
    - A tf.keras.Sequential model representing the feed forward network.
    """
    return tf.keras.Sequential([
        # First dense layer with dff units and ReLU activation. This expands the dimensionality to dff,
        # allowing the network to learn more complex features.
        tf.keras.layers.Dense(dff, activation='relu'),  # Output shape: (batch_size, seq_len, dff)

        # Second dense layer that projects the outputs back down to d_model dimensions.
        tf.keras.layers.Dense(d_model)  # Output shape: (batch_size, seq_len, d_model)
    ])

The EncoderLayer **combines**:

* Multi-Head Self-Attention

* Feed Forward Network

* Layer Normalization

* Dropout

* Residual Connections

Each EncoderLayer **learns** to:

* Focus on different parts of the input through self-attention.

* Extract features and relationships (such as distances) between words.

* Stabilize training using layer normalization and residual connections.*κείμενο σε πλάγια γραφή*

In [18]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, rate=0.1):
        """
        Initializes the EncoderLayer with multi-head attention, point-wise feed-forward network,
        dropout, and layer normalization components.

        Parameters:
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - rate: Dropout rate.
        """
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)  # Multi-head attention layer.
        self.ffn = point_wise_feed_forward_network(d_model, dff)  # Point-wise feed-forward network.

        # Layer normalization (first instance).
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # Layer normalization (second instance).
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # Dropout (first instance).
        self.dropout1 = tf.keras.layers.Dropout(rate)
        # Dropout (second instance).
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """
        The logic for one pass of the encoder layer.

        Parameters:
        - x: Input tensor.
        - training: Boolean indicating if the layer should behave in training mode (applying dropout) or in inference mode.
        - mask: Mask to be applied on the multi-head attention layer.

        Returns:
        - The output tensor of the encoder layer.
        """
        # Apply multi-head attention to the input (self attention).
        attn_output, _ = self.mha(x, x, x, mask)  # Output shape: (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)  # Apply dropout to the attention output.

        # Add & normalize.
        out1 = self.layernorm1(x + attn_output)  # Residual connection followed by layer normalization.

        # Apply the feed-forward network to the normalized attention output.
        ffn_output = self.ffn(out1)  # Output shape: (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)  # Apply dropout to the feed-forward network output.

        # Final add & normalize step.
        out2 = self.layernorm2(out1 + ffn_output)  # Residual connection followed by another layer normalization.

        return out2

Each DecoderLayer **learns** to:
* Attend to previous tokens in the target sequence using self-attention, while respecting autoregressive ordering with a look-ahead mask.

* Focus on relevant parts of the input (encoder output) via encoder-decoder attention, aligning source and target information.

* Transform the combined context with a feed-forward network, learning richer representations.

* Stabilize training using dropout, layer normalization, and residual connections throughout.

In [19]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, rate=0.1):
        """
        Initializes the DecoderLayer with two multi-head attention mechanisms, a point-wise feed-forward network,
        dropout, and layer normalization components.

        Parameters:
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - rate: Dropout rate.
        """
        super(DecoderLayer, self).__init__()
        # First multi-head attention layer (self-attention).
        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)

        # Second multi-head attention layer (encoder-decoder attention).
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)

        # Point-wise feed-forward network.
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # Layer normalization (first instance).
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # Layer normalization (second instance).
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # Layer normalization (third instance).
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # Dropout (first instance).
        self.dropout1 = tf.keras.layers.Dropout(rate)
        # Dropout (second instance).
        self.dropout2 = tf.keras.layers.Dropout(rate)
        # Dropout (third instance).
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, *, training, look_ahead_mask, padding_mask):
        """
        The logic for one pass of the decoder layer.

        Parameters:
        - x: Input tensor (target sequence).
        - enc_output: Output tensor from the encoder.
        - training: Boolean indicating if the layer should behave in training mode or inference mode.
        - look_ahead_mask: Mask to prevent the decoder from 'seeing' future tokens.
        - padding_mask: Mask to exclude the attention on padding tokens.

        Returns:
        - The output tensor of the decoder layer.
        - attn_weights_block1: Attention weights from the first multi-head attention layer.
        - attn_weights_block2: Attention weights from the second multi-head attention layer.
        """
        # Apply the first multi-head attention layer (self attention).
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)  # Apply residual connection and layer normalization.

        # Apply the second multi-head attention layer (encoder-decoder attention).
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # Apply residual connection and layer normalization.

        # Apply the feed-forward network.
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # Apply residual connection and layer normalization.

        return out3, attn_weights_block1, attn_weights_block2

In [20]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        """
        Initializes the Encoder part of the Transformer.

        Parameters:
        - num_layers: Number of encoder layers.
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - input_vocab_size: Size of the input vocabulary.
        - rate: Dropout rate.
        """
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        # Embedding layer for the input tokens.
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)

        # Positional encoding up to MAX_TOKENS.
        self.pos_encoding = positional_encoding(MAX_TOKENS, self.d_model)

        # Encoder layers
        self.enc_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, rate=rate) for _ in range(num_layers)]

        # Dropout layer.
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # Adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # Scale embeddings.
        x += self.pos_encoding[:, :seq_len, :]  # Add position encoding.
        x = self.dropout(x, training=training)  # Apply dropout.

        # Pass the input through each encoder layer.
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)


        return x  # (batch_size, input_seq_len, d_model)

In [21]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        """
        Initializes the Decoder part of the Transformer.

        Parameters:
        - num_layers: Number of decoder layers.
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - target_vocab_size: Size of the target vocabulary.
        - rate: Dropout rate.
        """
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        # Embedding layer for the input tokens.
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)

        # Positional encoding up to MAX_TOKENS.
        self.pos_encoding = positional_encoding(MAX_TOKENS, self.d_model)

        # Decoder layers
        self.dec_layers = [DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, rate=rate) for _ in range(num_layers)]

        # Dropout layer.
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # Adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # Scale embeddings.
        x += self.pos_encoding[:, :seq_len, :]  # Add position encoding.
        x = self.dropout(x, training=training)  # Apply dropout.

        # Pass the input through each decoder layer.
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](
                x,
                enc_output,
                training=training,
                look_ahead_mask=look_ahead_mask,
                padding_mask=padding_mask)
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        return x, attention_weights  # (batch_size, target_seq_len, d_model), attention weights

In [22]:
class Transformer(tf.keras.Model):
    def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, rate=0.1):
        """
        Initializes the Transformer model.

        Parameters:
        - num_layers: Number of layers in both the encoder and decoder.
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - input_vocab_size: Size of the input vocabulary.
        - target_vocab_size: Size of the target vocabulary.
        - rate: Dropout rate.
        """
        super().__init__()
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, input_vocab_size=input_vocab_size, rate=rate)
        self.decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, target_vocab_size=target_vocab_size, rate=rate)

        # Final linear layer that projects the decoder's output to the target vocabulary size.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training):
        """
        The logic for one forward pass through the model.

        Parameters:
        - inputs: A tuple of input tensor and target tensor.
        - training: Boolean indicating if the layer should behave in training mode or inference mode.

        Returns:
        - final_output: The model's output predictions.
        - attention_weights: The attention weights from the decoder.
        """
        inp, tar = inputs

        # Create masks for padding and future tokens.
        enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, tar)

        # Pass the input through the encoder.
        #enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        #self.encoder(inp, training=training, mask=enc_padding_mask)
        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)



        # Pass the encoder output and target through the decoder.
        #dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        #self.decoder(tar, enc_output, training=training,
         #    look_ahead_mask=look_ahead_mask,
          #   padding_mask=dec_padding_mask)

        dec_output, attention_weights = self.decoder(
              tar,
              enc_output,
              training=training,
              look_ahead_mask=look_ahead_mask,
              padding_mask=dec_padding_mask)


        # Pass the decoder output through the final linear layer.
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights

    def create_masks(self, inp, tar):
        """
        Creates masks for padding and look ahead for the encoder and decoder.

        Parameters:
        - inp: Input tensor.
        - tar: Target tensor.

        Returns:
        - enc_padding_mask: Padding mask for the encoder.
        - look_ahead_mask: Look-ahead mask for the decoder.
        - dec_padding_mask: Padding mask for the decoder to mask the encoder outputs.
        """
        # Encoder padding mask.
        enc_padding_mask = create_padding_mask(inp)

        # Decoder padding mask for the second attention block (to mask encoder outputs).
        dec_padding_mask = create_padding_mask(inp)

        # Look-ahead mask (to mask future tokens) and decoder target padding mask combined.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, look_ahead_mask, dec_padding_mask

In [23]:
# -------------------------------
# Transformer Model Hyperparameters
# -------------------------------

num_layers = 4 # number of encoder/decoder layers in the Transformer
d_model = 128 # size of the embedding vector for each word
dff = 512 # size of the hidden layer inside the Feed Forward Neural Network
num_heads = 8 # number of attention heads (must divide d_model evenly)
dropout_rate = 0.1 # dropout rate for regularization

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)  # Model dimensionality, cast to float32 for calculation.
        self.warmup_steps = warmup_steps  # Number of steps to linearly increase the learning rate.

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)  # Linearly increase then decrease based on warmup steps.
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)  # Calculate the learning rate.
        # Final learning rate: (1 / sqrt(d_model)) * min(arg1, arg2)

# Instantiate the learning rate schedule and Adam optimizer
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,         # First moment decay (default)
    beta_2=0.98,        # Second moment decay (used in the original Transformer paper)
    epsilon=1e-9        # Small value to avoid division by zero
)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Create a mask for non-zero tokens.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss_ = loss_object(real, pred)  # Calculate loss using some loss object not defined here.
    mask = tf.cast(mask, dtype=loss_.dtype)  # Cast mask to the same type as the loss.
    loss_ *= mask  # Apply mask to the loss.
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)  # Calculate the average loss.

def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))  # Check if real values match predictions.
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # Create a mask for non-zero tokens.
    accuracies = tf.math.logical_and(mask, accuracies)  # Apply mask to accuracies.
    accuracies = tf.cast(accuracies, dtype=tf.float32)  # Cast to float32 for calculation.
    mask = tf.cast(mask, dtype=tf.float32)  # Cast mask to float32.
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)  # Calculate the average accuracy.


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [24]:
# -------------------------------
# Instantiate the Transformer model
# -------------------------------
transformer = Transformer(
    num_layers=num_layers,                        # Number of encoder and decoder layers
    d_model=d_model,                              # Embedding size / model dimensionality
    num_heads=num_heads,                          # Number of attention heads
    dff=dff,                                      # Hidden layer size in feed-forward network
    input_vocab_size=tokenizers.pt.get_vocab_size().numpy(),    # Portuguese vocabulary size
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),   # English vocabulary size
    rate=dropout_rate                             # Dropout rate
)

# -------------------------------
# Checkpointing: Saving and restoring model state
# -------------------------------

checkpoint_path = './checkpoints/train'   # Directory to save training checkpoints

# Create a checkpoint object that tracks the transformer and optimizer state
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# Manage multiple checkpoints (e.g., keep the 5 latest ones)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a previous checkpoint exists, restore the model and optimizer state
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

# -------------------------------
# Define input signature for tf.function (for performance optimization)
# Used to decorate the training step function later
# -------------------------------
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),  # Input sequence shape: (batch_size, input_seq_len)
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),  # Target sequence shape: (batch_size, target_seq_len)
]


In [25]:
@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """
    Performs a single training step for the Transformer model.

    Args:
    inp: Input sequence batch (Portuguese).
    tar: Target sequence batch (English).

    The target sequence is split into decoder inputs (`tar_inp`) and the real outputs (`tar_real`).
    The model makes predictions for `tar_inp` and is trained against `tar_real`.
    """
    # Prepare target inputs and outputs
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar_inp], training=True)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    # Update the training loss and accuracy metrics
    train_loss(loss)
    train_accuracy(accuracy_function(tar_real, predictions))

EPOCHS = 20
for epoch in range(EPOCHS):
    start = time.time()

    # Reset the metrics at the start of the next epoch
    # Instead of using train_loss.reset_states(), you can re-initialize the Mean object for each epoch
    # Replace:
    # train_loss.reset_states()
    # train_accuracy.reset_states()

    # With:
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

    # Training loop - iterate over the training set
    for (batch, (inp, tar)) in enumerate(train_batches):
        train_step(inp, tar)

        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    # Save the model every 5 epochs
    if (epoch + 1) % 3 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 Loss 8.9166 Accuracy 0.0000
Epoch 1 Batch 50 Loss 8.8400 Accuracy 0.0014
Epoch 1 Batch 100 Loss 8.7198 Accuracy 0.0225
Epoch 1 Batch 150 Loss 8.5968 Accuracy 0.0363
Epoch 1 Batch 200 Loss 8.4520 Accuracy 0.0442
Epoch 1 Batch 250 Loss 8.2781 Accuracy 0.0528
Epoch 1 Batch 300 Loss 8.0856 Accuracy 0.0627
Epoch 1 Batch 350 Loss 7.8868 Accuracy 0.0711
Epoch 1 Batch 400 Loss 7.6966 Accuracy 0.0781
Epoch 1 Batch 450 Loss 7.5284 Accuracy 0.0844
Epoch 1 Batch 500 Loss 7.3794 Accuracy 0.0903
Epoch 1 Batch 550 Loss 7.2429 Accuracy 0.0964
Epoch 1 Batch 600 Loss 7.1193 Accuracy 0.1031
Epoch 1 Batch 650 Loss 7.0035 Accuracy 0.1096
Epoch 1 Batch 700 Loss 6.8944 Accuracy 0.1156


2025-05-08 16:32:56.850818: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1 Loss 6.8927 Accuracy 0.1156
Time taken for 1 epoch: 1840.09 secs

Epoch 2 Batch 0 Loss 0.0000 Accuracy 0.0000
Epoch 2 Batch 50 Loss 0.0000 Accuracy 0.0000
Epoch 2 Batch 100 Loss 0.0000 Accuracy 0.0000
Epoch 2 Batch 150 Loss 0.0000 Accuracy 0.0000
Epoch 2 Batch 200 Loss 0.0000 Accuracy 0.0000
Epoch 2 Batch 250 Loss 0.0000 Accuracy 0.0000


KeyboardInterrupt: 

In [None]:
class Translator(tf.Module):
    def __init__(self, tokenizers, transformer):
        """
        Initializes the Translator module.

        Parameters:
        - tokenizers: A collection containing source (Portuguese) and target (English) tokenizers.
        - transformer: The trained Transformer model for translation.
        """
        self.tokenizers = tokenizers
        self.transformer = transformer

    def __call__(self, sentence, max_length=128):
        """
        Translates a Portuguese sentence to English.

        Parameters:
        - sentence: The input sentence, a tf.Tensor of Portuguese text.
        - max_length: The maximum length of the translated sequence.

        Returns:
        - text: The translated sentence.
        - tokens: The tokenized representation of the translated sentence.
        - attention_weights: The attention weights from the Transformer model.
        """
        assert isinstance(sentence, tf.Tensor), "Input must be a tf.Tensor"
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]

        # Tokenize the input sentence.
        sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()
        encoder_input = sentence

        # Initialize the output with the English start token.
        start_end = self.tokenizers.en.tokenize([''])[0]
        start, end = start_end[0][tf.newaxis], start_end[1][tf.newaxis]

        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            output = tf.transpose(output_array.stack())
            predictions, _ = self.transformer([encoder_input, output], training=False)

            # Select the last token from the predictions.
            predictions = predictions[:, -1:, :]
            predicted_id = tf.argmax(predictions, axis=-1)

            # Concatenate the predicted_id to the output.
            output_array = output_array.write(i+1, predicted_id[0])

            if tf.reduce_all(predicted_id == end):
                break

        output = tf.transpose(output_array.stack())

        # Convert the output tokens to text and lookup the corresponding words.
        text = self.tokenizers.en.detokenize(output)[0]  # Convert tokens to text.
        tokens = self.tokenizers.en.lookup(output)[0]    # Convert tokens to words.

        # Recalculate attention weights with the final output for visualization.
        _, attention_weights = self.transformer([encoder_input, output[:, :-1]], training=False)

        return text, tokens, attention_weights


In [None]:
# Initialize the translator module with the necessary tokenizers and transformer model.
translator = Translator(tokenizers, transformer)

def print_translation(sentence, translated_text, ground_truth):
    """
    Prints the original sentence, the model's translation, and the ground truth translation.

    Args:
    - sentence: The original sentence (in Portuguese).
    - translated_text: The translated sentence (in English) produced by the model.
    - ground_truth: The actual translation of the sentence (ground truth).
    """
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {translated_text.numpy().decode("utf-8")}')
    print(f'{"Ground truth":15s}: {ground_truth}')

# The Portuguese sentence to be translated.
sentence = 'os meus vizinhos ouviram sobre esta ideia.'

# The ground truth English translation of the Portuguese sentence.
ground_truth = 'and my neighboring homes heard about this idea .'

# Perform the translation using the `translator` instance.
translated_text, translated_tokens, attention_weights = translator(tf.constant(sentence))

# Print the results.
print_translation(sentence, translated_text, ground_truth)

Input:         : os meus vizinhos ouviram sobre esta ideia.
Prediction     : robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic robotic rob