## Imports

In [1]:
import numpy as np
import kagglehub
import os
import pandas as pd
from pathlib import Path
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Concatenate
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

https://medium.com/@funcry/in-depth-understanding-of-attention-mechanism-part-ii-scaled-dot-product-attention-and-its-7743804e610e

In [2]:
def softmax(x):
    x_max = np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(x - x_max)
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

# Numpy implementation
def scaled_dot_product_attention(Q, K, V):
    d = Q.shape[-1]
    scores = np.dot(Q, K.T) / np.sqrt(d)
    attention_weights = softmax(scores)
    output = np.dot(attention_weights, V)
    return output, attention_weights


# Part 2

Dataset: https://www.kaggle.com/datasets/shahadhamza/multi30k-dataset


*   train.en
  *   English sentences to train model on
*   train.fr
  *   French sentences to train model on
*   val.en
  *   English sentences for validation
*   val.fr
  *   French sentences for validation



In [3]:
# Implementaiton of scaled dot product attention that works with tensorflow
class ScaledDotProductAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(ScaledDotProductAttention, self).__init__(**kwargs)

    def call(self, inputs):
        Q, K, V = inputs
        d_k = tf.cast(tf.shape(Q)[-1], tf.float32)
        scores = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(d_k)
        weights = tf.nn.softmax(scores, axis=-1)
        output = tf.matmul(weights, V)
        return output

path = kagglehub.dataset_download("shahadhamza/multi30k-dataset")
train_en = Path(os.path.join(path, "train.en")).read_text(encoding="utf-8").splitlines()
train_fr = Path(os.path.join(path, "train.fr")).read_text(encoding="utf-8").splitlines()
val_en = Path(os.path.join(path, "val.en")).read_text(encoding="utf-8").splitlines()
val_fr = Path(os.path.join(path, "val.fr")).read_text(encoding="utf-8").splitlines()

train_df = pd.DataFrame({"en": train_en, "fr": train_fr}).iloc[:10000]
val_df = pd.DataFrame({"en": val_en, "fr": val_fr}).iloc[:1000]

vocab_size_en = 10000
vocab_size_fr = 10000
embedding_dim = 256
units = 512
max_len = 40

text_vectorizer_en = TextVectorization(output_sequence_length=max_len, max_tokens=vocab_size_en)
text_vectorizer_fr = TextVectorization(output_sequence_length=max_len, max_tokens=vocab_size_fr)

text_vectorizer_en.adapt(train_df["en"])
text_vectorizer_fr.adapt(train_df["fr"])

X_train = text_vectorizer_en(np.array(train_df["en"]))
y_train = text_vectorizer_fr(np.array(train_df["fr"]))

# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
enc_emb = Embedding(vocab_size_en, embedding_dim, name='encoder_embedding')(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(units, return_sequences=True, return_state=True, name='encoder_lstm')(enc_emb)

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
dec_emb = Embedding(vocab_size_fr, embedding_dim, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs = decoder_lstm(dec_emb, initial_state=[state_h, state_c])[0]

# Scaled Dot-Product Attention
attention = ScaledDotProductAttention(name='attention')
context = attention([decoder_outputs, encoder_outputs, encoder_outputs])

# Concatenate decoder output and context vector
combined = Concatenate(name='concatenate')([decoder_outputs, context])
output = Dense(vocab_size_fr, activation="softmax", name='output_dense')(combined)

model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


Downloading from https://www.kaggle.com/api/v1/datasets/download/shahadhamza/multi30k-dataset?dataset_version_number=1...


100%|██████████| 1.21M/1.21M [00:00<00:00, 77.6MB/s]

Extracting files...





# Part 3

In [4]:
decoder_target = y_train[:, 1:]
decoder_input = y_train[:, :-1]

# Convert tensors to NumPy arrays for train_test_split
X_train_np = X_train.numpy()
decoder_input_np = decoder_input.numpy()
decoder_target_np = decoder_target.numpy()

X_en_train, X_en_val, X_fr_in_train, X_fr_in_val, X_fr_out_train, X_fr_out_val = train_test_split(
    X_train_np, decoder_input_np, decoder_target_np, test_size=0.1, random_state=42)

history = model.fit(
    [X_en_train, X_fr_in_train],
    X_fr_out_train,
    validation_data=([X_en_val, X_fr_in_val], X_fr_out_val),
    batch_size=64,
    epochs=10,
    verbose=1
)

def create_inference_models(trained_model):
    # Encoder model
    encoder_model = Model(trained_model.input[0],
                         [trained_model.get_layer('encoder_lstm').output[0],
                          trained_model.get_layer('encoder_lstm').output[1],
                          trained_model.get_layer('encoder_lstm').output[2]])

    # Decoder model inputs
    decoder_inputs = Input(shape=(None,), name='decoder_inputs_inf')
    decoder_state_input_h = Input(shape=(units,), name='decoder_state_h')
    decoder_state_input_c = Input(shape=(units,), name='decoder_state_c')
    encoder_outputs_input = Input(shape=(None, units), name='encoder_outputs_inf')

    # Decoder layers
    dec_emb_inf = trained_model.get_layer('decoder_embedding')(decoder_inputs)
    decoder_outputs_inf, state_h_inf, state_c_inf = trained_model.get_layer('decoder_lstm')(
        dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c])

    # Attention
    attention_output_inf = trained_model.get_layer('attention')(
        [decoder_outputs_inf, encoder_outputs_input, encoder_outputs_input])

    # Final layers
    combined_inf = trained_model.get_layer('concatenate')([decoder_outputs_inf, attention_output_inf])
    decoder_outputs_final = trained_model.get_layer('output_dense')(combined_inf)

    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
        [decoder_outputs_final, state_h_inf, state_c_inf])

    return encoder_model, decoder_model

def translate_sentence(encoder_model, decoder_model, input_seq, max_len_decode=40):
    # Encode the input sentence
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # Initialize decoder input with start token (assuming token 1 is start)
    target_seq = np.array([[1]])

    decoded_tokens = []

    for _ in range(max_len_decode):
        # Predict next token
        output_tokens, h, c = decoder_model.predict(
            [target_seq, state_h, state_c, encoder_outputs], verbose=0)

        # Sample token with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Exit if hits padding token or end token
        if sampled_token_index == 0:
            break

        decoded_tokens.append(sampled_token_index)

        # Update target sequence and states for next iteration
        target_seq = np.array([[sampled_token_index]])
        state_h, state_c = h, c

    return decoded_tokens

encoder_model, decoder_model = create_inference_models(model)

smoothie = SmoothingFunction().method4
pred_sentences = []
true_sentences = []

# Lookup for decoding token IDs to words
vocab_fr = text_vectorizer_fr.get_vocabulary()
vocab_fr_lookup = dict(enumerate(vocab_fr))

# Use validation data for evaluation
val_X = text_vectorizer_en(np.array(val_df["en"][:100]))

for i in range(10):
    input_seq = val_X[i:i+1]

    # Generate translation using proper inference
    pred_ids = translate_sentence(encoder_model, decoder_model, input_seq)

    # Convert token IDs to strings
    pred_tokens_raw = []
    for tok_id in pred_ids:
        # Skip padding tokens
        if tok_id != 0:
            # Get word, empty string if not found
            word = vocab_fr_lookup.get(tok_id, '')
            pred_tokens_raw.append(word)

    # Remove empty strings and whitespace-only tokens
    pred_tokens = []
    for token in pred_tokens_raw:
        # Check if token has non-whitespace content
        if token.strip():
            pred_tokens.append(token)

    # Get true reference
    true_tokens = val_df["fr"].iloc[i].split()

    pred_sentences.append(pred_tokens)
    true_sentences.append(true_tokens)

    print(f"Example {i+1}:")
    print(f"English: {val_df['en'].iloc[i]}")
    print(f"True French: {' '.join(true_tokens)}")
    print(f"Predicted: {' '.join(pred_tokens)}")
    print("========================================")

# BLEU calculation
bleu_scores = []
for i in range(len(true_sentences)):
    ref = true_sentences[i]
    pred = pred_sentences[i]
    if pred:  # Only calculate if we have predictions
        bleu = sentence_bleu([ref], pred, smoothing_function=smoothie)
        bleu_scores.append(bleu)
print("Average BLEU Score: ", np.mean(bleu_scores))

Epoch 1/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 129ms/step - accuracy: 0.6892 - loss: 2.8720 - val_accuracy: 0.7292 - val_loss: 1.7343
Epoch 2/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 126ms/step - accuracy: 0.7328 - loss: 1.6505 - val_accuracy: 0.7545 - val_loss: 1.4496
Epoch 3/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 123ms/step - accuracy: 0.7582 - loss: 1.3683 - val_accuracy: 0.7671 - val_loss: 1.3082
Epoch 4/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 125ms/step - accuracy: 0.7684 - loss: 1.2226 - val_accuracy: 0.7778 - val_loss: 1.2303
Epoch 5/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 129ms/step - accuracy: 0.7787 - loss: 1.1093 - val_accuracy: 0.7823 - val_loss: 1.1840
Epoch 6/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 126ms/step - accuracy: 0.7890 - loss: 1.0129 - val_accuracy: 0.7867 - val_loss: 1.1509
Epoch 7/10

# Part 4

In [5]:
# Transformer hyperparameters
d_model = 64
num_heads = 2
dff= 128
num_layers = 2
vocab_size = 10000
max_len = 40

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, d_model, max_length=5000):
        super().__init__()
        self.d_model = d_model

        # Precompute the positional encodings
        pe = np.zeros((max_length, d_model))
        position = np.arange(0, max_length)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(position * div_term)
        pe[:, 1::2] = np.cos(position * div_term)
        self.pe = tf.constant(pe[np.newaxis, :, :], dtype=tf.float32)

    def call(self, x):
        # Add positional encoding to input embeddings
        return x + self.pe[:, :tf.shape(x)[1], :]

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Linear layers for Q, K, V
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output linear projection
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # Split the last dimension into (num_heads, depth)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]

        # Apply linear projections
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # Split into heads
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Scaled dot-product attention
        d_k = tf.cast(tf.shape(q)[-1], tf.float32)
        scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(d_k)

        if mask is not None:
          scores += (mask)

        # Attention weights
        weights = tf.nn.softmax(scores, axis=-1)
        # Apply attention weights to values
        output = tf.matmul(weights, v)

        # Concatenate heads and project
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))
        return self.dense(output)

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, is_decoder=False):
        super().__init__()
        self.is_decoder = is_decoder
        # Self-attention
        self.mha1 = MultiHeadAttention(d_model, num_heads)

        # Cross-attention (only for decoder)
        if is_decoder:
          self.mha2 = MultiHeadAttention(d_model, num_heads)

        # Feed-forward network
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        # Layer normalizations
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        if is_decoder:
            self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        else:
            self.layernorm3 = None

    def call(self, x, enc_output=None, look_ahead_mask=None, padding_mask=None, training=None):
        # Self-attention
        attn1 = self.mha1(x, x, x, look_ahead_mask if self.is_decoder else padding_mask)
        x = self.layernorm1(x + attn1)

        # Cross-attention (decoder only)
        if self.is_decoder and enc_output is not None:
            attn2 = self.mha2(enc_output, enc_output, x, padding_mask)
            x = self.layernorm2(x + attn2)

        # FFN
        ffn_output = self.ffn(x)
        return self.layernorm3(x + ffn_output) if self.is_decoder else self.layernorm2(x + ffn_output)

class SimplifiedTransformer(tf.keras.Model):
    def __init__(self):
        super().__init__()
        # Input Embeddings + Positional Encoding
        self.enc_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.dec_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)

        # Transformer blocks
        self.encoder_layers = []
        for i in range(num_layers):
            encoder_layer = TransformerBlock(d_model, num_heads, dff, is_decoder=False)
            self.encoder_layers.append(encoder_layer)

        # Create decoder layers
        self.decoder_layers = []
        for i in range(num_layers):
            decoder_layer = TransformerBlock(d_model, num_heads, dff, is_decoder=True)
            self.decoder_layers.append(decoder_layer)

        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, training=None):
        inp, tar = inputs

        # Encoder
        x = self.enc_embedding(inp) * tf.sqrt(tf.cast(d_model, tf.float32))
        x = self.pos_encoding(x)
        enc_mask = tf.cast(tf.math.equal(inp, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

        # Encoder padding
        for layer in self.encoder_layers:
            x = layer(x, padding_mask=enc_mask, training=training)
        enc_output = x

        # Decoder
        x = self.dec_embedding(tar) * tf.sqrt(tf.cast(d_model, tf.float32))
        x = self.pos_encoding(x)

        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((tf.shape(tar)[1], tf.shape(tar)[1])), -1, 0)
        dec_mask = tf.cast(tf.math.equal(tar, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
        combined_mask = tf.maximum(dec_mask, look_ahead_mask)

        for layer in self.decoder_layers:
            x = layer(x, enc_output=enc_output, look_ahead_mask=combined_mask,
                     padding_mask=enc_mask, training=training)

        return self.final_layer(x)


In [6]:

# Tokenize data
tokenizer_en = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_fr = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_en.fit_on_texts(train_df["en"])
tokenizer_fr.fit_on_texts(train_df["fr"])

train_en_seq = pad_sequences(tokenizer_en.texts_to_sequences(train_df["en"]), maxlen=max_len, padding='post')
train_fr_seq = pad_sequences(tokenizer_fr.texts_to_sequences(train_df["fr"]), maxlen=max_len, padding='post')

# Transformer model
transformer = SimplifiedTransformer()

def masked_loss(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = tf.keras.losses.sparse_categorical_crossentropy(real, pred, from_logits=True)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

transformer.compile(optimizer='adam', loss=masked_loss, metrics=['accuracy'])

# Train
transformer.fit([train_en_seq, train_fr_seq[:, :-1]], train_fr_seq[:, 1:],
                batch_size=64, epochs=8, verbose=1)

# Evaluate BLEU
def translate(sentence):
    seq = pad_sequences(tokenizer_en.texts_to_sequences([sentence]), maxlen=max_len, padding='post')
    output = tf.expand_dims([1], 0)

    for _ in range(max_len):
        pred = transformer([seq, output], training=False)
        pred_id = tf.cast(tf.argmax(pred[:, -1:, :], axis=-1), tf.int32)
        if pred_id == 0:
          break
        output = tf.concat([output, pred_id], axis=-1)

    return tf.squeeze(output, axis=0)[1:].numpy()

# Calculate BLEU
smoothie = SmoothingFunction().method4
bleu_scores = []
fr_word_index = {v: k for k, v in tokenizer_fr.word_index.items()}

for i in range(10):
    pred_ids = translate(val_df["en"].iloc[i])

    pred_words = []

    for token_id in pred_ids:
        if token_id != 0:
            # Get the word for this ID, or '' if not found
            word = fr_word_index.get(token_id, '')
            pred_words.append(word)

    true_words = val_df["fr"].iloc[i].split()

    if pred_words:
        bleu_scores.append(sentence_bleu([true_words], pred_words, smoothing_function=smoothie))

transformer_bleu = np.mean(bleu_scores)

print(f"Average BLEU Score: {transformer_bleu:.4f}")

Epoch 1/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 66ms/step - accuracy: 0.0177 - loss: 7.3083
Epoch 2/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.1420 - loss: 3.6167
Epoch 3/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.2263 - loss: 1.6384
Epoch 4/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.2577 - loss: 0.9032
Epoch 5/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.2731 - loss: 0.5721
Epoch 6/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.2818 - loss: 0.3718
Epoch 7/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.2846 - loss: 0.2527
Epoch 8/8
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.2888 - loss: 0.1686
Average BLEU Score: 0.0015


## Analysis

The model in Part 4 performs worse than the model in Part 2. Although, the model in Part 4 sholud be performing better than the model in Part 2. It might be doing worse due to the fact that the validation set I used was relatively small. Increasing the size might make it perform better, but my computer takes too long to run this. Furthermore, I noticed that for each epoch, the model in Part 4 completed each epoch significantly faster than the model in part 2.