<a href="https://colab.research.google.com/github/ANS1514/Projects_in_AI-ML/blob/main/HW5Task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Parts 1 + 2
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Layer, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction, sentence_bleu
# For part 3 + 4
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


Part 1 (10 points): Implement the scaled dot-product attention as discussed in class
(lecture 14) from scratch (use NumPy and pandas only, no deep learning libraries are
allowed for this step).

In [2]:
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
    attention_weights = softmax(scores)
    output = np.matmul(attention_weights, V)

    return output, attention_weights

Part 2 (10 points): Pick any encoder-decoder seq2seq model (as discussed in class) and
integrate the scaled dot-product attention in the encoder architecture. You may come
up with your own technique of integration or adopt one from literature. Hint: See
Bahdanau or Luong attention paper presented in class (lecture 14)

In [4]:
class ScaledDotProductAttention(Layer):
    def __init__(self, **kwargs):
        super(ScaledDotProductAttention, self).__init__(**kwargs)

    def call(self, inputs):
        Q, K, V = inputs
        d_k = tf.cast(tf.shape(K)[-1], tf.float32)
        # Scaled dot-product
        scores = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(d_k)
        # Apply softmax
        attn_weights = tf.nn.softmax(scores, axis=-1)
        output = tf.matmul(attn_weights, V)
        return output

Part 3 (5 points): Pick any public dataset of your choice (use a small-scale dataset like a
subset of the Tatoeba or Multi30k dataset) for machine translation task. Train your
model from Part 2 for the machine translation task. Evaluate test set by reporting the
BLEU Score

In [6]:
# Data Processing
nltk.download('punkt')

data = pd.read_csv("/content/tatoeba.tsv", sep="\t", header=None, quoting=3)
data.columns = ["id_src", "eng", "id_tgt", "de"]

data_shuffled = data.sample(frac=1, random_state=42)
data_unique = data_shuffled.drop_duplicates(subset=["eng"]).reset_index(drop=True)

subset_size = 10000
data_subset = data_unique.head(subset_size)
print("Number of samples in our subset:", len(data_subset))

data_pairs = list(zip(data_subset["eng"], data_subset["de"]))

# Shuffle and split
random.shuffle(data_pairs)
split_idx = int(0.8 * len(data_pairs))
train_pairs = data_pairs[:split_idx]
test_pairs = data_pairs[split_idx:]

# Function to preprocess sentences
def preprocess_sentence(sentence, is_target=False):
    sentence = sentence.lower().strip()
    if is_target:
        sentence = '<start> ' + sentence + ' <end>'
    return sentence

# Preprocess data
train_src = [preprocess_sentence(pair[0]) for pair in train_pairs]
train_tgt = [preprocess_sentence(pair[1], is_target=True) for pair in train_pairs]
test_src = [preprocess_sentence(pair[0]) for pair in test_pairs]
test_tgt = [preprocess_sentence(pair[1], is_target=True) for pair in test_pairs]

# Create tokenizers for source and target
src_tokenizer = Tokenizer(filters='')
src_tokenizer.fit_on_texts(train_src)
tgt_tokenizer = Tokenizer(filters='')
tgt_tokenizer.fit_on_texts(train_tgt)

# Vocabulary sizes
src_vocab_size = len(src_tokenizer.word_index) + 1
tgt_vocab_size = len(tgt_tokenizer.word_index) + 1

# Convert sentences to sequences
train_src_seq = src_tokenizer.texts_to_sequences(train_src)
train_tgt_seq = tgt_tokenizer.texts_to_sequences(train_tgt)
test_src_seq = src_tokenizer.texts_to_sequences(test_src)
test_tgt_seq = tgt_tokenizer.texts_to_sequences(test_tgt)

# Pad sequences to the maximum length found in training
max_src_length = max(len(seq) for seq in train_src_seq)
max_tgt_length = max(len(seq) for seq in train_tgt_seq)

train_src_seq = pad_sequences(train_src_seq, maxlen=max_src_length, padding='post')
train_tgt_seq = pad_sequences(train_tgt_seq, maxlen=max_tgt_length, padding='post')
test_src_seq = pad_sequences(test_src_seq, maxlen=max_src_length, padding='post')
test_tgt_seq = pad_sequences(test_tgt_seq, maxlen=max_tgt_length, padding='post')

train_decoder_input = train_tgt_seq[:, :-1]
train_decoder_output = train_tgt_seq[:, 1:]
test_decoder_input = test_tgt_seq[:, :-1]
test_decoder_output = test_tgt_seq[:, 1:]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Number of samples in our subset: 10000


In [7]:
# hyperparameters
embedding_dim = 64
lstm_units = 64

# Encoder
encoder_inputs = Input(shape=(max_src_length,), name='encoder_inputs')
encoder_embedding = Embedding(src_vocab_size, embedding_dim, mask_zero=False, name='encoder_embedding')(encoder_inputs) # Change mask_zero to False
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_sequences=True, return_state=True, name='encoder_lstm', use_cudnn=False)(encoder_embedding) # Disable cuDNN
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_tgt_length - 1,), name='decoder_inputs')
decoder_embedding = Embedding(tgt_vocab_size, embedding_dim, mask_zero=False, name='decoder_embedding')(decoder_inputs) # Change mask_zero to False
decoder_lstm, _, _ = LSTM(lstm_units, return_sequences=True, return_state=True, name='decoder_lstm', use_cudnn=False)(
    decoder_embedding, initial_state=encoder_states)

# ... rest of your code ...
# Apply attention
attn_out = ScaledDotProductAttention(name='attention')([decoder_lstm, encoder_lstm, encoder_lstm])
decoder_combined_context = Concatenate(axis=-1, name='concat_layer')([decoder_lstm, attn_out])

# Final dense layer for predicting target tokens
decoder_dense = Dense(tgt_vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_combined_context)

# Define the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("Model summary:")
model.summary()

Model summary:


Part 3 (5 points): Pick any public dataset of your choice (use a small-scale dataset like a
subset of the Tatoeba or Multi30k dataset) for machine translation task. Train your
model from Part 2 for the machine translation task. Evaluate test set by reporting the
BLEU Score

In [8]:
train_decoder_output = np.expand_dims(train_decoder_output, -1)
test_decoder_output = np.expand_dims(test_decoder_output, -1)

# Train the model
history = model.fit([train_src_seq, train_decoder_input], train_decoder_output,
                    batch_size=32,
                    epochs=25,
                    validation_split=0.2)


predictions = model.predict([test_src_seq, test_decoder_input])
predicted_sequences = np.argmax(predictions, axis=-1)

# Create inverse mapping
src_index_word = {v: k for k, v in src_tokenizer.word_index.items()}
tgt_index_word = {v: k for k, v in tgt_tokenizer.word_index.items()}

def sequence_to_text(seq, tokenizer_index_word):
    words = []
    for idx in seq:
        if idx == 0:
            continue
        word = tokenizer_index_word.get(idx, '')
        if word == '<end>':
            break
        words.append(word)
    return ' '.join(words)

# Decode predictions and references
predicted_sentences = [sequence_to_text(seq, tgt_index_word) for seq in predicted_sequences]
reference_sentences = [sequence_to_text(seq, tgt_index_word) for seq in test_tgt_seq]

# Prepare lists for corpus_bleu
reference_list = [[ref.split()] for ref in reference_sentences]
hypothesis_list = [pred.split() for pred in predicted_sentences]

smoothing_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(reference_list, hypothesis_list, smoothing_function=smoothing_fn)
print("\nBLEU score on test set:", bleu_score)

print("\nExamples on test set:")
for i in range(len(test_src_seq)):
  if i < 10:
    print("Source:", sequence_to_text(test_src_seq[i], src_index_word))
    print("Reference:", sequence_to_text(test_tgt_seq[i], tgt_index_word))
    print("Prediction:", predicted_sentences[i])
    print("------")

Epoch 1/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.8562 - loss: 3.8103 - val_accuracy: 0.8982 - val_loss: 0.8034
Epoch 2/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8982 - loss: 0.7832 - val_accuracy: 0.8994 - val_loss: 0.7878
Epoch 3/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8984 - loss: 0.7606 - val_accuracy: 0.9001 - val_loss: 0.7756
Epoch 4/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8998 - loss: 0.7310 - val_accuracy: 0.9007 - val_loss: 0.7688
Epoch 5/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9017 - loss: 0.7074 - val_accuracy: 0.9023 - val_loss: 0.7612
Epoch 6/25
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9017 - loss: 0.6998 - val_accuracy: 0.9027 - val_loss: 0.7557
Epoch 7/25
[1m200/20

Part 4 (30 points): In this part you are required to implement a simplified Transformer
model from scratch (using Python and NumPy/PyTorch/TensorFlow with minimal highlevel abstractions) and apply it to a machine translation task (e.g., English-to-French or
English-to-German translation) using the same dataset from part 3.


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=64, num_heads=2):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.linear_Q = nn.Linear(d_model, d_model)
        self.linear_K = nn.Linear(d_model, d_model)
        self.linear_V = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)

        # Linear projections
        Q = self.linear_Q(Q)
        K = self.linear_K(K)
        V = self.linear_V(V)

        # Split into multiple heads and transpose
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1,2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1,2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1,2)

        # Scaled dot-product attention for each head
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = torch.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, V)

        # Concatenate heads
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        output = self.out_linear(attn_output)
        return output


In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model=64, num_heads=2, d_ff=128, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Multi-head attention sublayer
        attn_output = self.mha(x, x, x, mask)
        x = self.layernorm1(x + self.dropout(attn_output))

        # Feedforward network
        ffn_output = self.ffn(x)
        x = self.layernorm2(x + self.dropout(ffn_output))
        return x


In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model=64, num_heads=2, d_ff=128, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Masked self-attention
        attn1 = self.mha1(x, x, x, mask=tgt_mask)
        x = self.layernorm1(x + self.dropout(attn1))

        # Encoder-decoder attention
        attn2 = self.mha2(x, enc_output, enc_output, mask=src_mask)
        x = self.layernorm2(x + self.dropout(attn2))

        # Feedforward
        ffn_output = self.ffn(x)
        x = self.layernorm3(x + self.dropout(ffn_output))
        return x


In [None]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, num_heads=2, d_ff=128, num_encoder_layers=2, num_decoder_layers=2, dropout=0.1, max_len=100):
        super(Transformer, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.pos_decoder = PositionalEncoding(d_model, max_len)

        # Encoder layers
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)])

        # Decoder layers
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder_layers)])

        self.final_linear = nn.Linear(d_model, tgt_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        # Mask to ensure that position i can only attend to positions ≤ i
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask.unsqueeze(0).unsqueeze(0)

    def forward(self, src, tgt):
        # src, tgt: (batch_size, seq_len)
        src_mask = None
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        # Embedding and positional encoding
        src = self.pos_encoder(self.src_embedding(src))
        tgt = self.pos_decoder(self.tgt_embedding(tgt))

        # Encoder forward
        for layer in self.encoder_layers:
            src = layer(src, mask=src_mask)
        enc_output = src

        # Decoder forward
        x = tgt
        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_mask=src_mask, tgt_mask=tgt_mask)

        # Final linear layer and softmax
        output = self.final_linear(x)
        return output


In [None]:
class TranslationDataset(Dataset):
    def __init__(self, src_seqs, tgt_seqs):
        self.src_seqs = src_seqs
        self.tgt_seqs = tgt_seqs

    def __len__(self):
        return len(self.src_seqs)

    def __getitem__(self, idx):
        src = torch.tensor(self.src_seqs[idx], dtype=torch.long)
        tgt = torch.tensor(self.tgt_seqs[idx], dtype=torch.long)
        tgt_input = tgt[:-1]
        tgt_output = tgt[1:]
        return src, tgt_input, tgt_output

# Training and test datasets.
train_dataset = TranslationDataset(train_src_seq, train_tgt_seq)
test_dataset = TranslationDataset(test_src_seq, test_tgt_seq)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
max_len_model = max(max_src_length, max_tgt_length)
print("Using max_len_model:", max_len_model)

Using max_len_model: 71


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(src_vocab_size, tgt_vocab_size, d_model=64, num_heads=2, d_ff=128,
                    num_encoder_layers=2, num_decoder_layers=2, dropout=0.1, max_len=max_len_model).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# training loop
num_epochs = 25
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for src, tgt_input, tgt_output in train_loader:
        optimizer.zero_grad()

        src = src.to(device)
        tgt_input = tgt_input.to(device)
        tgt_output = tgt_output.to(device)

        # Forward pass
        output = model(src, tgt_input)
        # Reshape outputs
        output = output.view(-1, tgt_vocab_size)
        tgt_output = tgt_output.view(-1)

        loss = criterion(output, tgt_output)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    scheduler.step()
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.4f}")


model.eval()
smoothing_fn = SmoothingFunction().method1
bleu_scores = []
for src, tgt_input, tgt_output in test_loader:
    src = src.to(device)
    tgt_input = tgt_input.to(device)
    tgt_output = tgt_output.to(device)

    decoder_input = tgt_input[:, :1]
    max_len = max_tgt_length - 1
    for i in range(max_len):
        output = model(src, decoder_input)
        next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)
        if (next_token == 2).all():
            break

    ref = tgt_output.squeeze().tolist()
    hyp = decoder_input.squeeze().tolist()

    ref = [str(x) for x in ref]
    hyp = [str(x) for x in hyp]

    bleu = sentence_bleu([ref], hyp, smoothing_function=smoothing_fn)
    bleu_scores.append(bleu)

print("Average BLEU score on test set: {:.4f}".format(sum(bleu_scores)/len(bleu_scores)))

Epoch 1 Loss: 6.9957
Epoch 2 Loss: 5.0911
Epoch 3 Loss: 3.7838
Epoch 4 Loss: 2.9185
Epoch 5 Loss: 2.2848
Epoch 6 Loss: 1.8330
Epoch 7 Loss: 1.6045
Epoch 8 Loss: 1.4098
Epoch 9 Loss: 1.2305
Epoch 10 Loss: 1.0687
Epoch 11 Loss: 0.9275
Epoch 12 Loss: 0.8563
Epoch 13 Loss: 0.7872
Epoch 14 Loss: 0.7258
Epoch 15 Loss: 0.6656
Epoch 16 Loss: 0.6096
Epoch 17 Loss: 0.5803
Epoch 18 Loss: 0.5547
Epoch 19 Loss: 0.5271
Epoch 20 Loss: 0.5020
Epoch 21 Loss: 0.4758
Epoch 22 Loss: 0.4636
Epoch 23 Loss: 0.4516
Epoch 24 Loss: 0.4386
Epoch 25 Loss: 0.4273
Average BLEU score on test set: 0.0000
