<a href="https://colab.research.google.com/github/CDFire/ProjectsInAI-ML/blob/main/HW5/ProjectsInAIML_HW5_Task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Part 1

In [28]:
import numpy as np
import pandas as pd
import math

In [26]:
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = np.dot(Q, K.T)
    scaled_scores = scores / np.sqrt(d_k)
    attention_weights = softmax(scaled_scores)
    output = np.dot(attention_weights, V)

    return output, attention_weights

#Part 2

In [20]:
def rnn_cell(x, h_prev, W_x, W_h, b):
    return np.tanh(np.dot(x, W_x) + np.dot(h_prev, W_h) + b)

In [234]:
class Encoder:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim
        self.W_x = np.random.randn(input_dim, hidden_dim)
        self.W_h = np.random.randn(hidden_dim, hidden_dim)
        self.b = np.zeros(hidden_dim)

    def forward(self, inputs):
        T = inputs.shape[0]
        h = np.zeros(self.hidden_dim)
        hidden_states = []
        for t in range(T):
            h = rnn_cell(inputs[t], h, self.W_x, self.W_h, self.b)
            hidden_states.append(h)
        hidden_states = np.stack(hidden_states, axis=0)
        return hidden_states

    def forward_with_attention(self, inputs):
        encoder_hidden = self.forward(inputs)
        context, attn_weights = scaled_dot_product_attention(encoder_hidden, encoder_hidden, encoder_hidden)
        final_context = np.mean(context, axis=0)
        return final_context, attn_weights, encoder_hidden


In [235]:
class Decoder:
    def __init__(self, emb_dim, hidden_dim, output_dim):
        self.hidden_dim = hidden_dim
        self.emb_dim = emb_dim
        self.output_dim = output_dim

        self.W_x = np.random.randn(emb_dim, hidden_dim)
        self.W_h = np.random.randn(hidden_dim, hidden_dim)
        self.b = np.zeros(hidden_dim)

        self.W_out = np.random.randn(hidden_dim, output_dim)
        self.b_out = np.zeros(output_dim)

    def forward(self, initial_state, inputs):
        T = inputs.shape[0]
        h = initial_state
        logits_seq = []
        for t in range(T):
            h = rnn_cell(inputs[t], h, self.W_x, self.W_h, self.b)
            logits = np.dot(h, self.W_out) + self.b_out
            logits_seq.append(logits)
        return np.stack(logits_seq, axis=0), h

#Part 3

In [183]:
data_df = pd.read_csv("Sentence pairs in English-French - 2025-03-19.tsv", sep="\t", header=None,
                        names=["src_id", "english", "tgt_id", "french"])
data_df = data_df.sample(n=1000).reset_index(drop=True)
print("Columns in TSV file:", data_df.columns)

Columns in TSV file: Index(['src_id', 'english', 'tgt_id', 'french'], dtype='object')


In [184]:
data = list(zip(data_df['english'], data_df['french']))

In [185]:
split_index = int(0.9 * len(data))
train_data = data[:split_index]
test_data = data[split_index:]

In [186]:
def build_vocab(sentences):
    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    idx = 4
    for sent in sentences:
        for word in sent.split():
            if word not in vocab:
                vocab[word] = idx
                idx += 1
    return vocab

In [187]:
src_sentences = [pair[0] for pair in train_data]
tgt_sentences = [pair[1] for pair in train_data]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)

inv_tgt_vocab = {i: w for w, i in tgt_vocab.items()}

In [188]:
emb_dim = 8
hidden_dim = 16
lr = 0.001
num_epochs = 3000

src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)

E_src = np.random.randn(src_vocab_size, emb_dim)
E_tgt = np.random.randn(tgt_vocab_size, emb_dim)

In [189]:
def tokenize(sentence, vocab, add_special_tokens=False):
    tokens = sentence.split()
    if add_special_tokens:
        tokens = ["<sos>"] + tokens + ["<eos>"]
    return [vocab.get(w, vocab["<unk>"]) for w in tokens]

def detokenize(token_ids, inv_vocab):
    words = []
    for tid in token_ids:
        word = inv_vocab.get(tid, "<unk>")
        if word in ["<sos>", "<eos>", "<pad>"]:
            continue
        words.append(word)
    return " ".join(words)

In [190]:
def cross_entropy_loss(logits, target_idx):
    probs = softmax(logits)
    loss = -np.log(probs[target_idx] + 1e-9)
    return loss, probs

In [191]:
def update_output_layer(decoder, h, logits, target_idx, lr):
    probs = softmax(logits)
    grad_logits = probs.copy()
    grad_logits[target_idx] -= 1
    grad_W_out = np.outer(h, grad_logits)
    grad_b_out = grad_logits
    decoder.W_out -= lr * grad_W_out
    decoder.b_out -= lr * grad_b_out

In [192]:
encoder = Encoder(emb_dim, hidden_dim)
decoder = Decoder(emb_dim, hidden_dim, tgt_vocab_size)

In [193]:
print("Starting training...")

best_loss = float('inf')
patience = 10
lr_decay = 0.5
patience_counter = 0

for epoch in range(num_epochs):
    total_loss = 0.0
    for src_sent, tgt_sent in train_data:
        src_indices = tokenize(src_sent, src_vocab, add_special_tokens=False)
        tgt_indices = tokenize(tgt_sent, tgt_vocab, add_special_tokens=True)

        src_embeds = np.array([E_src[idx] for idx in src_indices])
        tgt_embeds = np.array([E_tgt[idx] for idx in tgt_indices])

        context, attn_weights, encoder_hidden = encoder.forward_with_attention(src_embeds)

        logits_seq, last_h = decoder.forward(context, tgt_embeds[:-1])

        example_loss = 0.0
        T_dec = logits_seq.shape[0]
        for t in range(T_dec):
            loss, _ = cross_entropy_loss(logits_seq[t], tgt_indices[t+1])
            example_loss += loss
            update_output_layer(decoder, last_h, logits_seq[t], tgt_indices[t+1], lr)
        total_loss += example_loss

    if total_loss < best_loss:
        best_loss = total_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            lr *= lr_decay
            if(lr < 1e-6):
                break
            print(f"Epoch {epoch+1}: Loss did not improve for {patience} epochs. Reducing learning rate to {lr:.6f}.")
            patience_counter = 0

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Starting training...
Epoch 10/3000, Loss: 90469.1265
Epoch 20/3000, Loss: 86848.3948
Epoch 30/3000, Loss: 84440.5693
Epoch 40/3000, Loss: 83000.1977
Epoch 50/3000, Loss: 82237.7232
Epoch 60/3000, Loss: 81844.4776
Epoch 70/3000, Loss: 81707.9366
Epoch 80/3000, Loss: 81775.5020
Epoch 81: Loss did not improve for 10 epochs. Reducing learning rate to 0.000500.
Epoch 90/3000, Loss: 81888.5729
Epoch 91: Loss did not improve for 10 epochs. Reducing learning rate to 0.000250.
Epoch 100/3000, Loss: 81963.5946
Epoch 101: Loss did not improve for 10 epochs. Reducing learning rate to 0.000125.
Epoch 110/3000, Loss: 82005.4445
Epoch 111: Loss did not improve for 10 epochs. Reducing learning rate to 0.000063.
Epoch 120/3000, Loss: 82027.4205
Epoch 121: Loss did not improve for 10 epochs. Reducing learning rate to 0.000031.
Epoch 130/3000, Loss: 82038.6672
Epoch 131: Loss did not improve for 10 epochs. Reducing learning rate to 0.000016.
Epoch 140/3000, Loss: 82044.3547
Epoch 141: Loss did not improv

In [194]:
def greedy_decode(encoder, decoder, src_sentence, max_len=10):
    src_indices = tokenize(src_sentence, src_vocab, add_special_tokens=False)
    src_embeds = np.array([E_src[idx] for idx in src_indices])
    context, _, _ = encoder.forward_with_attention(src_embeds)
    current_token = tgt_vocab["<sos>"]
    decoded_tokens = []
    h = context
    for _ in range(max_len):
        token_embed = E_tgt[current_token]
        h = rnn_cell(token_embed, h, decoder.W_x, decoder.W_h, decoder.b)
        logits = np.dot(h, decoder.W_out) + decoder.b_out
        probs = softmax(logits)
        current_token = np.argmax(probs)
        if current_token == tgt_vocab["<eos>"]:
            break
        decoded_tokens.append(current_token)
    return detokenize(decoded_tokens, inv_tgt_vocab)


In [195]:
def compute_bleu(reference, candidate):
    ref_tokens = reference.split()
    cand_tokens = candidate.split()

    if len(cand_tokens) == 0:
        return 0.0

    ref_counts = {}
    for token in ref_tokens:
        ref_counts[token] = ref_counts.get(token, 0) + 1
    cand_counts = {}
    for token in cand_tokens:
        cand_counts[token] = cand_counts.get(token, 0) + 1

    overlap = 0
    for token in cand_counts:
        overlap += min(cand_counts[token], ref_counts.get(token, 0))
    precision = overlap / len(cand_tokens)

    bp = 1.0 if len(cand_tokens) >= len(ref_tokens) else math.exp(1 - len(ref_tokens) / len(cand_tokens))
    bleu = bp * precision
    return bleu

In [196]:
bleu_scores = []
print("\n--- Test Set Evaluation ---")
for src_sent, ref_sent in test_data:
    pred_sent = greedy_decode(encoder, decoder, src_sent)
    bleu = compute_bleu(ref_sent, pred_sent)
    bleu_scores.append(bleu)
    print(f"Source: {src_sent}")
    print(f"Reference: {ref_sent}")
    print(f"Prediction: {pred_sent}")
    print(f"BLEU-1: {bleu:.4f}\n")


--- Test Set Evaluation ---
Source: Do you want to play a game?
Reference: Voulez-vous jouer à un jeu ?
Prediction: 
BLEU-1: 0.0000

Source: I think we've met before.
Reference: Je pense que nous nous sommes déjà rencontrés.
Prediction: 
BLEU-1: 0.0000

Source: Do you remember your grandfather?
Reference: Vous souvenez-vous de votre grand-père ?
Prediction: sexuellement à composantes à ne à mouches de la n'aurais
BLEU-1: 0.1000

Source: She wrapped herself in a wool blanket.
Reference: Elle s'est enroulée dans une couverture en laine.
Prediction: que Tom que Nous pas regardait
BLEU-1: 0.0000

Source: It's a custom to celebrate Christmas.
Reference: C'est une coutume de célébrer Noël.
Prediction: 
BLEU-1: 0.0000

Source: The price doesn't matter.
Reference: Le prix n'est pas important.
Prediction: que Tom que a pas de
BLEU-1: 0.1667

Source: Your mother is worried sick about you.
Reference: Ta mère est morte d'inquiétude à ton sujet.
Prediction: 
BLEU-1: 0.0000

Source: Would you pardo

In [197]:
avg_bleu = np.mean(bleu_scores)
print(f"Average BLEU-1 Score on Test Set: {avg_bleu:.4f}")

Average BLEU-1 Score on Test Set: 0.0148


#Part 4

In [278]:
import torch
import torch.nn as nn
import torch.optim as optim

In [279]:
def build_vocab(sentences):
    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    idx = 4
    for sent in sentences:
        for word in sent.split():
            if word not in vocab:
                vocab[word] = idx
                idx += 1
    return vocab

In [280]:
data_df = pd.read_csv("Sentence pairs in English-French - 2025-03-19.tsv", sep="\t", header=None,
                        names=["src_id", "english", "tgt_id", "french"])
data_df = data_df.sample(n=10000).reset_index(drop=True)
data = list(zip(data_df['english'], data_df['french']))
split_index = int(0.9 * len(data))
train_data = data[:split_index]
test_data = data[split_index:]

In [281]:
src_sentences = [pair[0] for pair in train_data]
tgt_sentences = [pair[1] for pair in train_data]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)

inv_tgt_vocab = {i: w for w, i in tgt_vocab.items()}

In [282]:
def tokenize(sentence, vocab):
    return [vocab.get(tok, vocab["<unk>"]) for tok in sentence.strip().split()]

src_data = [tokenize(sent, src_vocab) for sent in src_sentences]
tgt_data = [tokenize(sent, tgt_vocab) for sent in tgt_sentences]

train_size = int(len(src_data)*0.8)
src_train, src_val = src_data[:train_size], src_data[train_size:]
tgt_train, tgt_val = tgt_data[:train_size], tgt_data[train_size:]

In [283]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x

In [284]:
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))

    attn = torch.softmax(scores, dim=-1)
    output = torch.matmul(attn, V)
    return output, attn

In [285]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=64, n_heads=2):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)

        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)

        Q = Q.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1)

        attn_output, attn = scaled_dot_product_attention(Q, K, V, mask=mask)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.d_model)

        output = self.W_o(attn_output)

        return output, attn

In [286]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model=64, dim_ff=128):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, dim_ff)
        self.fc2 = nn.Linear(dim_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [287]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model=64, n_heads=2, dim_ff=128):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = PositionwiseFeedForward(d_model, dim_ff)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_mask=None):
        # x shape: (batch_size, src_len, d_model)

        # 1) Self-attention
        attn_output, _ = self.self_attn(x, x, x, mask=src_mask)
        x = x + attn_output  # Residual
        x = self.norm1(x)    # Layer norm

        # 2) Feed-forward
        ff_output = self.ff(x)
        x = x + ff_output
        x = self.norm2(x)

        return x

In [288]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model=64, n_heads=2, dim_ff=128):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.cross_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = PositionwiseFeedForward(d_model, dim_ff)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, tgt_mask=None, src_mask=None):
        # 1) Masked self-attention in decoder
        _x, _ = self.self_attn(x, x, x, mask=tgt_mask)
        x = x + _x
        x = self.norm1(x)

        # 2) Cross-attention: Q=decoder states, K=encoder outputs, V=encoder outputs
        _x, attn = self.cross_attn(x, enc_out, enc_out, mask=src_mask)  # typically no mask for cross-attn unless padding
        x = x + _x
        x = self.norm2(x)

        # 3) Position-wise feed-forward
        _x = self.ff(x)
        x = x + _x
        x = self.norm3(x)

        return x, attn

In [289]:
class Encoder(nn.Module):
    def __init__(self, src_vocab_size, d_model=64, n_heads=2, dim_ff=128, num_layers=2, max_len=100):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(src_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, dim_ff) for _ in range(num_layers)
        ])

    def forward(self, src, src_mask=None):
        # src shape: (batch_size, src_len)
        x = self.embedding(src) * np.sqrt(self.d_model)
        x = self.pos_encoding(x)

        for layer in self.layers:
            x = layer(x, src_mask)
        return x


class Decoder(nn.Module):
    def __init__(self, tgt_vocab_size, d_model=64, n_heads=2, dim_ff=128, num_layers=2, max_len=100):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_heads, dim_ff) for _ in range(num_layers)
        ])

    def forward(self, tgt, enc_out, tgt_mask=None, src_mask=None):
        # tgt shape: (batch_size, tgt_len)
        x = self.embedding(tgt) * np.sqrt(self.d_model)
        x = self.pos_encoding(x)

        attn_weights = None
        for layer in self.layers:
            x, attn_weights = layer(x, enc_out, tgt_mask, src_mask)
        return x, attn_weights

In [290]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=64,
        n_heads=2,
        dim_ff=128,
        num_enc_layers=2,
        num_dec_layers=2,
        max_len=100
    ):
        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size, d_model, n_heads, dim_ff, num_enc_layers, max_len
        )
        self.decoder = Decoder(
            tgt_vocab_size, d_model, n_heads, dim_ff, num_dec_layers, max_len
        )
        self.output_projection = nn.Linear(d_model, tgt_vocab_size)

    def generate_src_mask(self, src):
        pad_token = 0
        mask = (src != pad_token).unsqueeze(1)
        return mask

    def generate_tgt_mask(self, tgt):
        batch_size, tgt_len = tgt.size()
        pad_token = 0

        padding_mask = (tgt != pad_token).unsqueeze(1)

        subsequent_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        subsequent_mask = subsequent_mask.unsqueeze(0)

        combined_mask = padding_mask & subsequent_mask
        return combined_mask

    def forward(self, src, tgt):
        src_mask = self.generate_src_mask(src)
        tgt_mask = self.generate_tgt_mask(tgt)

        enc_out = self.encoder(src, src_mask=src_mask)
        dec_out, attn_weights = self.decoder(tgt, enc_out, tgt_mask=tgt_mask, src_mask=src_mask)

        logits = self.output_projection(dec_out)
        return logits, attn_weights


In [291]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SRC_VOCAB_SIZE = len(src_vocab)
TGT_VOCAB_SIZE = len(tgt_vocab)
BATCH_SIZE = 32
EPOCHS = 30
MAX_LEN = 50

model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=64,
    n_heads=2,
    dim_ff=128,
    num_enc_layers=2,
    num_dec_layers=2,
    max_len=MAX_LEN
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


def pad_batch(sentences, pad_id=0, max_len=MAX_LEN):
    batch_size = len(sentences)
    padded = torch.full((batch_size, max_len), pad_id, dtype=torch.long)
    for i, seq in enumerate(sentences):
        length = min(len(seq), max_len)
        padded[i, :length] = torch.tensor(seq[:length], dtype=torch.long)
    return padded

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for i in range(0, len(src_train), BATCH_SIZE):
        src_batch = src_train[i:i+BATCH_SIZE]
        tgt_batch = tgt_train[i:i+BATCH_SIZE]

        if len(src_batch) < BATCH_SIZE:
            break

        tgt_in = [[1] + seq for seq in tgt_batch]
        tgt_out = [seq + [2] for seq in tgt_batch]

        src_tensor = pad_batch(src_batch).to(device)
        tgt_in_tensor = pad_batch(tgt_in).to(device)
        tgt_out_tensor = pad_batch(tgt_out).to(device)

        optimizer.zero_grad()

        logits, _ = model(src_tensor, tgt_in_tensor)

        logits = logits.view(-1, TGT_VOCAB_SIZE)
        tgt_out_tensor = tgt_out_tensor.view(-1)

        loss = criterion(logits, tgt_out_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / (len(src_train)//BATCH_SIZE)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

Epoch 1/30, Loss: 6.6931
Epoch 2/30, Loss: 5.3778
Epoch 3/30, Loss: 4.7565
Epoch 4/30, Loss: 4.2434
Epoch 5/30, Loss: 3.7912
Epoch 6/30, Loss: 3.3700
Epoch 7/30, Loss: 2.9655
Epoch 8/30, Loss: 2.5912
Epoch 9/30, Loss: 2.2466
Epoch 10/30, Loss: 1.9417
Epoch 11/30, Loss: 1.6806
Epoch 12/30, Loss: 1.4654
Epoch 13/30, Loss: 1.2867
Epoch 14/30, Loss: 1.1404
Epoch 15/30, Loss: 1.0112
Epoch 16/30, Loss: 0.8983
Epoch 17/30, Loss: 0.8050
Epoch 18/30, Loss: 0.7114
Epoch 19/30, Loss: 0.6294
Epoch 20/30, Loss: 0.5677
Epoch 21/30, Loss: 0.5036
Epoch 22/30, Loss: 0.4639
Epoch 23/30, Loss: 0.4178
Epoch 24/30, Loss: 0.3665
Epoch 25/30, Loss: 0.3357
Epoch 26/30, Loss: 0.3140
Epoch 27/30, Loss: 0.2892
Epoch 28/30, Loss: 0.2662
Epoch 29/30, Loss: 0.2557
Epoch 30/30, Loss: 0.2347


In [293]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [295]:
def greedy_decode(model, src, max_len=50, start_token=1, end_token=2):
    model.eval()

    batch_size = src.size(0)
    tgt_generated = torch.full((batch_size, 1), start_token, dtype=torch.long, device=src.device)

    for _ in range(max_len):
        logits, _ = model(src, tgt_generated)
        next_token_logits = logits[:, -1, :]
        next_tokens = next_token_logits.argmax(dim=-1, keepdim=True)

        tgt_generated = torch.cat([tgt_generated, next_tokens], dim=1)

    return tgt_generated.tolist()

    if count == 0:
        return 0.0

    return total_bleu / count

In [298]:
def compute_bleu(model, src_val, tgt_val, batch_size=32, max_len=50):
    model.eval()
    smoothie = SmoothingFunction().method1

    total_bleu = 0.0
    count = 0

    with torch.no_grad():
        for i in range(0, len(src_val), batch_size):
            src_batch = src_val[i : i + batch_size]
            ref_batch = tgt_val[i : i + batch_size]

            if len(src_batch) < 1:
                break

            src_tensor = pad_batch(src_batch).to(device)

            generated_batch = greedy_decode(model, src_tensor, max_len=max_len)

            for gen_ids, ref_ids in zip(generated_batch, ref_batch):

                if 2 in gen_ids:
                    eos_idx = gen_ids.index(2)
                    gen_ids = gen_ids[1:eos_idx]
                else:
                    gen_ids = gen_ids[1:]

                if 2 in ref_ids:
                    ref_eos_idx = ref_ids.index(2)
                    ref_ids = ref_ids[:ref_eos_idx]
                if 1 in ref_ids:
                    ref_ids = ref_ids[1:]

                reference = [ref_ids]
                hypothesis = gen_ids

                bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=smoothie)
                total_bleu += bleu_score
                count += 1

    return total_bleu / count if count > 0 else 0.0

In [299]:
val_bleu = compute_bleu(model, src_val, tgt_val, batch_size=BATCH_SIZE, max_len=MAX_LEN)
print(f"Validation BLEU score: {val_bleu:.4f}")

Validation BLEU score: 0.0480
