In [62]:
# =============================================================================
# STEP 1: SETUP AND IMPORTS
# =============================================================================
# This cell installs necessary libraries, downloads the dataset, and imports modules.

!pip install torch torchtext==0.17.0 tqdm sacrebleu -q

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import unicodedata
import re
import random
from collections import Counter
from tqdm import tqdm
import math
import time
import sacrebleu

# Download and extract the English-Indonesian dataset
!wget -q http://www.manythings.org/anki/ind-eng.zip
!unzip -n -q ind-eng.zip

print("✅ Setup Complete. Dataset is ready.")


✅ Setup Complete. Dataset is ready.


In [63]:
# =============================================================================
# STEP 2: DATA PREPARATION
# =============================================================================
# This section contains all functions for loading, cleaning, and preparing the data.

# --- Define special tokens and their indices ---
SPECIALS = ["<pad>", "<bos>", "<eos>"]
PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2
UNK_IDX = None  # kita nggak pakai unk

def normalize_and_tokenize(s: str):
    """Cleans and tokenizes a string but keeps names and numbers."""
    s = s.strip()
    # Add space before punctuation
    s = re.sub(r"([.!?])", r" \1", s)
    # Replace multiple spaces with single space
    s = re.sub(r"\s+", " ", s).strip()
    return s.split()

def load_pairs(path, max_pairs=10000):
    """Loads, tokenizes, and filters sentence pairs from a file."""
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            cols = line.rstrip("\n").split("\t")
            if len(cols) < 2:
                continue
            src, tgt = cols[0], cols[1]  # English, Indonesian
            src_tokens = normalize_and_tokenize(src)
            tgt_tokens = normalize_and_tokenize(tgt)

            # --- Filter: buang kalimat aneh ---
            if not src_tokens or not tgt_tokens:
                continue
            if len(src_tokens) > 100 or len(tgt_tokens) > 100:
                continue
            if abs(len(src_tokens) - len(tgt_tokens)) > 50:
                continue

            pairs.append((src_tokens, tgt_tokens))

    random.shuffle(pairs)
    return pairs[:max_pairs]

def build_vocab(token_lists, min_freq=1):
    """Builds a vocabulary from a list of tokenized sentences."""
    counter = Counter(tok for tokens in token_lists for tok in tokens)
    vocab = {sp: i for i, sp in enumerate(SPECIALS)}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    itos = {i: w for w, i in vocab.items()}
    return vocab, itos

def to_ids(tokens, vocab):
    ids = []
    for t in tokens:
        if t in vocab:
            ids.append(vocab[t])
        else:
            # fallback: kasih <pad> (atau bisa tambahin ke vocab dinamis)
            ids.append(PAD_IDX)
    return [BOS_IDX] + ids + [EOS_IDX]

class NMTDataset(Dataset):
    """Custom PyTorch Dataset for NMT."""
    def __init__(self, pairs, src_vocab, trg_vocab):
        self.data = []
        for src, trg in pairs:
            src_ids = torch.tensor(to_ids(src, src_vocab), dtype=torch.long)
            trg_ids = torch.tensor(to_ids(trg, trg_vocab), dtype=torch.long)
            self.data.append((src_ids, trg_ids))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def collate_batch(batch):
    """Pads sequences in a batch to the same length."""
    src_list, trg_list = [], []
    for _src, _trg in batch:
        src_list.append(_src)
        trg_list.append(_trg)
    src_pad = nn.utils.rnn.pad_sequence(src_list, padding_value=PAD_IDX)
    trg_pad = nn.utils.rnn.pad_sequence(trg_list, padding_value=PAD_IDX)
    return src_pad, trg_pad

# --- Execute Data Preparation ---
pairs = load_pairs("ind.txt", max_pairs=15000)

# Split data: 80% train, 10% validation, 10% test
n_train = int(len(pairs) * 0.8)
n_val = int(len(pairs) * 0.1)
train_pairs, val_pairs, test_pairs = pairs[:n_train], pairs[n_train:n_train+n_val], pairs[n_train+n_val:]

# Build vocabularies from training data
en_vocab, en_itos = build_vocab([p[0] for p in train_pairs])
id_vocab, id_itos = build_vocab([p[1] for p in train_pairs])

# Create DataLoaders
BATCH_SIZE = 64
train_loader = DataLoader(NMTDataset(train_pairs, en_vocab, id_vocab), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(NMTDataset(val_pairs, en_vocab, id_vocab), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(NMTDataset(test_pairs, en_vocab, id_vocab), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

print(f"✅ Data prepared: {len(train_pairs)} train, {len(val_pairs)} val, {len(test_pairs)} test pairs.")
print(f"   English vocab: {len(en_vocab)} | Indonesian vocab: {len(id_vocab)}")

✅ Data prepared: 11904 train, 1488 val, 1489 test pairs.
   English vocab: 5627 | Indonesian vocab: 6904


In [64]:
# =============================================================================
# STEP 3: MODEL DEFINITIONS
# =============================================================================
# This section contains the PyTorch classes for both the RNN and Transformer models.


# -----------------------------------------------------
# 3.0 Helper untuk Transformer
# -----------------------------------------------------
def generate_square_subsequent_mask(sz: int):
    """Generate a square mask for the sequence. Masked positions are filled with -inf."""
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


# -----------------------------------------------------
# 3.1 Baseline: RNN with Bahdanau Attention
# -----------------------------------------------------
class BahdanauEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.gru = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.gru(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return outputs, hidden


class BahdanauAttention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1)


class BahdanauDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.gru = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs).unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.gru(rnn_input, hidden.unsqueeze(0))
        prediction = self.fc_out(torch.cat((output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)), dim=1))
        return prediction, hidden.squeeze(0)


class Seq2SeqRNN(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len, batch_size = trg.shape
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

    def greedy_decode(self, src, max_len=50):
        with torch.no_grad():
            encoder_outputs, hidden = self.encoder(src)
            ys = torch.ones(1, src.shape[1]).fill_(BOS_IDX).long().to(self.device)
            for _ in range(max_len - 1):
                input_t = ys[-1, :]
                output, hidden = self.decoder(input_t, hidden, encoder_outputs)
                pred_token = output.argmax(1)
                ys = torch.cat([ys, pred_token.unsqueeze(0)], dim=0)
                if (pred_token == EOS_IDX).all(): break
        return ys


# -----------------------------------------------------
# 3.2 Advanced: Transformer
# -----------------------------------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(1)   # [maxlen, 1, emb_size]

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        # token_embedding shape: [seq_len, batch_size, emb_size]
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_enc_layers, num_dec_layers, emb_size, nhead,
                 src_vocab_size, tgt_vocab_size, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead,
                                          num_encoder_layers=num_enc_layers,
                                          num_decoder_layers=num_dec_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src, tgt, src_mask, tgt_mask,
                src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src = src.to(device)
        tgt = tgt.to(device)

        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def beam_search_decode(self, src, src_mask, max_len, start_symbol, beam_size=3):
        src = src.to(device)
        src_mask = src_mask.to(device)

        memory = self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

        sequences = [[list([start_symbol]), 0.0]]

        for _ in range(max_len):
            all_candidates = []
            for seq, score in sequences:
                ys = torch.tensor(seq).unsqueeze(1).to(device)
                tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(device)
                out = self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(ys)),
                                               memory, tgt_mask)
                out = out.transpose(0, 1)
                prob = self.generator(out[:, -1])
                log_probs = torch.log_softmax(prob, dim=1)

                topk_log_probs, topk_words = torch.topk(log_probs, beam_size)

                for i in range(beam_size):
                    candidate = [seq + [topk_words[0][i].item()],
                                 score - topk_log_probs[0][i].item()]
                    all_candidates.append(candidate)

            ordered = sorted(all_candidates, key=lambda tup: tup[1])
            sequences = ordered[:beam_size]

        return sequences[0][0]

    def greedy_decode(self, src, max_len=50, start_symbol=BOS_IDX):
        src = src.to(device)
        src_mask = torch.zeros((src.size(0), src.size(0)), device=device).type(torch.bool)

        # encode source
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        memory = self.transformer.encoder(src_emb, src_key_padding_mask=None)

        # mulai dengan <bos>
        ys = torch.ones(1, src.size(1), dtype=torch.long, device=device).fill_(start_symbol)

        for _ in range(max_len-1):
            tgt_emb = self.positional_encoding(self.tgt_tok_emb(ys))
            tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(device)

            out = self.transformer.decoder(tgt_emb, memory, tgt_mask)
            out = self.generator(out)
            next_word = out[-1].argmax(dim=1)
            ys = torch.cat([ys, next_word.unsqueeze(0)], dim=0)

            if (next_word == EOS_IDX).all():
                break

        return ys

print("✅ Model classes defined.")

✅ Model classes defined.


In [65]:
# =============================================================================
# STEP 4: TRAINING & EVALUATION UTILITIES
# =============================================================================
# This section contains helper functions for training, evaluation, and decoding.

def create_mask(src, tgt, device):
    """Creates masks for the Transformer model."""
    src_seq_len, tgt_seq_len = src.shape[0], tgt.shape[0]
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_seq_len, device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def train_epoch(model, loader, optimizer, criterion, clip, is_transformer=False):
    model.train()
    epoch_loss = 0
    for src, trg in tqdm(loader, desc="Training"):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        if is_transformer:
            trg_input = trg[:-1, :]
            src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, trg_input, device)
            logits = model(src, trg_input, src_mask, tgt_mask, src_pad_mask, tgt_pad_mask, src_pad_mask)
            trg_out = trg[1:, :].reshape(-1)
            logits = logits.reshape(-1, logits.shape[-1])
        else: # RNN
            logits = model(src, trg)
            # FIX: Slice logits to match target shape, avoiding the ValueError
            trg_out = trg[1:, :].reshape(-1)
            logits = logits[1:].reshape(-1, logits.shape[-1])
        loss = criterion(logits, trg_out)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def evaluate_epoch(model, loader, criterion, is_transformer=False):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg in tqdm(loader, desc="Evaluating"):
            src, trg = src.to(device), trg.to(device)
            if is_transformer:
                trg_input = trg[:-1, :]
                src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, trg_input, device)
                logits = model(src, trg_input, src_mask, tgt_mask, src_pad_mask, tgt_pad_mask, src_pad_mask)
                trg_out = trg[1:, :].reshape(-1)
                logits = logits.reshape(-1, logits.shape[-1])
            else: # RNN
                logits = model(src, trg, teacher_forcing_ratio=0.0)
                # FIX: Slice logits to match target shape
                trg_out = trg[1:, :].reshape(-1)
                logits = logits[1:].reshape(-1, logits.shape[-1])
            loss = criterion(logits, trg_out)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

def decode_ids(ids, itos):
    """Converts a tensor of IDs back to a string."""
    tokens = []
    for tok_id in ids:
        tok = tok_id.item()
        if tok == EOS_IDX: break
        if tok not in {BOS_IDX, PAD_IDX}:
            tokens.append(itos.get(tok, "<unk>"))
    return " ".join(tokens)

def calculate_bleu(model, loader, id_itos, device):
    """Calculates SacreBLEU score for the model on a given dataset."""
    model.eval()
    hypotheses, references = [], []
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(device), trg.to(device)
            pred_ids = model.greedy_decode(src)
            for b in range(src.size(1)):
                hypotheses.append(decode_ids(pred_ids[:, b], id_itos))
                references.append([decode_ids(trg[:, b], id_itos)])
    return sacrebleu.corpus_bleu(hypotheses, references).score

print("✅ Utility functions defined.")




✅ Utility functions defined.


In [66]:
# =============================================================================
# STEP 5: MAIN EXECUTION
# =============================================================================
# This is the main block to instantiate and train the models.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🚀 Using device: {device}")

# --- Hyperparameters ---
N_EPOCHS = 20
CLIP = 1.0
LEARNING_RATE = 0.0005

# --- Train and Evaluate RNN Baseline ---
print("\n--- Training Baseline RNN + Attention ---")
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
DROPOUT = 0.5

attn_rnn = BahdanauAttention(ENC_HID_DIM, DEC_HID_DIM)
# FIX: Corrected DEC_HID_dim to DEC_HID_DIM
encoder_rnn = BahdanauEncoder(len(en_vocab), ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT)
# FIX: Corrected DEC_HID_dim to DEC_HID_DIM
decoder_rnn = BahdanauDecoder(len(id_vocab), DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT, attn_rnn)
model_rnn = Seq2SeqRNN(encoder_rnn, decoder_rnn, device).to(device)

optimizer_rnn = optim.Adam(model_rnn.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

for epoch in range(N_EPOCHS):
    train_loss = train_epoch(model_rnn, train_loader, optimizer_rnn, criterion, CLIP)
    val_loss = evaluate_epoch(model_rnn, val_loader, criterion)
    print(f"Epoch {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

# --- Train and Evaluate Transformer ---
print("\n--- Training Transformer ---")
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENC_LAYERS = 3
NUM_DEC_LAYERS = 3

model_transformer = Seq2SeqTransformer(NUM_ENC_LAYERS, NUM_DEC_LAYERS, EMB_SIZE, NHEAD,
                                       len(en_vocab), len(id_vocab), FFN_HID_DIM).to(device)
optimizer_transformer = optim.Adam(model_transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

for epoch in range(N_EPOCHS):
    train_loss = train_epoch(model_transformer, train_loader, optimizer_transformer, criterion, CLIP, is_transformer=True)
    val_loss = evaluate_epoch(model_transformer, val_loader, criterion, is_transformer=True)
    print(f"Epoch {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}")

# --- Final Evaluation ---
print("\n--- Final Evaluation on Test Set ---")
bleu_rnn = calculate_bleu(model_rnn, test_loader, id_itos, device)
bleu_transformer = calculate_bleu(model_transformer, test_loader, id_itos, device)
print(f"🏆 Final BLEU Score (RNN Baseline): {bleu_rnn:.2f}")
print(f"🏆 Final BLEU Score (Transformer): {bleu_transformer:.2f}")

# --- Show Example Translations ---
def show_examples(model, loader, en_itos, id_itos, n=3):
    print("\n--- Example Translations ---")
    model.eval()
    with torch.no_grad():
        for i, (src, trg) in enumerate(loader):
            if i >= n: break
            src, trg = src.to(device), trg.to(device)
            pred_ids = model.greedy_decode(src)
            src_text = decode_ids(src[:, 0], en_itos)
            trg_text = decode_ids(trg[:, 0], id_itos)
            pred_text = decode_ids(pred_ids[:, 0], id_itos)
            print(f"\n  SRC:  {src_text}")
            print(f"  TRG:  {trg_text}")
            print(f"  PRED: {pred_text}")

show_examples(model_transformer, test_loader, en_itos, id_itos)

🚀 Using device: cuda

--- Training Baseline RNN + Attention ---


Training: 100%|██████████| 186/186 [00:12<00:00, 15.30it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.57it/s]


Epoch 01 | Train Loss: 4.984 | Val Loss: 3.991


Training: 100%|██████████| 186/186 [00:12<00:00, 15.25it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.04it/s]


Epoch 02 | Train Loss: 3.672 | Val Loss: 3.278


Training: 100%|██████████| 186/186 [00:12<00:00, 15.17it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.92it/s]


Epoch 03 | Train Loss: 2.890 | Val Loss: 2.919


Training: 100%|██████████| 186/186 [00:12<00:00, 15.41it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.10it/s]


Epoch 04 | Train Loss: 2.331 | Val Loss: 2.676


Training: 100%|██████████| 186/186 [00:12<00:00, 15.37it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.16it/s]


Epoch 05 | Train Loss: 1.893 | Val Loss: 2.616


Training: 100%|██████████| 186/186 [00:12<00:00, 15.24it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.35it/s]


Epoch 06 | Train Loss: 1.574 | Val Loss: 2.597


Training: 100%|██████████| 186/186 [00:12<00:00, 15.24it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.15it/s]


Epoch 07 | Train Loss: 1.330 | Val Loss: 2.619


Training: 100%|██████████| 186/186 [00:12<00:00, 15.18it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.11it/s]


Epoch 08 | Train Loss: 1.162 | Val Loss: 2.616


Training: 100%|██████████| 186/186 [00:12<00:00, 15.06it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.71it/s]


Epoch 09 | Train Loss: 1.041 | Val Loss: 2.651


Training: 100%|██████████| 186/186 [00:12<00:00, 15.16it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 41.44it/s]


Epoch 10 | Train Loss: 0.916 | Val Loss: 2.693


Training: 100%|██████████| 186/186 [00:12<00:00, 15.22it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.34it/s]


Epoch 11 | Train Loss: 0.821 | Val Loss: 2.684


Training: 100%|██████████| 186/186 [00:12<00:00, 15.09it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 39.61it/s]


Epoch 12 | Train Loss: 0.750 | Val Loss: 2.710


Training: 100%|██████████| 186/186 [00:12<00:00, 15.19it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.48it/s]


Epoch 13 | Train Loss: 0.679 | Val Loss: 2.795


Training: 100%|██████████| 186/186 [00:12<00:00, 15.14it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.34it/s]


Epoch 14 | Train Loss: 0.622 | Val Loss: 2.823


Training: 100%|██████████| 186/186 [00:12<00:00, 14.98it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.57it/s]


Epoch 15 | Train Loss: 0.558 | Val Loss: 2.925


Training: 100%|██████████| 186/186 [00:12<00:00, 15.22it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.48it/s]


Epoch 16 | Train Loss: 0.518 | Val Loss: 2.939


Training: 100%|██████████| 186/186 [00:12<00:00, 15.10it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.51it/s]


Epoch 17 | Train Loss: 0.481 | Val Loss: 2.941


Training: 100%|██████████| 186/186 [00:12<00:00, 14.99it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.56it/s]


Epoch 18 | Train Loss: 0.441 | Val Loss: 3.039


Training: 100%|██████████| 186/186 [00:12<00:00, 15.21it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.23it/s]


Epoch 19 | Train Loss: 0.403 | Val Loss: 3.078


Training: 100%|██████████| 186/186 [00:12<00:00, 15.23it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 40.55it/s]


Epoch 20 | Train Loss: 0.383 | Val Loss: 3.073

--- Training Transformer ---


Training: 100%|██████████| 186/186 [00:06<00:00, 29.31it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 105.80it/s]


Epoch 01 | Train Loss: 5.270 | Val Loss: 4.351


Training: 100%|██████████| 186/186 [00:06<00:00, 29.23it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.05it/s]


Epoch 02 | Train Loss: 4.327 | Val Loss: 3.867


Training: 100%|██████████| 186/186 [00:06<00:00, 29.29it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.48it/s]


Epoch 03 | Train Loss: 3.939 | Val Loss: 3.563


Training: 100%|██████████| 186/186 [00:06<00:00, 29.25it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 111.90it/s]


Epoch 04 | Train Loss: 3.638 | Val Loss: 3.336


Training: 100%|██████████| 186/186 [00:06<00:00, 29.27it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.28it/s]


Epoch 05 | Train Loss: 3.398 | Val Loss: 3.174


Training: 100%|██████████| 186/186 [00:06<00:00, 29.30it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.89it/s]


Epoch 06 | Train Loss: 3.185 | Val Loss: 3.033


Training: 100%|██████████| 186/186 [00:06<00:00, 29.36it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.02it/s]


Epoch 07 | Train Loss: 3.000 | Val Loss: 2.911


Training: 100%|██████████| 186/186 [00:06<00:00, 29.10it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.52it/s]


Epoch 08 | Train Loss: 2.823 | Val Loss: 2.811


Training: 100%|██████████| 186/186 [00:06<00:00, 29.22it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.29it/s]


Epoch 09 | Train Loss: 2.664 | Val Loss: 2.709


Training: 100%|██████████| 186/186 [00:06<00:00, 29.25it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.15it/s]


Epoch 10 | Train Loss: 2.518 | Val Loss: 2.636


Training: 100%|██████████| 186/186 [00:06<00:00, 29.39it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.23it/s]


Epoch 11 | Train Loss: 2.378 | Val Loss: 2.558


Training: 100%|██████████| 186/186 [00:06<00:00, 29.19it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.64it/s]


Epoch 12 | Train Loss: 2.250 | Val Loss: 2.525


Training: 100%|██████████| 186/186 [00:06<00:00, 29.33it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.24it/s]


Epoch 13 | Train Loss: 2.122 | Val Loss: 2.438


Training: 100%|██████████| 186/186 [00:06<00:00, 29.23it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 111.67it/s]


Epoch 14 | Train Loss: 2.003 | Val Loss: 2.396


Training: 100%|██████████| 186/186 [00:06<00:00, 29.30it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.27it/s]


Epoch 15 | Train Loss: 1.897 | Val Loss: 2.353


Training: 100%|██████████| 186/186 [00:06<00:00, 29.44it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 111.76it/s]


Epoch 16 | Train Loss: 1.793 | Val Loss: 2.320


Training: 100%|██████████| 186/186 [00:06<00:00, 29.25it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 113.07it/s]


Epoch 17 | Train Loss: 1.685 | Val Loss: 2.283


Training: 100%|██████████| 186/186 [00:06<00:00, 29.24it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 111.57it/s]


Epoch 18 | Train Loss: 1.598 | Val Loss: 2.241


Training: 100%|██████████| 186/186 [00:06<00:00, 29.27it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 112.79it/s]


Epoch 19 | Train Loss: 1.498 | Val Loss: 2.239


Training: 100%|██████████| 186/186 [00:06<00:00, 29.34it/s]
Evaluating: 100%|██████████| 24/24 [00:00<00:00, 111.36it/s]


Epoch 20 | Train Loss: 1.413 | Val Loss: 2.204

--- Final Evaluation on Test Set ---
🏆 Final BLEU Score (RNN Baseline): 37.99
🏆 Final BLEU Score (Transformer): 16.99

--- Example Translations ---

  SRC:  There's no grass on the moon .
  TRG:  Tidak ada rumput di bulan .
  PRED: Tidak ada buku sama sekali tidak ada buku untuk menonton TV sama sekali .

  SRC:  Tom said doing that would be a good idea .
  TRG:  Tom bilang, hal itu merupakan ide bagus
  PRED: Tom berkata bahwa dia akan melakukan itu dengan baik .

  SRC:  They went surfing .
  TRG:  Mereka pergi berselancar .
  PRED: Mereka pulang lebih tua di dekat dingin .


In [69]:
# =============================================================================
# STEP 6: Interactive Translation
# =============================================================================

def translate_interactive(sentence, model, src_vocab, trg_itos, device, is_transformer=False):
    model.eval()
    
    # Pre-process the input sentence
    src_tokens = normalize_and_tokenize(sentence)
    
    # Convert tokens to IDs
    src_ids = torch.tensor(to_ids(src_tokens, src_vocab), dtype=torch.long)
    src_tensor = src_ids.unsqueeze(1).to(device)

    # Perform greedy decode using the model's built-in method
    with torch.no_grad():
        if is_transformer:
            # For Transformer, we need to pass a different set of arguments for decoding
            pred_ids = model.greedy_decode(src_tensor)
        else: # RNN
            pred_ids = model.greedy_decode(src_tensor)
    
    # Decode the output IDs back to text
    translated_text = decode_ids(pred_ids[:, 0], trg_itos)
    return translated_text

# --- Interactive Test ---
print("\n--- Interactive Test ---")

kalimat_tes_1 = "I can do that."
kalimat_tes_2 = "You must do it."

# Menggunakan model Transformer
terjemahan_transformer_1 = translate_interactive(kalimat_tes_1, model_transformer, en_vocab, id_itos, device, is_transformer=True)
terjemahan_transformer_2 = translate_interactive(kalimat_tes_2, model_transformer, en_vocab, id_itos, device, is_transformer=True)

print(f"\nModel: Transformer")
print(f"English: {kalimat_tes_1}")
print(f"Indonesian: {terjemahan_transformer_1}")

print(f"\nEnglish: {kalimat_tes_2}")
print(f"Indonesian: {terjemahan_transformer_2}")

# Menggunakan model RNN (sebagai perbandingan)
terjemahan_rnn_1 = translate_interactive(kalimat_tes_1, model_rnn, en_vocab, id_itos, device)
print(f"\nModel: RNN Baseline")
print(f"English: {kalimat_tes_1}")
print(f"Indonesian: {terjemahan_rnn_1}")


--- Interactive Test ---

Model: Transformer
English: I can do that.
Indonesian: Aku bisa melakukan itu .

English: You must do it.
Indonesian: Kamu harus melakukannya .

Model: RNN Baseline
English: I can do that.
Indonesian: Aku bisa melakukannya .
