In [60]:
!ls -l /kaggle/input/translate6

total 60
-rw-r--r-- 1 nobody nogroup  3513 Aug 25 00:48 analisis.py
-rw-r--r-- 1 nobody nogroup  1109 Aug 25 00:48 attention.py
-rw-r--r-- 1 nobody nogroup  1521 Aug 25 00:48 decoder.py
-rw-r--r-- 1 nobody nogroup   908 Aug 25 00:48 encoder.py
-rw-r--r-- 1 nobody nogroup  9298 Aug 25 00:48 eval.py
-rw-r--r-- 1 nobody nogroup  1738 Aug 25 00:48 heatmap.py
-rw-r--r-- 1 nobody nogroup 11924 Aug 25 00:48 main.py
-rw-r--r-- 1 nobody nogroup  3754 Aug 25 00:48 seq2seq.py
-rw-r--r-- 1 nobody nogroup  2035 Aug 25 00:48 top_words.py
-rw-r--r-- 1 nobody nogroup  5281 Aug 25 00:48 util.py


In [61]:
import unicodedata
from collections import Counter
from pathlib import Path
import argparse
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import math
import sacrebleu
import sys
import os
import csv # Tambahkan import ini untuk menyimpan riwayat CSV

# Definisikan konstanta global di luar fungsi
SPECIALS = ["<pad>", "<bos>", "<eos>", "<unk>"]
PAD, BOS, EOS, UNK = range(4)

# --- Class Attention, Encoder, Decoder (sudah ada) ---
class BahdanauAttentionQKV(nn.Module):
    def __init__(self, hidden_size, query_size, key_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.Wa = nn.Linear(query_size, hidden_size)
        self.Wk = nn.Linear(key_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, query, keys, mask=None):
        query = self.Wa(query)
        keys = self.Wk(keys)
        scores = self.V(torch.tanh(query + keys))
        scores = scores.squeeze(-1)
        if mask is not None:
            scores.masked_fill_(mask, -float("inf"))
        return scores

class BahdanauEncoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout_p=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=PAD)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return outputs, hidden

class BahdanauDecoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, attention, dropout_p=0.1):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim, padding_idx=PAD)
        self.rnn = nn.GRU(embedding_dim + encoder_hidden_dim * 2, decoder_hidden_dim)
        self.fc_out = nn.Linear(embedding_dim + decoder_hidden_dim + encoder_hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(p=dropout_p)
        
    def forward(self, x, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(x))
        encoder_outputs_T = encoder_outputs.transpose(0, 1)
        query_for_attn = hidden.unsqueeze(1)
        attn_weights = self.attention(query_for_attn, encoder_outputs_T)
        attn_weights = F.softmax(attn_weights, dim=1)
        weighted_context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs_T).squeeze(1)
        rnn_input = torch.cat((embedded.squeeze(0), weighted_context), dim=1).unsqueeze(0)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        output = output.squeeze(0)
        embedded = embedded.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted_context, embedded), dim=1))
        return prediction.unsqueeze(0), hidden.squeeze(0), attn_weights

# ---- Class Model Seq2Seq dengan Beam Search (sudah ada) ----
class BahdanauSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, pad_id, bos_id, eos_id):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.pad_id = pad_id
        self.bos_id = bos_id
        self.eos_id = eos_id

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len, batch_size = trg.size()
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        trg_input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(trg_input.unsqueeze(0), hidden, encoder_outputs)
            outputs[t] = output.squeeze(0)
            
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(2).squeeze(0)
            trg_input = trg[t] if teacher_force else top1
            
        return outputs, None

    def greedy_decode(self, src, max_len=40):
        batch_size = src.size(1)
        encoder_outputs, hidden = self.encoder(src)
        ys = torch.ones(1, batch_size, dtype=torch.long).fill_(self.bos_id).to(self.device)
        
        for _ in range(max_len - 1):
            y_tm1 = ys[-1].unsqueeze(0)
            output, hidden, _ = self.decoder(y_tm1, hidden, encoder_outputs)
            
            next_word_id = output.argmax(2)
            
            ys = torch.cat([ys, next_word_id], dim=0)

        return ys, None

    def beam_search_decode(self, src, max_len=40, beam_size=3):
        batch_size = src.size(1)
        encoder_outputs, hidden = self.encoder(src)
        
        hypotheses = torch.ones(1, batch_size, beam_size, dtype=torch.long).fill_(self.bos_id).to(self.device)
        hyp_scores = torch.zeros(batch_size, beam_size).to(self.device)
        
        hidden_beams = hidden.unsqueeze(1).repeat(1, beam_size, 1)
        hidden_beams = hidden_beams.view(batch_size * beam_size, -1)
        
        encoder_outputs_beams = encoder_outputs.transpose(0, 1).unsqueeze(1).repeat(1, beam_size, 1, 1).view(batch_size * beam_size, encoder_outputs.size(0), -1).transpose(0,1)

        for _ in range(max_len - 1):
            last_tokens = hypotheses[-1].view(-1, 1).transpose(0,1)
            output, hidden_beams, _ = self.decoder(last_tokens, hidden_beams.view(batch_size * beam_size, -1), encoder_outputs_beams)
            output = output.transpose(0, 1)
            output = F.log_softmax(output, dim=-1)
            
            cand_scores = hyp_scores.unsqueeze(2) + output.view(batch_size, beam_size, -1)
            
            cand_scores, cand_indices = cand_scores.view(batch_size, -1).topk(beam_size, dim=-1)
            
            hyp_scores = cand_scores
            
            prev_hyp_indices = cand_indices // self.decoder.output_dim
            new_token_indices = cand_indices % self.decoder.output_dim
            
            new_hypotheses = torch.zeros(hypotheses.size(0) + 1, batch_size, beam_size, dtype=torch.long).to(self.device)
            for i in range(hypotheses.size(0)):
                new_hypotheses[i] = torch.gather(hypotheses[i], 1, prev_hyp_indices)
            new_hypotheses[-1] = new_token_indices
            
            hypotheses = new_hypotheses
            
            eos_mask = (new_token_indices == self.eos_id)
            if eos_mask.all():
                break

        best_hyp_indices = hyp_scores.argmax(dim=1)
        final_hypotheses = torch.zeros(max_len, batch_size, dtype=torch.long).to(self.device)

        for b in range(batch_size):
            best_hyp = hypotheses[:, b, best_hyp_indices[b]]
            final_hypotheses[:len(best_hyp), b] = best_hyp
            final_hypotheses[len(best_hyp):, b] = self.pad_id

        return final_hypotheses.to(self.device)

# ---- Helper functions for data processing and evaluation (sudah ada) ----
def normalize(text):
    return unicodedata.normalize("NFKC", text.lower().strip())

def to_ids(tokens, vocab, unk_id=3, bos_id=1, eos_id=2):
    ids = [bos_id]
    for tok in tokens:
        ids.append(vocab.get(tok, unk_id))
    ids.append(eos_id)
    return ids

def decode_ids(ids, itos, bos_id=1, eos_id=2):
    tokens = []
    for i in ids:
        if i.item() == eos_id:
            break
        if i.item() != bos_id:
            tokens.append(itos[i.item()])
    return " ".join(tokens)

def collate_batch(batch):
    src_list, trg_list = [], []
    for src, trg in batch:
        src_list.append(src)
        trg_list.append(trg)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_list, batch_first=True, padding_value=PAD)
    trg_padded = torch.nn.utils.rnn.pad_sequence(trg_list, batch_first=True, padding_value=PAD)
    return src_padded, trg_padded

def load_pairs(file_path, max_len=20, max_pairs=None):
    pairs = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if max_pairs and i >= max_pairs: break
            parts = line.strip().split('\t')
            src, trg = normalize(parts[0]).split(' '), normalize(parts[1]).split(' ')
            if len(src) < max_len and len(trg) < max_len:
                pairs.append((src, trg))
    return pairs

def split_pairs(pairs, train_ratio=0.8, val_ratio=0.1):
    n = len(pairs)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    return pairs[:n_train], pairs[n_train:n_train + n_val], pairs[n_train + n_val:]

def build_vocab(token_lists, min_freq=1, max_size=None, specials=SPECIALS):
    counter = Counter()
    for toks in token_lists:
        counter.update(toks)
    filtered = [(w, c) for w, c in counter.items() if c >= min_freq]
    filtered.sort(key=lambda x: (-x[1], x[0]))
    if max_size is not None:
        filtered = filtered[:max(0, max_size - len(specials))]
    vocab = {sp: i for i, sp in enumerate(specials)}
    for w, _ in filtered:
        if w not in vocab:
            vocab[w] = len(vocab)
    itos = {i: w for w, i in vocab.items()}
    return vocab, itos

def epoch_run(model, loader, criterion, optimizer, train=True, teacher_forcing=0.5):
    model.train() if train else model.eval()
    total_loss, total_tokens = 0.0, 0
    device = next(model.parameters()).device
    
    with torch.set_grad_enabled(train):
        for src, trg in tqdm(loader):
            src = src.to(device).T
            trg = trg.to(device).T
            
            outputs, _att = model(src, trg, teacher_forcing_ratio=teacher_forcing if train else 0.0)
            
            logits = outputs[1:].reshape(-1, outputs.size(-1))
            target = trg[1:].reshape(-1)
            
            loss = criterion(logits, target)
            
            if train:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            
            n_tokens = (target != PAD).sum().item()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens
            
    avg_loss = total_loss / max(1, total_tokens)
    ppl = math.exp(avg_loss) if avg_loss < 20 else float("inf")
    return avg_loss, ppl

def evaluate_sacrebleu(model, loader, trg_itos=None, sp_trg=None, beam_size=1):
    model.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for src, trg in tqdm(loader):
            src, trg = src.to(model.device).T, trg.to(model.device).T
            if beam_size > 1:
                ys = model.beam_search_decode(src, max_len=40, beam_size=beam_size)
            else:
                ys, _ = model.greedy_decode(src, max_len=40)
            
            if sp_trg:
                pass
            else:
                for y in ys.T.tolist():
                    hyps.append(decode_ids(torch.tensor(y), trg_itos))
            
            if sp_trg:
                pass
            else:
                for t in trg.T.tolist():
                    refs.append(decode_ids(torch.tensor(t), trg_itos))
    
    refs_sacrebleu = [[ref] for ref in refs]
    bleu = sacrebleu.corpus_bleu(hyps, refs_sacrebleu).score
    return bleu

# ---- NMTDataset class ----
class NMTDataset(Dataset):
    def __init__(self, pairs, src_vocab, trg_vocab):
        self.data = [(to_ids(src, src_vocab), to_ids(trg, trg_vocab)) for src, trg in pairs]
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        src_ids, trg_ids = self.data[idx]
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(trg_ids, dtype=torch.long)

# Fungsi untuk menjalankan satu eksperimen penuh
def run_experiment(args, en_vocab, id_vocab, train_loader, val_loader, test_loader, en_itos, id_itos):
    print(f"\n--- Running Experiment: Dropout={args.dropout}, Hidden Size={args.encoder_hidden_size}/{args.decoder_hidden_size} ---")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    encoder = BahdanauEncoder(input_dim=len(en_vocab), embedding_dim=args.encoder_embedding_dim, encoder_hidden_dim=args.encoder_hidden_size, decoder_hidden_dim=args.decoder_hidden_size, dropout_p=args.dropout)
    attn = BahdanauAttentionQKV(hidden_size=args.decoder_hidden_size, query_size=args.decoder_hidden_size, key_size=2 * args.encoder_hidden_size, dropout_p=0.0)
    decoder = BahdanauDecoder(output_dim=len(id_vocab), embedding_dim=args.decoder_embedding_dim, encoder_hidden_dim=args.encoder_hidden_size, decoder_hidden_dim=args.decoder_hidden_size, attention=attn, dropout_p=args.dropout)
    
    seq2seq = BahdanauSeq2Seq(encoder, decoder, device, pad_id=PAD, bos_id=BOS, eos_id=EOS).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD)
    optimizer = torch.optim.Adam(seq2seq.parameters(), lr=args.lr)
    
    history = {"train_loss": [], "val_loss": [], "train_ppl": [], "val_ppl": [], "val_bleu": []}
    EPOCHS = args.epochs
    best_val_bleu = -1.0 # Ubah dari val_loss ke val_bleu
    best_epoch = 0
    
    for epoch in range(1, EPOCHS + 1):
        tf = max(0.3, 0.7 - 0.04 * (epoch - 1))
        train_loss, train_ppl = epoch_run(seq2seq, train_loader, criterion, optimizer, train=True, teacher_forcing=tf)
        val_loss, val_ppl = epoch_run(seq2seq, val_loader, criterion, optimizer, train=False, teacher_forcing=0.0)
        
        val_bleu = evaluate_sacrebleu(seq2seq, val_loader, trg_itos=id_itos, beam_size=1)
        
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_ppl"].append(train_ppl)
        history["val_ppl"].append(val_ppl)
        history["val_bleu"].append(val_bleu)

        print(f"Epoch {epoch:02d} | TF={tf:.2f} | Train Loss {train_loss:.4f} PPL {train_ppl:.2f} | Val Loss {val_loss:.4f} PPL {val_ppl:.2f} | Val Bleu {val_bleu:.4f} ")

        if val_bleu > best_val_bleu: # Ubah kriteria penyimpanan
            best_val_bleu = val_bleu
            best_epoch = epoch
            torch.save(seq2seq.state_dict(), args.checkpoint)
            print(f"Saving best model at epoch {best_epoch} with BLEU {best_val_bleu:.4f}")

    seq2seq.load_state_dict(torch.load(args.checkpoint, map_location=device))
    test_loss, test_ppl = epoch_run(seq2seq, test_loader, criterion, optimizer, train=False, teacher_forcing=0.0)
    
    test_bleu = evaluate_sacrebleu(seq2seq, test_loader, trg_itos=id_itos, beam_size=3)
    
    print(f"\n--- Experiment Results ---")
    print(f"Parameters: Dropout={args.dropout}, Hidden Size={args.encoder_hidden_size}/{args.decoder_hidden_size}")
    print(f"Best Val BLEU: {best_val_bleu:.4f} at Epoch {best_epoch}")
    print(f"TEST | Loss {test_loss:.4f} | PPL {test_ppl:.2f} | SacreBLEU {test_bleu:.2f}")
    
    # Simpan hasil ke file CSV untuk analisis
    with open("ablation_study_results.csv", "a", newline="") as f:
        writer = csv.writer(f)
        if f.tell() == 0:
            writer.writerow(["Experiment", "Dropout", "Enc Hidden", "Dec Hidden", "Best Val BLEU", "Test BLEU"])
        writer.writerow([f"Exp {args.exp_id}", args.dropout, args.encoder_hidden_size, args.decoder_hidden_size, best_val_bleu, test_bleu])

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='/kaggle/input/translate5/ind-eng/ind.txt', help='Path to txt data')
    parser.add_argument('--epochs', type=int, default=10, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
    parser.add_argument('--lr', type=float, default=1e-3, help='Learning rate')
    parser.add_argument('--max_vocab', type=int, default=None)
    parser.add_argument('--target_lang', type=str, default='ID', help='Bahasa tujuan')
    parser.add_argument('--checkpoint', type=str, default='bahdanau_best.pt', help='Path to save model checkpoint')
    
    # Tambahkan argumen untuk ablation study
    parser.add_argument('--dropout', type=float, default=0.15)
    parser.add_argument('--encoder_hidden_size', type=int, default=512)
    parser.add_argument('--decoder_hidden_size', type=int, default=256)
    parser.add_argument('--encoder_embedding_dim', type=int, default=256)
    parser.add_argument('--decoder_embedding_dim', type=int, default=256)
    parser.add_argument('--exp_id', type=int, default=0)

    # Parsing argumen untuk data preparation
    args, unknown = parser.parse_known_args()
    
    data_file = Path(args.data_path)
    pairs = load_pairs(data_file, max_len=20, max_pairs=None)
    train_pairs, val_pairs, test_pairs = split_pairs(pairs, 0.8, 0.1)
    
    en_vocab, en_itos = build_vocab([src for src, _ in train_pairs], max_size=args.max_vocab)
    id_vocab, id_itos = build_vocab([tgt for _, tgt in train_pairs], max_size=args.max_vocab)
    
    train_ds = NMTDataset(train_pairs, en_vocab, id_vocab)
    val_ds = NMTDataset(val_pairs, en_vocab, id_vocab)
    test_ds = NMTDataset(test_pairs, en_vocab, id_vocab)
    
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_batch)
    test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_batch)

    # --- Jalankan Eksperimen Ablasi ---
    # Eksperimen 1: Baseline
    args.exp_id = 1
    args.dropout = 0.15
    args.encoder_hidden_size = 512
    args.decoder_hidden_size = 256
    args.checkpoint = 'bahdanau_baseline.pt'
    run_experiment(args, en_vocab, id_vocab, train_loader, val_loader, test_loader, en_itos, id_itos)
    
    # Eksperimen 2: Mengubah Dropout
    args.exp_id = 2
    args.dropout = 0.3
    args.encoder_hidden_size = 512
    args.decoder_hidden_size = 256
    args.checkpoint = 'bahdanau_dropout.pt'
    run_experiment(args, en_vocab, id_vocab, train_loader, val_loader, test_loader, en_itos, id_itos)
    
    # Eksperimen 3: Mengubah Hidden Size
    args.exp_id = 3
    args.dropout = 0.15
    args.encoder_hidden_size = 256
    args.decoder_hidden_size = 128
    args.checkpoint = 'bahdanau_hidden.pt'
    run_experiment(args, en_vocab, id_vocab, train_loader, val_loader, test_loader, en_itos, id_itos)

if __name__ == "__main__":
    main()


--- Running Experiment: Dropout=0.15, Hidden Size=512/256 ---


100%|██████████| 372/372 [00:11<00:00, 32.84it/s]
100%|██████████| 47/47 [00:00<00:00, 78.37it/s]
100%|██████████| 47/47 [00:01<00:00, 30.24it/s]


Epoch 01 | TF=0.70 | Train Loss 4.4536 PPL 85.94 | Val Loss 5.1197 PPL 167.29 | Val Bleu 33.4370 
Saving best model at epoch 1 with BLEU 33.4370


100%|██████████| 372/372 [00:11<00:00, 32.84it/s]
100%|██████████| 47/47 [00:00<00:00, 79.20it/s]
100%|██████████| 47/47 [00:01<00:00, 31.09it/s]


Epoch 02 | TF=0.66 | Train Loss 2.2406 PPL 9.40 | Val Loss 4.9918 PPL 147.21 | Val Bleu 45.1801 
Saving best model at epoch 2 with BLEU 45.1801


100%|██████████| 372/372 [00:11<00:00, 33.31it/s]
100%|██████████| 47/47 [00:00<00:00, 78.87it/s]
100%|██████████| 47/47 [00:01<00:00, 30.90it/s]


Epoch 03 | TF=0.62 | Train Loss 1.2286 PPL 3.42 | Val Loss 5.1554 PPL 173.36 | Val Bleu 37.9918 


100%|██████████| 372/372 [00:11<00:00, 33.30it/s]
100%|██████████| 47/47 [00:00<00:00, 80.26it/s]
100%|██████████| 47/47 [00:01<00:00, 29.63it/s]


Epoch 04 | TF=0.58 | Train Loss 0.9097 PPL 2.48 | Val Loss 5.3033 PPL 201.01 | Val Bleu 25.4066 


100%|██████████| 372/372 [00:11<00:00, 33.13it/s]
100%|██████████| 47/47 [00:00<00:00, 80.11it/s]
100%|██████████| 47/47 [00:01<00:00, 31.07it/s]


Epoch 05 | TF=0.54 | Train Loss 0.7400 PPL 2.10 | Val Loss 5.5444 PPL 255.79 | Val Bleu 63.8943 
Saving best model at epoch 5 with BLEU 63.8943


100%|██████████| 372/372 [00:11<00:00, 32.93it/s]
100%|██████████| 47/47 [00:00<00:00, 78.95it/s]
100%|██████████| 47/47 [00:01<00:00, 30.37it/s]


Epoch 06 | TF=0.50 | Train Loss 0.6263 PPL 1.87 | Val Loss 5.6392 PPL 281.23 | Val Bleu 34.3295 


100%|██████████| 372/372 [00:11<00:00, 32.81it/s]
100%|██████████| 47/47 [00:00<00:00, 75.86it/s]
100%|██████████| 47/47 [00:01<00:00, 30.89it/s]


Epoch 07 | TF=0.46 | Train Loss 0.5463 PPL 1.73 | Val Loss 5.7780 PPL 323.13 | Val Bleu 39.7635 


100%|██████████| 372/372 [00:11<00:00, 32.99it/s]
100%|██████████| 47/47 [00:00<00:00, 78.42it/s]
100%|██████████| 47/47 [00:01<00:00, 30.77it/s]


Epoch 08 | TF=0.42 | Train Loss 0.5061 PPL 1.66 | Val Loss 5.8380 PPL 343.09 | Val Bleu 45.1801 


100%|██████████| 372/372 [00:11<00:00, 33.26it/s]
100%|██████████| 47/47 [00:00<00:00, 78.51it/s]
100%|██████████| 47/47 [00:01<00:00, 31.23it/s]


Epoch 09 | TF=0.38 | Train Loss 0.4597 PPL 1.58 | Val Loss 5.9303 PPL 376.29 | Val Bleu 45.1801 


100%|██████████| 372/372 [00:11<00:00, 32.68it/s]
100%|██████████| 47/47 [00:00<00:00, 80.23it/s]
100%|██████████| 47/47 [00:01<00:00, 30.75it/s]


Epoch 10 | TF=0.34 | Train Loss 0.4327 PPL 1.54 | Val Loss 6.0314 PPL 416.31 | Val Bleu 34.5721 


100%|██████████| 47/47 [00:00<00:00, 67.18it/s]
100%|██████████| 47/47 [00:01<00:00, 40.77it/s]



--- Experiment Results ---
Parameters: Dropout=0.15, Hidden Size=512/256
Best Val BLEU: 63.8943 at Epoch 5
TEST | Loss 7.3285 | PPL 1523.15 | SacreBLEU 48.89

--- Running Experiment: Dropout=0.3, Hidden Size=512/256 ---


100%|██████████| 372/372 [00:11<00:00, 33.28it/s]
100%|██████████| 47/47 [00:00<00:00, 80.24it/s]
100%|██████████| 47/47 [00:01<00:00, 30.82it/s]


Epoch 01 | TF=0.70 | Train Loss 4.5630 PPL 95.87 | Val Loss 5.0254 PPL 152.23 | Val Bleu 42.7287 
Saving best model at epoch 1 with BLEU 42.7287


100%|██████████| 372/372 [00:11<00:00, 33.06it/s]
100%|██████████| 47/47 [00:00<00:00, 79.15it/s]
100%|██████████| 47/47 [00:01<00:00, 29.99it/s]


Epoch 02 | TF=0.66 | Train Loss 2.5084 PPL 12.29 | Val Loss 4.8358 PPL 125.93 | Val Bleu 34.3295 


100%|██████████| 372/372 [00:11<00:00, 33.10it/s]
100%|██████████| 47/47 [00:00<00:00, 79.35it/s]
100%|██████████| 47/47 [00:01<00:00, 30.79it/s]


Epoch 03 | TF=0.62 | Train Loss 1.4805 PPL 4.40 | Val Loss 4.9500 PPL 141.17 | Val Bleu 56.2341 
Saving best model at epoch 3 with BLEU 56.2341


100%|██████████| 372/372 [00:11<00:00, 32.79it/s]
100%|██████████| 47/47 [00:00<00:00, 80.41it/s]
100%|██████████| 47/47 [00:01<00:00, 30.60it/s]


Epoch 04 | TF=0.58 | Train Loss 1.0413 PPL 2.83 | Val Loss 5.0715 PPL 159.41 | Val Bleu 29.0715 


100%|██████████| 372/372 [00:11<00:00, 33.21it/s]
100%|██████████| 47/47 [00:00<00:00, 76.79it/s]
100%|██████████| 47/47 [00:01<00:00, 30.68it/s]


Epoch 05 | TF=0.54 | Train Loss 0.8731 PPL 2.39 | Val Loss 5.3321 PPL 206.86 | Val Bleu 50.0000 


100%|██████████| 372/372 [00:11<00:00, 32.95it/s]
100%|██████████| 47/47 [00:00<00:00, 79.37it/s]
100%|██████████| 47/47 [00:01<00:00, 30.81it/s]


Epoch 06 | TF=0.50 | Train Loss 0.7589 PPL 2.14 | Val Loss 5.4235 PPL 226.68 | Val Bleu 39.7635 


100%|██████████| 372/372 [00:11<00:00, 33.38it/s]
100%|██████████| 47/47 [00:00<00:00, 81.41it/s]
100%|██████████| 47/47 [00:01<00:00, 30.86it/s]


Epoch 07 | TF=0.46 | Train Loss 0.6618 PPL 1.94 | Val Loss 5.6380 PPL 280.90 | Val Bleu 35.9304 


100%|██████████| 372/372 [00:11<00:00, 32.82it/s]
100%|██████████| 47/47 [00:00<00:00, 79.73it/s]
100%|██████████| 47/47 [00:01<00:00, 29.80it/s]


Epoch 08 | TF=0.42 | Train Loss 0.5908 PPL 1.81 | Val Loss 5.7655 PPL 319.11 | Val Bleu 29.0715 


100%|██████████| 372/372 [00:11<00:00, 33.04it/s]
100%|██████████| 47/47 [00:00<00:00, 78.35it/s]
100%|██████████| 47/47 [00:01<00:00, 30.82it/s]


Epoch 09 | TF=0.38 | Train Loss 0.5545 PPL 1.74 | Val Loss 5.7574 PPL 316.52 | Val Bleu 37.1501 


100%|██████████| 372/372 [00:11<00:00, 33.21it/s]
100%|██████████| 47/47 [00:00<00:00, 80.71it/s]
100%|██████████| 47/47 [00:01<00:00, 30.85it/s]


Epoch 10 | TF=0.34 | Train Loss 0.5383 PPL 1.71 | Val Loss 5.9943 PPL 401.12 | Val Bleu 33.4370 


100%|██████████| 47/47 [00:00<00:00, 62.59it/s]
100%|██████████| 47/47 [00:01<00:00, 42.18it/s]



--- Experiment Results ---
Parameters: Dropout=0.3, Hidden Size=512/256
Best Val BLEU: 56.2341 at Epoch 3
TEST | Loss 6.5165 | PPL 676.19 | SacreBLEU 33.03

--- Running Experiment: Dropout=0.15, Hidden Size=256/128 ---


100%|██████████| 372/372 [00:10<00:00, 34.61it/s]
100%|██████████| 47/47 [00:00<00:00, 82.47it/s]
100%|██████████| 47/47 [00:01<00:00, 31.66it/s]


Epoch 01 | TF=0.70 | Train Loss 4.7220 PPL 112.40 | Val Loss 5.2293 PPL 186.67 | Val Bleu 42.7287 
Saving best model at epoch 1 with BLEU 42.7287


100%|██████████| 372/372 [00:10<00:00, 35.05it/s]
100%|██████████| 47/47 [00:00<00:00, 82.70it/s]
100%|██████████| 47/47 [00:01<00:00, 32.48it/s]


Epoch 02 | TF=0.66 | Train Loss 2.6409 PPL 14.03 | Val Loss 4.8860 PPL 132.43 | Val Bleu 45.1801 
Saving best model at epoch 2 with BLEU 45.1801


100%|██████████| 372/372 [00:10<00:00, 34.75it/s]
100%|██████████| 47/47 [00:00<00:00, 83.09it/s]
100%|██████████| 47/47 [00:01<00:00, 31.60it/s]


Epoch 03 | TF=0.62 | Train Loss 1.5869 PPL 4.89 | Val Loss 4.8764 PPL 131.16 | Val Bleu 33.4370 


100%|██████████| 372/372 [00:10<00:00, 35.04it/s]
100%|██████████| 47/47 [00:00<00:00, 84.35it/s]
100%|██████████| 47/47 [00:01<00:00, 32.96it/s]


Epoch 04 | TF=0.58 | Train Loss 1.0479 PPL 2.85 | Val Loss 5.0464 PPL 155.46 | Val Bleu 45.1801 


100%|██████████| 372/372 [00:10<00:00, 35.04it/s]
100%|██████████| 47/47 [00:00<00:00, 82.32it/s]
100%|██████████| 47/47 [00:01<00:00, 32.49it/s]


Epoch 05 | TF=0.54 | Train Loss 0.7856 PPL 2.19 | Val Loss 5.1161 PPL 166.69 | Val Bleu 34.3295 


100%|██████████| 372/372 [00:10<00:00, 35.55it/s]
100%|██████████| 47/47 [00:00<00:00, 85.74it/s]
100%|██████████| 47/47 [00:01<00:00, 32.84it/s]


Epoch 06 | TF=0.50 | Train Loss 0.6478 PPL 1.91 | Val Loss 5.3200 PPL 204.38 | Val Bleu 33.4370 


100%|██████████| 372/372 [00:10<00:00, 34.75it/s]
100%|██████████| 47/47 [00:00<00:00, 84.44it/s]
100%|██████████| 47/47 [00:01<00:00, 31.69it/s]


Epoch 07 | TF=0.46 | Train Loss 0.5647 PPL 1.76 | Val Loss 5.4002 PPL 221.44 | Val Bleu 45.1801 


100%|██████████| 372/372 [00:10<00:00, 35.12it/s]
100%|██████████| 47/47 [00:00<00:00, 83.25it/s]
100%|██████████| 47/47 [00:01<00:00, 32.57it/s]


Epoch 08 | TF=0.42 | Train Loss 0.4964 PPL 1.64 | Val Loss 5.4534 PPL 233.55 | Val Bleu 35.9304 


100%|██████████| 372/372 [00:10<00:00, 34.99it/s]
100%|██████████| 47/47 [00:00<00:00, 83.16it/s]
100%|██████████| 47/47 [00:01<00:00, 32.30it/s]


Epoch 09 | TF=0.38 | Train Loss 0.4427 PPL 1.56 | Val Loss 5.6939 PPL 297.06 | Val Bleu 34.3295 


100%|██████████| 372/372 [00:10<00:00, 35.34it/s]
100%|██████████| 47/47 [00:00<00:00, 82.21it/s]
100%|██████████| 47/47 [00:01<00:00, 32.66it/s]


Epoch 10 | TF=0.34 | Train Loss 0.4076 PPL 1.50 | Val Loss 5.6355 PPL 280.20 | Val Bleu 30.7394 


100%|██████████| 47/47 [00:00<00:00, 70.00it/s]
100%|██████████| 47/47 [00:01<00:00, 38.58it/s]


--- Experiment Results ---
Parameters: Dropout=0.15, Hidden Size=256/128
Best Val BLEU: 45.1801 at Epoch 2
TEST | Loss 6.1900 | PPL 487.86 | SacreBLEU 21.36



