# 1. Chuẩn bị dữ liệu

# Cài đặt
- pip install spacy
- python -m spacy download en_core_web_sm
- python -m spacy download de_core_news_sm




# 2. Tokenization – dùng Spacy

In [22]:
import spacy

# English tokenizer
spacy_en = spacy.load("en_core_web_sm")
def tokenizer_en(text):
    return spacy_en.tokenizer(text)

# German tokenizer
spacy_de = spacy.load("de_core_news_sm")
def tokenizer_de(text):
    return spacy_de.tokenizer(text)


# 3.Load EN–DE từ file .gz

In [23]:
import gzip

def load_parallel_corpus(en_file, de_file):
    sentences_en = []
    sentences_de = []

    with gzip.open(en_file, 'rt', encoding='utf-8') as f_en, \
         gzip.open(de_file, 'rt', encoding='utf-8') as f_de:

        for en_line, de_line in zip(f_en, f_de):
            en = en_line.strip()
            de = de_line.strip()
            sentences_en.append(en)
            sentences_de.append(de)

    return sentences_en, sentences_de


# 3.1 Load train / val

In [24]:
train_en, train_de = load_parallel_corpus("train.en.gz", "train.de.gz")
val_en, val_de = load_parallel_corpus("val.en.gz", "val.de.gz")
# Kiểm tra đã load được chưa
print(train_en[0])
print(train_de[0])


Two young, White males are outside near many bushes.
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.


# 4. Xây dựng từ điển(Vocabulary)

In [25]:
from collections import Counter

special_tokens = ["<unk>", "<pad>", "<sos>", "<eos>"]

def build_vocab(sentences, tokenizer, max_words=10000):
    counter = Counter()
    for sent in sentences:
        tokens = [t.text.lower() for t in tokenizer(sent)]
        counter.update(tokens)

    # Chọn 10000 từ phổ biến nhất
    most_common = counter.most_common(max_words - len(special_tokens))

    vocab = special_tokens + [w for w, _ in most_common]
    stoi = {w: i for i, w in enumerate(vocab)}

    return vocab, stoi


# 4.1 Build vocab EN & DE

In [26]:
vocab_en, stoi_en = build_vocab(train_en, tokenizer_en)
vocab_de, stoi_de = build_vocab(train_de, tokenizer_de)

print("Vocabulary EN:", len(vocab_en))
print("Vocabulary DE:", len(vocab_de))


Vocabulary EN: 9797
Vocabulary DE: 10000


# 5. Hàm convert câu → id + thêm <sos> <eos>

In [27]:
def numericalize(sentence, tokenizer, stoi):
    tokens = ["<sos>"] + [t.text.lower() for t in tokenizer(sentence)] + ["<eos>"]
    return [stoi.get(tok, stoi["<unk>"]) for tok in tokens]


# 5.1 Tạo dataset dạng list of (tensor_en, tensor_de)

In [28]:
import torch

def make_dataset(en_sentences, de_sentences, tokenizer_en, tokenizer_de, stoi_en, stoi_de):
    data = []
    for en, de in zip(en_sentences, de_sentences):
        en_ids = torch.tensor(numericalize(en, tokenizer_en, stoi_en))
        de_ids = torch.tensor(numericalize(de, tokenizer_de, stoi_de))
        data.append((en_ids, de_ids))
    return data

train_dataset = make_dataset(train_en, train_de, tokenizer_en, tokenizer_de, stoi_en, stoi_de)
val_dataset   = make_dataset(val_en, val_de, tokenizer_en, tokenizer_de, stoi_en, stoi_de)


# 5.2 collate_fn (chuẩn cho padding + packing)

In [29]:
from torch.nn.utils.rnn import pad_sequence

PAD_IDX_EN = stoi_en["<pad>"]
PAD_IDX_DE = stoi_de["<pad>"]

def collate_fn(batch):
    # batch = [(en_ids, de_ids), ...]
    en_list = [item[0] for item in batch]
    de_list = [item[1] for item in batch]

    # Lấy độ dài gốc
    en_lengths = torch.tensor([len(x) for x in en_list])
    de_lengths = torch.tensor([len(x) for x in de_list])

    # Sắp xếp theo độ dài giảm dần (required for pack_padded_sequence)
    en_lengths, sort_idx = en_lengths.sort(descending=True)
    en_list = [en_list[i] for i in sort_idx]
    de_list = [de_list[i] for i in sort_idx]
    de_lengths = de_lengths[sort_idx]

    # Padding
    en_padded = pad_sequence(en_list, batch_first=True, padding_value=PAD_IDX_EN)
    de_padded = pad_sequence(de_list, batch_first=True, padding_value=PAD_IDX_DE)

    return en_padded, en_lengths, de_padded, de_lengths


# 6. DataLoader

In [30]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)


Cách dùng trong LSTM Encoder

In [31]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

def forward(self, src, src_lengths):
    # src shape: (batch, seq_len)
    embedded = self.embedding(src)

    packed = pack_padded_sequence(
        embedded,
        src_lengths.cpu(),
        batch_first=True,
        enforce_sorted=True
    )

    outputs, hidden = self.lstm(packed)

    outputs, _ = pad_packed_sequence(outputs, batch_first=True)

    return outputs, hidden


# 7. Xây dựng mô hình

## 7.1 Encoder

In [42]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, hidden_size=512, num_layers=2, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=stoi_en["<pad>"])
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True
        )

    def forward(self, src, src_lengths):
        # src: (batch, seq_len)
        embedded = self.embedding(src)  # (B, L, E)

        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=True
        )

        outputs, (h_n, c_n) = self.lstm(packed)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs, (h_n, c_n)


## 7.2 Decoder

In [39]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, hidden_size=512, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=stoi_de["<pad>"])

        self.lstm = nn.LSTM(
            embed_dim,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_token, hidden):
        # input_token: (batch,) 1 token tại bước t
        # hidden = (h, c)

        embedded = self.embedding(input_token).unsqueeze(1)  # (B,1,E)

        output, hidden = self.lstm(embedded, hidden)  # output: (B,1,H)

        logits = self.fc(output.squeeze(1))  # (B, vocab)

        return logits, hidden


## 7.3 Seq2Seq Model

In [40]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src, src_lengths, trg):
        # src: (B, Ls)
        # trg: (B, Lt)
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        # ---- Encoder ----
        _, hidden = self.encoder(src, src_lengths)

        # token đầu tiên cho decoder = <sos>
        input_token = trg[:, 0]

        for t in range(1, trg_len):
            logits, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = logits

            # chọn token dự đoán
            predicted = logits.argmax(dim=1)

            # teacher forcing ?
            if random.random() < self.teacher_forcing_ratio:
                input_token = trg[:, t]     # dùng ground truth
            else:
                input_token = predicted     # dùng dự đoán

        return outputs


## 7.4 Khởi tạo mô hình

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    vocab_size=len(vocab_en),
    embed_dim=512,
    hidden_size=512,
    num_layers=2,
    dropout=0.3
)

decoder = Decoder(
    vocab_size=len(vocab_de),
    embed_dim=512,
    hidden_size=512,
    num_layers=2,
    dropout=0.3
)

model = Seq2Seq(encoder, decoder, device).to(device)


# 8. Huấn luyện mô hình

In [47]:

import time
import random
import torch
import torch.nn as nn
import torch.optim as optim

# Config
LR = 0.001
NUM_EPOCHS = 10       # bạn có thể đặt 10-20
PATIENCE = 3            # early stopping nếu val_loss không giảm sau 3 epoch
CLIP = 1.0              # grad clipping
USE_SCHEDULER = True    # nếu muốn dùng ReduceLROnPlateau

# Loss & Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX_DE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Optional scheduler
try:
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=1, verbose=True
    )
except TypeError:
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=1
    )
# Helper: evaluation on validation set (no teacher forcing)
def evaluate(model, val_loader, criterion, device):
    model.eval()
    # Turn off teacher forcing during validation (full autoregressive)
    prev_tf = getattr(model, "teacher_forcing_ratio", 0.0)
    model.teacher_forcing_ratio = 0.0

    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for src, src_lengths, trg, trg_lengths in val_loader:
            src = src.to(device)
            src_lengths = src_lengths.to(device)
            trg = trg.to(device)

            outputs = model(src, src_lengths, trg)  # (B, T, V)
            vocab_size = outputs.size(-1)

            # ignore the first token (<sos>) when computing loss
            pred = outputs[:, 1:, :].contiguous().view(-1, vocab_size)   # (B*(T-1), V)
            target = trg[:, 1:].contiguous().view(-1)                    # (B*(T-1))

            loss = criterion(pred, target)
            total_loss += loss.item()
            n_batches += 1

    model.teacher_forcing_ratio = prev_tf
    return total_loss / (n_batches if n_batches > 0 else 1)

# Training loop
best_val_loss = float('inf')
epochs_no_improve = 0
history = {"train_loss": [], "val_loss": []}

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    model.train()
    train_loss = 0.0
    n_batches = 0

    for src, src_lengths, trg, trg_lengths in train_loader:
        src = src.to(device)
        src_lengths = src_lengths.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        outputs = model(src, src_lengths, trg)  # (B, T, V)
        vocab_size = outputs.size(-1)

        # shift: ignore <sos> token in loss
        pred = outputs[:, 1:, :].contiguous().view(-1, vocab_size)  # (B*(T-1), V)
        target = trg[:, 1:].contiguous().view(-1)                   # (B*(T-1))

        loss = criterion(pred, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        train_loss += loss.item()
        n_batches += 1

    avg_train_loss = train_loss / (n_batches if n_batches > 0 else 1)
    avg_val_loss = evaluate(model, val_loader, criterion, device)

    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(avg_val_loss)

    # Scheduler step on validation loss
    if USE_SCHEDULER:
        scheduler.step(avg_val_loss)

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pt")
        epochs_no_improve = 0
        best_note = " (best -> saved)"
    else:
        epochs_no_improve += 1
        best_note = ""

    elapsed = time.time() - start_time
    print(f"Epoch {epoch:02d} | Train loss: {avg_train_loss:.4f} | Val loss: {avg_val_loss:.4f}{best_note} | Time: {elapsed:.1f}s")

    # Early stopping
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping triggered. No improvement for {PATIENCE} epochs.")
        break

print("Training finished. Best val loss: {:.4f}".format(best_val_loss))

Epoch 01 | Train loss: 3.1716 | Val loss: 3.7108 (best -> saved) | Time: 1261.8s
Epoch 02 | Train loss: 2.8115 | Val loss: 3.6060 (best -> saved) | Time: 1360.9s
Epoch 03 | Train loss: 2.4953 | Val loss: 3.5828 (best -> saved) | Time: 1210.6s
Epoch 04 | Train loss: 2.2616 | Val loss: 3.6161 | Time: 1089.8s
Epoch 05 | Train loss: 2.0136 | Val loss: 3.5954 | Time: 1060.6s
Epoch 06 | Train loss: 1.7007 | Val loss: 3.6314 | Time: 1070.0s
Early stopping triggered. No improvement for 3 epochs.
Training finished. Best val loss: 3.5828


# 9. Dự đoán (Inference)

In [48]:

# Helper: Build reverse vocab (id -> token)
def build_itos(vocab):
    """Index to string mapping"""
    return {i: w for i, w in enumerate(vocab)}

itos_de = build_itos(vocab_de)

# Helper: Detokenize German sentence
def detokenize_de(tokens):
    """
    Ghép tokens lại thành câu (detokenize)
    Đơn giản: join với space, sau đó xử lý dấu câu và contractions
    """
    text = " ".join(tokens)
    # Xóa space trước dấu câu
    text = text.replace(" .", ".").replace(" ,", ",").replace(" !", "!").replace(" ?", "?")
    return text.strip()

def translate(sentence: str, model, device, tokenizer_en, stoi_en, itos_de, stoi_de, 
              max_length=50, beam_width=1) -> str:
    """
    Dịch câu tiếng Anh sang tiếng Đức (Greedy Decoding).
    
    Args:
        sentence: Input English sentence
        model: Seq2Seq model
        device: torch device (cpu/cuda)
        tokenizer_en: Spacy English tokenizer
        stoi_en: English string-to-index vocab
        itos_de: German index-to-string vocab
        max_length: Maximum output length
        beam_width: 1 for greedy, >1 for beam search (optional)
    
    Returns:
        Translated German sentence as string
    """
    model.eval()
    
    # ---- 1. Tokenize + Numericalize input (English) ----
    tokens_en = [t.text.lower() for t in tokenizer_en(sentence)]
    input_ids = [stoi_en.get("<sos>", 1)] + [stoi_en.get(tok, stoi_en["<unk>"]) for tok in tokens_en] + [stoi_en.get("<eos>", 3)]
    src_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # (1, seq_len)
    src_length = torch.tensor([len(input_ids)], dtype=torch.long).to(device)  # (1,)
    
    with torch.no_grad():
        # ---- 2. Encoder ----
        _, hidden = model.encoder(src_tensor, src_length)
        
        # ---- 3. Decoder (Greedy) ----
        output_ids = [stoi_de["<sos>"]]  # Start with <sos>
        input_token = torch.tensor([stoi_de["<sos>"]], dtype=torch.long).to(device)  # (1,)
        
        for t in range(1, max_length):
            logits, hidden = model.decoder(input_token, hidden)  # (1, vocab_size)
            
            # Greedy: chọn token có xác suất cao nhất
            predicted_id = logits.argmax(dim=1).item()  # scalar
            output_ids.append(predicted_id)
            
            # Nếu gặp <eos>, dừng
            if predicted_id == stoi_de["<eos>"]:
                break
            
            # input cho bước tiếp theo
            input_token = torch.tensor([predicted_id], dtype=torch.long).to(device)
    
    # ---- 4. Detokenize: convert id → token → text ----
    # Bỏ <sos> và <eos>
    output_tokens = [itos_de.get(idx, "<unk>") for idx in output_ids[1:]]
    if output_tokens and output_tokens[-1] == "<eos>":
        output_tokens = output_tokens[:-1]
    
    translated_sentence = detokenize_de(output_tokens)
    
    return translated_sentence


# ---- Test examples ----
test_sentences = [
    "Hello, how are you?",
    "What is your name?",
    "The weather is nice today."
]

print("=" * 60)
print("INFERENCE EXAMPLES (Greedy Decoding)")
print("=" * 60)

for en_sent in test_sentences:
    de_sent = translate(en_sent, model, device, tokenizer_en, stoi_en, itos_de, stoi_de)
    print(f"EN: {en_sent}")
    print(f"DE: {de_sent}")
    print()

INFERENCE EXAMPLES (Greedy Decoding)
EN: Hello, how are you?
DE: <unk> <unk> feiern.

EN: What is your name?
DE: ein künstler trägt eine <unk>.

EN: The weather is nice today.
DE: das <unk> ist eine <unk>.



# 10. Đánh giá

In [49]:

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
import math
import numpy as np

# Load test set (hoặc dùng tập val nếu không có test riêng)
# test_en, test_de = load_parallel_corpus("test.en.gz", "test.de.gz")
# Tạm dùng val set để demo
test_en, test_de = val_en[:200], val_de[:200]  # Lấy 200 câu từ val set

print(f"Evaluating on {len(test_en)} test sentences")

# ========== 1. Tính BLEU Score ==========

def compute_bleu_score(references, hypotheses):
    """
    Tính BLEU score trung bình trên corpus
    
    Args:
        references: list of list of reference sentences (tokens)
        hypotheses: list of hypothesis sentences (tokens)
    
    Returns:
        bleu_score (0-1)
    """
    total_bleu = 0.0
    n = len(hypotheses)
    
    bleu_scores = []
    for ref, hyp in zip(references, hypotheses):
        # sentence_bleu expects: reference (list of list), hypothesis (list)
        ref_tokens = ref.split()
        hyp_tokens = hyp.split()
        
        # weights for 1-gram, 2-gram, 3-gram, 4-gram
        weights = (0.25, 0.25, 0.25, 0.25)
        bleu = sentence_bleu([ref_tokens], hyp_tokens, weights=weights)
        bleu_scores.append(bleu)
        total_bleu += bleu
    
    avg_bleu = total_bleu / n
    return avg_bleu, bleu_scores


# ========== 2. Tính Perplexity ==========

def compute_perplexity(model, test_loader, criterion, device):
    """
    Tính Perplexity trên test set
    Perplexity = exp(loss)
    """
    model.eval()
    total_loss = 0.0
    n_tokens = 0
    
    with torch.no_grad():
        for src, src_lengths, trg, trg_lengths in test_loader:
            src = src.to(device)
            src_lengths = src_lengths.to(device)
            trg = trg.to(device)
            
            outputs = model(src, src_lengths, trg)
            vocab_size = outputs.size(-1)
            
            pred = outputs[:, 1:, :].contiguous().view(-1, vocab_size)
            target = trg[:, 1:].contiguous().view(-1)
            
            loss = criterion(pred, target)
            total_loss += loss.item() * target.size(0)
            n_tokens += (target != PAD_IDX_DE).sum().item()
    
    avg_loss = total_loss / n_tokens
    perplexity = math.exp(avg_loss)
    
    return perplexity, avg_loss


# ========== 3. Tạo Test DataLoader ==========

test_dataset = make_dataset(test_en, test_de, tokenizer_en, tokenizer_de, stoi_en, stoi_de)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)


# ========== 4. Dịch toàn bộ test set ==========

print("\nTranslating test set...")
predictions = []
for en_sent in test_en:
    de_pred = translate(en_sent, model, device, tokenizer_en, stoi_en, itos_de, stoi_de)
    predictions.append(de_pred)

print(f"Translated {len(predictions)} sentences")


# ========== 5. Tính BLEU & Perplexity ==========

print("\n" + "="*70)
print("EVALUATION METRICS")
print("="*70)

# BLEU Score
avg_bleu, bleu_scores = compute_bleu_score(test_de, predictions)
print(f"\nBLEU Score (average): {avg_bleu:.4f}")

# Perplexity
perplexity, avg_loss = compute_perplexity(model, test_loader, criterion, device)
print(f"Perplexity: {perplexity:.4f}")
print(f"Average Loss: {avg_loss:.4f}")


# ========== 6. Error Analysis: 5 ví dụ đúng + sai ==========

print("\n" + "="*70)
print("DETAILED EXAMPLES & ERROR ANALYSIS")
print("="*70)

# Sắp xếp theo BLEU score để lấy ví dụ tốt nhất và xấu nhất
indices = np.argsort(bleu_scores)

# 5 ví dụ tốt nhất (highest BLEU)
print("\n--- TOP 5 BEST TRANSLATIONS (Highest BLEU) ---\n")
best_indices = indices[-5:][::-1]
for rank, idx in enumerate(best_indices, 1):
    en = test_en[idx]
    de_ref = test_de[idx]
    de_pred = predictions[idx]
    bleu = bleu_scores[idx]
    
    print(f"{rank}. BLEU: {bleu:.4f}")
    print(f"   EN:  {en}")
    print(f"   REF: {de_ref}")
    print(f"   PRED: {de_pred}")
    print()

# 5 ví dụ tệ nhất (lowest BLEU)
print("\n--- TOP 5 WORST TRANSLATIONS (Lowest BLEU) ---\n")
worst_indices = indices[:5]
for rank, idx in enumerate(worst_indices, 1):
    en = test_en[idx]
    de_ref = test_de[idx]
    de_pred = predictions[idx]
    bleu = bleu_scores[idx]
    
    print(f"{rank}. BLEU: {bleu:.4f}")
    print(f"   EN:  {en}")
    print(f"   REF: {de_ref}")
    print(f"   PRED: {de_pred}")
    
    # Phân tích lỗi
    ref_tokens = set(de_ref.split())
    pred_tokens = set(de_pred.split())
    
    missing = ref_tokens - pred_tokens
    extra = pred_tokens - ref_tokens
    
    if missing or extra:
        print(f"   ERROR ANALYSIS:")
        if missing:
            print(f"     - Missing words: {', '.join(list(missing)[:5])}")
        if extra:
            print(f"     - Extra words: {', '.join(list(extra)[:5])}")
    print()


# ========== 7. Thống kê BLEU Distribution ==========

print("\n" + "="*70)
print("BLEU SCORE DISTRIBUTION")
print("="*70)

bleu_array = np.array(bleu_scores)
print(f"\nMin BLEU:    {bleu_array.min():.4f}")
print(f"Max BLEU:    {bleu_array.max():.4f}")
print(f"Mean BLEU:   {bleu_array.mean():.4f}")
print(f"Median BLEU: {np.median(bleu_array):.4f}")
print(f"Std BLEU:    {bleu_array.std():.4f}")

# Phân loại theo BLEU ranges
bleu_ranges = {
    "0.0-0.2": (bleu_array >= 0.0) & (bleu_array < 0.2),
    "0.2-0.4": (bleu_array >= 0.2) & (bleu_array < 0.4),
    "0.4-0.6": (bleu_array >= 0.4) & (bleu_array < 0.6),
    "0.6-0.8": (bleu_array >= 0.6) & (bleu_array < 0.8),
    "0.8-1.0": (bleu_array >= 0.8) & (bleu_array <= 1.0),
}

print("\nBLEU Score Distribution by Range:")
for range_name, mask in bleu_ranges.items():
    count = mask.sum()
    pct = 100 * count / len(bleu_array)
    print(f"  {range_name}: {count:4d} ({pct:5.1f}%)")


# ========== 8. Common Error Patterns ==========

print("\n" + "="*70)
print("COMMON ERROR PATTERNS")
print("="*70)

error_patterns = {
    "length_mismatch": 0,
    "word_substitution": 0,
    "omission": 0,
    "insertion": 0,
}

for idx in range(len(test_de)):
    ref_tokens = test_de[idx].split()
    pred_tokens = predictions[idx].split()
    
    if len(pred_tokens) < len(ref_tokens) * 0.7:
        error_patterns["omission"] += 1
    elif len(pred_tokens) > len(ref_tokens) * 1.3:
        error_patterns["insertion"] += 1
    elif len(pred_tokens) != len(ref_tokens):
        error_patterns["length_mismatch"] += 1
    
    if ref_tokens != pred_tokens:
        # Check for word substitutions
        matching = sum(1 for r, p in zip(ref_tokens, pred_tokens) if r == p)
        if matching < len(ref_tokens):
            error_patterns["word_substitution"] += 1

print("\nError Pattern Frequencies (out of {} sentences):".format(len(test_de)))
for pattern, count in error_patterns.items():
    pct = 100 * count / len(test_de)
    print(f"  {pattern}: {count:4d} ({pct:5.1f}%)")

print("\n" + "="*70)

Evaluating on 200 test sentences

Translating test set...
Translated 200 sentences

EVALUATION METRICS

BLEU Score (average): 0.0054


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Perplexity: 676.4170
Average Loss: 6.5168

DETAILED EXAMPLES & ERROR ANALYSIS

--- TOP 5 BEST TRANSLATIONS (Highest BLEU) ---

1. BLEU: 0.3156
   EN:  A man playing a keyboard and singing into a microphone.
   REF: Eine Frau spielt Keyboard und singt in ein Mikrofon.
   PRED: ein mann spielt gitarre und singt in ein mikrofon.

2. BLEU: 0.3156
   EN:  A brown dog chewing on a large piece of wood.
   REF: Ein brauner Hund kaut auf einem großen Holzstück herum.
   PRED: ein brauner hund kaut auf einem großen stück holz.

3. BLEU: 0.2790
   EN:  A man sleeping in a green room on a couch.
   REF: Ein Mann schläft in einem grünen Raum auf einem Sofa.
   PRED: ein mann schläft in einem grünen zimmer auf einem grünen sofa.

4. BLEU: 0.1750
   EN:  A balding man wearing a red life jacket is sitting in a small boat.
   REF: Ein Mann mit beginnender Glatze, der eine rote Rettungsweste trägt, sitzt in einem kleinen Boot.
   PRED: ein mann mit einer roten jacke sitzt in einem kleinen boot auf einem

# 11. Xử lý các phần khó

In [50]:

print("\n" + "="*80)
print("TROUBLESHOOTING & DEBUGGING GUIDE")
print("="*80)

# ========== 1. Kiểm tra Shape của Tensors ==========

print("\n[1] CHECKING TENSOR SHAPES")
print("-" * 80)

def check_tensor_shapes():
    """Kiểm tra shape của các tensor trong training"""
    print("Sample batch shapes:")
    
    # Lấy một batch để kiểm tra
    for src, src_lengths, trg, trg_lengths in train_loader:
        print(f"  src shape:          {src.shape} (batch, seq_len)")
        print(f"  src_lengths shape:  {src_lengths.shape}")
        print(f"  trg shape:          {trg.shape}")
        print(f"  trg_lengths shape:  {trg_lengths.shape}")
        
        src = src.to(device)
        src_lengths = src_lengths.to(device)
        trg = trg.to(device)
        
        # Forward pass (training mode)
        model.train()
        outputs = model(src, src_lengths, trg)
        
        print(f"\n  model output shape: {outputs.shape} (batch, seq_len, vocab_size)")
        print(f"  Expected: ({src.size(0)}, {trg.size(1)}, {len(vocab_de)})")
        
        # Kiểm tra loss
        vocab_size = outputs.size(-1)
        pred = outputs[:, 1:, :].contiguous().view(-1, vocab_size)
        target = trg[:, 1:].contiguous().view(-1)
        
        print(f"\n  pred shape (after reshape): {pred.shape}")
        print(f"  target shape (after reshape): {target.shape}")
        
        loss = criterion(pred, target)
        print(f"  loss: {loss.item():.4f}")
        
        break

check_tensor_shapes()


# ========== 2. Kiểm tra Data Normalization ==========

print("\n\n[2] CHECKING DATA NORMALIZATION")
print("-" * 80)

def check_data_stats():
    """Kiểm tra thống kê dữ liệu: độ dài câu, phân bố từ"""
    
    # Độ dài câu
    en_lengths = [len(s.split()) for s in train_en]
    de_lengths = [len(s.split()) for s in train_de]
    
    print("English sentence lengths:")
    print(f"  Min: {min(en_lengths)}, Max: {max(en_lengths)}, Mean: {np.mean(en_lengths):.1f}")
    print(f"  Median: {np.median(en_lengths):.1f}, Std: {np.std(en_lengths):.1f}")
    
    print("\nGerman sentence lengths:")
    print(f"  Min: {min(de_lengths)}, Max: {max(de_lengths)}, Mean: {np.mean(de_lengths):.1f}")
    print(f"  Median: {np.median(de_lengths):.1f}, Std: {np.std(de_lengths):.1f}")
    
    # Cảnh báo nếu có câu quá dài
    max_len_threshold = 50
    en_too_long = sum(1 for l in en_lengths if l > max_len_threshold)
    de_too_long = sum(1 for l in de_lengths if l > max_len_threshold)
    
    print(f"\nSentences longer than {max_len_threshold} tokens:")
    print(f"  EN: {en_too_long} ({100*en_too_long/len(en_lengths):.1f}%)")
    print(f"  DE: {de_too_long} ({100*de_too_long/len(de_lengths):.1f}%)")
    
    if en_too_long > 0 or de_too_long > 0:
        print("\n  ⚠️ TIP: Consider filtering sentences > 50 tokens to reduce memory usage")
        print("         and improve training stability")

check_data_stats()


# ========== 3. Learning Rate & Gradient Check ==========

print("\n\n[3] CHECKING LEARNING RATE & GRADIENTS")
print("-" * 80)

def check_gradients():
    """Kiểm tra gradient flow"""
    model.train()
    
    # Lấy một batch
    for src, src_lengths, trg, trg_lengths in train_loader:
        src = src.to(device)
        src_lengths = src_lengths.to(device)
        trg = trg.to(device)
        
        optimizer.zero_grad()
        outputs = model(src, src_lengths, trg)
        vocab_size = outputs.size(-1)
        
        pred = outputs[:, 1:, :].contiguous().view(-1, vocab_size)
        target = trg[:, 1:].contiguous().view(-1)
        
        loss = criterion(pred, target)
        loss.backward()
        
        # Kiểm tra gradient norm
        total_norm = 0
        for p in model.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** 0.5
        
        print(f"Gradient Norm: {total_norm:.4f}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
        
        if total_norm > 100:
            print("⚠️  WARNING: Large gradient norm detected!")
            print("   - Consider increasing CLIP value or reducing learning rate")
        elif total_norm < 0.0001:
            print("⚠️  WARNING: Very small gradient norm!")
            print("   - Check if loss is saturating or learning rate is too small")
        else:
            print("✓ Gradient norm looks reasonable")
        
        break

check_gradients()


# ========== 4. Teacher Forcing Analysis ==========

print("\n\n[4] TEACHER FORCING & EXPOSURE BIAS")
print("-" * 80)

print(f"Current teacher_forcing_ratio: {model.teacher_forcing_ratio}")
print("\nRecommendations:")
print("  - Start with 0.5 (50% ground truth, 50% predictions)")
print("  - Use scheduled sampling: gradually decrease ratio during training")
print("  - Formula: tf_ratio = initial * exp(-decay * epoch)")
print("\nImplementation example:")
print("""
# Scheduled teacher forcing
def get_tf_ratio(epoch, initial_tf=0.5, decay=0.05):
    return initial_tf * math.exp(-decay * epoch)

# In training loop:
model.teacher_forcing_ratio = get_tf_ratio(epoch)
""")


# ========== 5. Overfitting Check ==========

print("\n\n[5] OVERFITTING DETECTION")
print("-" * 80)

if len(history["train_loss"]) > 2 and len(history["val_loss"]) > 2:
    train_loss_trend = history["train_loss"][-1] < history["train_loss"][0]
    val_loss_trend = history["val_loss"][-1] > history["val_loss"][0]
    
    gap = history["val_loss"][-1] - history["train_loss"][-1]
    
    print(f"Training Loss (first vs last): {history['train_loss'][0]:.4f} → {history['train_loss'][-1]:.4f}")
    print(f"Validation Loss (first vs last): {history['val_loss'][0]:.4f} → {history['val_loss'][-1]:.4f}")
    print(f"Train-Val Gap: {gap:.4f}")
    
    if gap > 0.5:
        print("\n⚠️  WARNING: Significant overfitting detected!")
        print("\nSolutions:")
        print("  1. Increase dropout (currently 0.3)")
        print("  2. Add L2 regularization (weight decay)")
        print("  3. Use early stopping (already enabled)")
        print("  4. Filter long sentences (max 50 tokens)")
        print("  5. Increase batch size")
    else:
        print("\n✓ Overfitting levels look reasonable")
else:
    print("Not enough epochs completed yet to assess overfitting")


# ========== 6. Loss Not Decreasing - Diagnostic ==========

print("\n\n[6] DIAGNOSING 'LOSS NOT DECREASING' ISSUES")
print("-" * 80)

print("""
Common causes and solutions:

1. LEARNING RATE TOO HIGH
   - Symptom: Loss oscillates or increases
   - Solution: Reduce LR (e.g., 0.001 → 0.0005)
   
2. LEARNING RATE TOO LOW
   - Symptom: Loss decreases very slowly
   - Solution: Increase LR (e.g., 0.0001 → 0.001)
   
3. GRADIENT VANISHING/EXPLODING
   - Symptom: Loss becomes NaN or Inf
   - Solution: Check gradient norm, increase CLIP value, use gradient clipping
   
4. BAD DATA
   - Symptom: Loss plateaus at high value
   - Solution: Check data quality, verify tokenization, ensure padding is correct
   
5. MODEL TOO SMALL
   - Symptom: Slow improvement on training set
   - Solution: Increase embed_dim, hidden_size, or num_layers
   
6. BATCH SIZE ISSUES
   - Too small: Noisy gradients, slow training
   - Too large: Memory issues, poor generalization
   - Try: 32, 64, 128
""")


# ========== 7. Memory & Performance Tips ==========

print("\n\n[7] MEMORY & PERFORMANCE OPTIMIZATION")
print("-" * 80)

print("""
Memory-saving strategies:

1. FILTER LONG SENTENCES
   - Limit to max_len=50 tokens
   - Code example:
   
   def filter_by_length(en_sents, de_sents, max_len=50):
       data = [(en, de) for en, de in zip(en_sents, de_sents)
               if len(en.split()) <= max_len and len(de.split()) <= max_len]
       en_filtered, de_filtered = zip(*data)
       return list(en_filtered), list(de_filtered)
   
   train_en, train_de = filter_by_length(train_en, train_de, max_len=50)

2. REDUCE VOCAB SIZE
   - Currently: 10,000 words
   - Try: 5,000 or 8,000
   - Trade-off: Less <unk> tokens vs smaller model

3. REDUCE EMBEDDING/HIDDEN DIMENSION
   - Current: embed_dim=512, hidden_size=512
   - Try: 256 or 384
   - Still gets decent results with lower memory

4. USE GRADIENT ACCUMULATION (if needed)
   - Simulate larger batch size with smaller batches
   
5. MIXED PRECISION (if using CUDA)
   - Use torch.cuda.amp for faster computation
""")


# ========== 8. Monitoring Checklist ==========

print("\n\n[8] TRAINING MONITORING CHECKLIST")
print("-" * 80)

checklist = {
    "Tensor shapes": "✓ Verify in [1]",
    "Data stats": "✓ Check in [2]",
    "Gradient flow": "✓ Monitor in [3]",
    "Teacher forcing": "✓ Review in [4]",
    "Overfitting": "✓ Assess in [5]",
    "Learning rate": "Adjust based on loss curve",
    "Loss trend": "Should decrease monotonically (with fluctuations)",
    "Validation loss": "Should decrease, gap with train loss < 0.5",
    "Checkpoints": "Save best model (already doing)",
    "Early stopping": "Patience=3 (already enabled)",
}

for item, status in checklist.items():
    print(f"  ☐ {item:30s} - {status}")


# ========== 9. Quick Debugging Code ==========

print("\n\n[9] QUICK DEBUG: Run this if loss gets stuck")
print("-" * 80)

debug_code = """
# Step 1: Check a single batch
src, src_lengths, trg, trg_lengths = next(iter(train_loader))
print("Input shapes OK:", src.shape, trg.shape)

# Step 2: Forward pass
model.eval()
with torch.no_grad():
    out = model(src.to(device), src_lengths.to(device), trg.to(device))
    print("Output shape OK:", out.shape)

# Step 3: Compute loss manually
pred = out[:, 1:, :].contiguous().view(-1, len(vocab_de))
target = trg[:, 1:].contiguous().view(-1)
loss = criterion(pred, target)
print("Loss OK:", loss.item())

# Step 4: Check for NaN/Inf
print("Contains NaN:", torch.isnan(out).any().item())
print("Contains Inf:", torch.isinf(out).any().item())
"""

print(debug_code)

print("\n" + "="*80)
print("END TROUBLESHOOTING GUIDE")
print("="*80)


TROUBLESHOOTING & DEBUGGING GUIDE

[1] CHECKING TENSOR SHAPES
--------------------------------------------------------------------------------
Sample batch shapes:
  src shape:          torch.Size([64, 32]) (batch, seq_len)
  src_lengths shape:  torch.Size([64])
  trg shape:          torch.Size([64, 25])
  trg_lengths shape:  torch.Size([64])

  model output shape: torch.Size([64, 25, 10000]) (batch, seq_len, vocab_size)
  Expected: (64, 25, 10000)

  pred shape (after reshape): torch.Size([1536, 10000])
  target shape (after reshape): torch.Size([1536])
  loss: 1.2930


[2] CHECKING DATA NORMALIZATION
--------------------------------------------------------------------------------
English sentence lengths:
  Min: 3, Max: 37, Mean: 11.9
  Median: 11.0, Std: 3.8

German sentence lengths:
  Min: 1, Max: 39, Mean: 11.1
  Median: 11.0, Std: 3.8

Sentences longer than 50 tokens:
  EN: 0 (0.0%)
  DE: 0 (0.0%)


[3] CHECKING LEARNING RATE & GRADIENTS
-----------------------------------------