<a href="https://colab.research.google.com/github/Anphan0612/NLP/blob/main/notebooks/seq2seq_en_fr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seq2Seq LSTM English→French (Multi30K)

- Encoder-Decoder LSTM with fixed context vector, teacher forcing, greedy decoding.
- Optional Luong attention + beam search for extended experiments.
- Dataset: Multi30K en-fr raw (train/val/test) downloaded from GitHub.
- Outputs: `best_model.pth`, `translate(sentence)`, BLEU score, loss plots, 5 example translations + error analysis notes.

> Run this notebook end-to-end on a Colab GPU runtime (Python 3; the first cell installs the pinned torch 2.4.1).


In [None]:
# Install packages
# NOTE(review): torchtext 0.18.0 was released alongside torch 2.3.0 — confirm
# that it imports cleanly with the torch 2.4.1 pinned below.
%pip install -q torch==2.4.1 torchtext==0.18.0 spacy==3.7.4 nltk==3.9.1 tqdm matplotlib
%pip install -q sentencepiece  # optional, in case BPE is needed later

import spacy, nltk
# Download the English and French spaCy pipelines and the NLTK "punkt" data.
spacy.cli.download("en_core_web_sm")
spacy.cli.download("fr_core_news_sm")
nltk.download("punkt")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.0/797.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import math
import random
import json
from pathlib import Path
from typing import List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torchtext
from torchtext.vocab import build_vocab_from_iterator
import spacy
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import matplotlib.pyplot as plt
from tqdm import tqdm

# Reproducibility: seed Python's `random` and PyTorch (CPU and all CUDA devices)
# with the same value.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and turn off its benchmark/autotune mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
# `device` is read as a global by later cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


In [None]:
# Paths (all relative to the current working directory)
DATA_DIR = Path("data/multi30k_en_fr")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
CHECKPOINT_DIR = Path("checkpoints")
# Create the directories up front; exist_ok makes re-running the cell harmless.
for p in [RAW_DIR, PROCESSED_DIR, CHECKPOINT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Download Multi30K en-fr raw files if missing
# Maps the local file name (under RAW_DIR) to its source URL. The test split is
# stored locally as test.en / test.fr but comes from the test_2016_flickr files.
urls = {
    "train.en": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en",
    "train.fr": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.fr",
    "val.en": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.en",
    "val.fr": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.fr",
    "test.en": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.en",
    "test.fr": "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.fr",
}

import requests

def download_file(url, dest):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        dest: ``pathlib.Path`` of the local target file.

    Raises:
        requests.HTTPError: if the server answers with an error status.

    The payload is first written to a sibling ``<name>.part`` file and then
    renamed onto ``dest``. An interrupted write therefore never leaves a
    truncated ``dest`` behind that a later run would mistake for a finished
    download and skip.
    """
    if dest.exists():
        return
    print(f"Downloading {dest.name} ...")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    partial = dest.with_name(dest.name + ".part")
    partial.write_bytes(r.content)
    # Path.replace overwrites the target, so the file only appears once complete.
    partial.replace(dest)

# Fetch every file listed in `urls` into RAW_DIR under its local name.
for fname, url in urls.items():
    download_file(url, RAW_DIR / fname)

print("Data ready at", RAW_DIR.resolve())


In [None]:
# Load the spaCy pipelines whose `.tokenizer` is used by tokenize_en / tokenize_fr.
spacy_en = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")

def tokenize_en(text: str) -> List[str]:
    """Tokenize English ``text`` with the spaCy tokenizer and lowercase each token."""
    cleaned = text.strip()
    return [token.text.lower() for token in spacy_en.tokenizer(cleaned)]


def tokenize_fr(text: str) -> List[str]:
    """Tokenize French ``text`` with the spaCy tokenizer and lowercase each token."""
    cleaned = text.strip()
    return [token.text.lower() for token in spacy_fr.tokenizer(cleaned)]

# Special tokens. Their positions in SPECIALS match the *_IDX constants below.
# NOTE(review): this relies on the vocab builder assigning the specials to
# indices 0..3 in this order — confirm against the torchtext version in use.
SPECIALS = ["<unk>", "<pad>", "<sos>", "<eos>"]
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3


In [None]:
def read_parallel(split: str) -> Tuple[List[str], List[str]]:
    """Read the aligned English/French files of one split from RAW_DIR.

    Args:
        split: file stem, e.g. ``"train"``; the files ``<split>.en`` and
            ``<split>.fr`` are read as UTF-8.

    Returns:
        ``(en_lines, fr_lines)`` with surrounding whitespace stripped from
        every line.

    Raises:
        AssertionError: if the two files differ in line count.
    """
    def _stripped_lines(path):
        with path.open("r", encoding="utf-8") as handle:
            return [raw.strip() for raw in handle]

    en_lines = _stripped_lines(RAW_DIR / f"{split}.en")
    fr_lines = _stripped_lines(RAW_DIR / f"{split}.fr")
    assert len(en_lines) == len(fr_lines), "Mismatched parallel data"
    return en_lines, fr_lines


def yield_tokens(lines: List[str], tokenizer):
    """Lazily yield ``tokenizer(line)`` for every line, in input order."""
    yield from map(tokenizer, lines)


def build_vocabs(max_size=10000, min_freq=2):
    """Build the English and French vocabularies from the training split.

    Args:
        max_size: forwarded as ``max_tokens`` to ``build_vocab_from_iterator``.
        min_freq: forwarded as ``min_freq`` to ``build_vocab_from_iterator``.

    Returns:
        ``(en_vocab, fr_vocab)``; both use UNK_IDX as their default index, so
        looking up an unknown token yields UNK_IDX.
    """
    train_en, train_fr = read_parallel("train")

    def _vocab_for(lines, tokenizer):
        # Same construction for both languages; only the text and tokenizer differ.
        vocab = build_vocab_from_iterator(
            yield_tokens(lines, tokenizer),
            specials=SPECIALS,
            max_tokens=max_size,
            min_freq=min_freq,
        )
        vocab.set_default_index(UNK_IDX)
        return vocab

    source_vocab = _vocab_for(train_en, tokenize_en)
    target_vocab = _vocab_for(train_fr, tokenize_fr)
    return source_vocab, target_vocab


# Build both vocabularies with the default limits; later cells read
# en_vocab / fr_vocab as globals.
en_vocab, fr_vocab = build_vocabs()
print("Vocab sizes:", len(en_vocab), len(fr_vocab))


In [None]:
class ParallelTextDataset(Dataset):
    """Map-style dataset of (English ids, French ids) tensor pairs for one split.

    Each item is tokenized with tokenize_en / tokenize_fr, mapped through
    en_vocab / fr_vocab and wrapped in SOS_IDX ... EOS_IDX.

    Encoded pairs are cached per index on first access, so spaCy tokenization
    and vocabulary lookup run once per sentence instead of once per epoch.
    The cached tensors are returned as-is: callers must not modify them in
    place, and the dataset should be re-created if the vocabularies change.
    """

    def __init__(self, split: str):
        """Load the raw lines of ``split`` (e.g. "train", "val", "test")."""
        self.en_lines, self.fr_lines = read_parallel(split)
        self.split = split
        # idx -> (en_tensor, fr_tensor), filled lazily by __getitem__.
        self._cache = {}

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.en_lines)

    def __getitem__(self, idx):
        """Return the ``(en_ids, fr_ids)`` pair at ``idx`` as 1-D long tensors."""
        cached = self._cache.get(idx)
        if cached is not None:
            return cached
        en_tok = tokenize_en(self.en_lines[idx])
        fr_tok = tokenize_fr(self.fr_lines[idx])
        en_ids = [SOS_IDX] + [en_vocab[t] for t in en_tok] + [EOS_IDX]
        fr_ids = [SOS_IDX] + [fr_vocab[t] for t in fr_tok] + [EOS_IDX]
        pair = (
            torch.tensor(en_ids, dtype=torch.long),
            torch.tensor(fr_ids, dtype=torch.long),
        )
        self._cache[idx] = pair
        return pair


def collate_fn(batch):
    """Collate (src, tgt) tensor pairs into padded, length-sorted batch tensors.

    Pairs are reordered by descending source length (needed for packing in the
    encoder), padded with PAD_IDX along dim 0 and moved to ``device``.

    Returns:
        ``(src_padded, tgt_padded, src_lengths)``; the padded tensors are
        laid out ``[seq_len, batch]`` and ``src_lengths`` follows the sorted
        order.
    """
    sources, targets = map(list, zip(*batch))
    lengths = torch.tensor([len(seq) for seq in sources])
    order = torch.argsort(lengths, descending=True)
    # Apply the same permutation to both sides so the pairs stay aligned.
    sources = [sources[i] for i in order]
    targets = [targets[i] for i in order]
    padded_src = pad_sequence(sources, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(targets, padding_value=PAD_IDX)
    sorted_lengths = lengths[order]
    return padded_src.to(device), padded_tgt.to(device), sorted_lengths.to(device)


BATCH_SIZE = 64
# One dataset per split; the split name selects <split>.en / <split>.fr in RAW_DIR.
train_ds = ParallelTextDataset("train")
val_ds = ParallelTextDataset("val")
test_ds = ParallelTextDataset("test")

# Only the training loader shuffles; all three use collate_fn for padding,
# length-sorting and the move to `device`.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("Batches:", len(train_loader), len(val_loader), len(test_loader))


In [None]:
class Encoder(nn.Module):
    """LSTM encoder over embedded source tokens, using packed sequences."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.3, bidirectional=False):
        """Create the embedding (PAD_IDX as padding index), the LSTM and dropout.

        Args:
            input_dim: source vocabulary size.
            emb_dim: embedding width.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: rate used on the embeddings and between LSTM layers.
            bidirectional: passed through to ``nn.LSTM``.
        """
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lengths):
        """Encode a padded batch.

        Args:
            src: token ids, ``[src_len, batch]``.
            src_lengths: true lengths per batch column; must already be in
                descending order because packing uses ``enforce_sorted=True``.

        Returns:
            ``(outputs, hidden, cell)`` where ``outputs`` is
            ``[src_len, batch, hid_dim * num_directions]``.
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        # pack_padded_sequence wants the lengths on the CPU.
        packed_in = pack_padded_sequence(embedded, src_lengths.cpu(), enforce_sorted=True)
        packed_out, state = self.rnn(packed_in)
        hidden, cell = state
        outputs, _ = pad_packed_sequence(packed_out)
        return outputs, hidden, cell


class LuongAttention(nn.Module):
    """Bilinear ("general") attention: score = (W · hidden) · encoder_output."""

    def __init__(self, hid_dim):
        """Create the bias-free ``hid_dim -> hid_dim`` projection of the query."""
        super().__init__()
        self.attn = nn.Linear(hid_dim, hid_dim, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Attend over the encoder outputs.

        Args:
            hidden: decoder state, ``[batch, hid_dim]``.
            encoder_outputs: ``[src_len, batch, hid_dim]``.
            mask: optional ``[batch, src_len]``; positions equal to 0 are
                excluded from the softmax.

        Returns:
            ``(context, attn_weights)`` shaped ``[batch, hid_dim]`` and
            ``[batch, src_len]``.
        """
        query = self.attn(hidden)
        scores = torch.einsum("bh,sbh->bs", query, encoder_outputs)
        if mask is not None:
            # A large negative score drives the softmax weight of masked slots to ~0.
            blocked = mask == 0
            scores = scores.masked_fill(blocked, -1e9)
        attn_weights = scores.softmax(dim=1)
        context = torch.einsum("bs,sbh->bh", attn_weights, encoder_outputs)
        return context, attn_weights


class Decoder(nn.Module):
    """Single-step LSTM decoder with optional Luong attention."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.3, use_attention=False):
        """Create the embedding, LSTM, output projection and optional attention.

        With attention enabled the output layer reads the concatenation of the
        LSTM output and the context vector, hence its doubled input width.
        """
        super().__init__()
        proj_in = hid_dim * 2 if use_attention else hid_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(proj_in, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.use_attention = use_attention
        self.attention = LuongAttention(hid_dim) if use_attention else None

    def forward(self, input, hidden, cell, encoder_outputs=None, mask=None):
        """Run one decoding step.

        Args:
            input: previous token ids, ``[batch]``.
            hidden, cell: LSTM state, ``[n_layers, batch, hid_dim]``.
            encoder_outputs: encoder outputs; only read when attention is on.
            mask: source padding mask forwarded to the attention module.

        Returns:
            ``(prediction, hidden, cell, attn_weights)``; ``prediction`` holds
            the vocabulary scores and ``attn_weights`` is ``None`` without
            attention.
        """
        step_tokens = input.unsqueeze(0)  # [1, batch]
        embedded = self.dropout(self.embedding(step_tokens))  # [1, batch, emb_dim]
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        top = rnn_out.squeeze(0)  # [batch, hid_dim]
        if not self.use_attention:
            return self.fc_out(top), hidden, cell, None
        context, attn_weights = self.attention(top, encoder_outputs, mask)
        prediction = self.fc_out(torch.cat((top, context), dim=1))
        return prediction, hidden, cell, attn_weights


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, pad_idx=PAD_IDX, use_attention=False):
        """Store the sub-modules.

        Args:
            encoder: module returning ``(encoder_outputs, hidden, cell)``.
            decoder: single-step decoder module.
            pad_idx: padding id used to build the source mask.
            use_attention: whether to build and pass a source mask.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.use_attention = use_attention

    def make_mask(self, src):
        """Return a ``[batch, src_len]`` boolean mask that is True on non-pad tokens."""
        # src: [src_len, batch]
        return (src != self.pad_idx).permute(1, 0)  # [batch, src_len]

    def forward(self, src, src_lengths, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` given ``src`` and return the per-step vocabulary scores.

        Args:
            src: source ids, ``[src_len, batch]``.
            src_lengths: source lengths forwarded to the encoder.
            trg: target ids, ``[trg_len, batch]``; row 0 is the start token.
            teacher_forcing_ratio: per-step probability of feeding the gold
                token rather than the model's own argmax.

        Returns:
            Tensor ``[trg_len, batch, trg_vocab]``; row 0 stays zero because
            decoding starts at step 1.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.embedding.num_embeddings

        # Allocate on the inputs' device instead of a notebook-level global, so
        # the module works wherever its inputs and parameters live.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)
        encoder_outputs, hidden, cell = self.encoder(src, src_lengths)
        input = trg[0, :]  # first token = <sos>
        mask = self.make_mask(src) if self.use_attention else None

        for t in range(1, trg_len):
            output, hidden, cell, _ = self.decoder(input, hidden, cell, encoder_outputs, mask)
            outputs[t] = output
            # One random draw per step, exactly as before, so the RNG stream is unchanged.
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[t] if teacher_force else output.argmax(1)
        return outputs


In [None]:
# Hyperparameters and model init
EMB_DIM = 300
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.3
USE_ATTENTION = True  # set False to train baseline without attention
BEAM_SIZE = 5  # beam width used by translate(..., use_beam=True)
MAX_LEN = 50  # maximum number of decoding steps at inference
TEACHER_FORCING = 0.5  # teacher-forcing ratio used during training
LR = 1e-3
EPOCHS = 12
PATIENCE = 3  # epochs without validation improvement before early stopping

# Encoder and decoder share the embedding/hidden sizes and layer count.
enc = Encoder(len(en_vocab), EMB_DIM, HID_DIM, n_layers=N_LAYERS, dropout=DROPOUT)
dec = Decoder(len(fr_vocab), EMB_DIM, HID_DIM, n_layers=N_LAYERS, dropout=DROPOUT, use_attention=USE_ATTENTION)
model = Seq2Seq(enc, dec, pad_idx=PAD_IDX, use_attention=USE_ATTENTION).to(device)

# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate when the monitored value stops decreasing (patience 1).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)



In [None]:
def epoch_time(start, end):
    """Return the time between ``start`` and ``end`` as whole ``(minutes, seconds)``."""
    minutes, seconds = divmod(end - start, 60)
    return int(minutes), int(seconds)


def train_one_epoch(model, loader, optimizer, criterion, teacher_forcing=0.5):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        model: called as ``model(src, src_lengths, trg, teacher_forcing_ratio=...)``.
        loader: yields ``(src, trg, src_lengths)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss over flattened scores and flattened targets.
        teacher_forcing: teacher-forcing ratio handed to the model.
    """
    model.train()
    running_loss = 0
    for src, trg, src_lengths in tqdm(loader, desc="Train", leave=False):
        optimizer.zero_grad()
        scores = model(src, src_lengths, trg, teacher_forcing_ratio=teacher_forcing)
        # scores: [trg_len, batch, vocab]; step 0 (the start token) is not scored.
        vocab_size = scores.shape[-1]
        loss = criterion(scores[1:].reshape(-1, vocab_size), trg[1:].reshape(-1))
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)


def evaluate(model, loader, criterion):
    """Return the mean batch loss of ``model`` over ``loader`` without gradient updates.

    The model runs in eval mode with ``teacher_forcing_ratio=0.0``, i.e. it is
    always fed its own previous prediction.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, trg, src_lengths in tqdm(loader, desc="Val", leave=False):
            scores = model(src, src_lengths, trg, teacher_forcing_ratio=0.0)
            vocab_size = scores.shape[-1]
            # Step 0 (the start token) is not scored.
            loss = criterion(scores[1:].reshape(-1, vocab_size), trg[1:].reshape(-1))
            running_loss += loss.item()
    return running_loss / len(loader)



In [None]:
import time

# Early-stopping and bookkeeping state for the run.
best_val = float("inf")
patience_counter = 0
train_losses, val_losses = [], []
best_path = CHECKPOINT_DIR / "best_model.pth"

for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, teacher_forcing=TEACHER_FORCING)
    val_loss = evaluate(model, val_loader, criterion)
    # The LR scheduler is driven by the validation loss.
    scheduler.step(val_loss)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    if val_loss < best_val:
        # New best: reset patience and save the weights together with the
        # configuration values needed to rebuild the same architecture.
        best_val = val_loss
        patience_counter = 0
        torch.save({"model_state": model.state_dict(), "config": {
            "EMB_DIM": EMB_DIM,
            "HID_DIM": HID_DIM,
            "N_LAYERS": N_LAYERS,
            "DROPOUT": DROPOUT,
            "USE_ATTENTION": USE_ATTENTION,
            "EN_VOCAB": len(en_vocab),
            "FR_VOCAB": len(fr_vocab),
            "PAD_IDX": PAD_IDX,
        }}, best_path)
    else:
        patience_counter += 1

    mins, secs = epoch_time(start_time, time.time())
    print(f"Epoch {epoch}/{EPOCHS} | Train {train_loss:.3f} | Val {val_loss:.3f} | Time {mins}m {secs}s")

    # Stop after PATIENCE consecutive epochs without a new best validation loss.
    if patience_counter >= PATIENCE:
        print("Early stopping triggered")
        break

# NOTE(review): `model` still holds the weights of the last epoch that ran; the
# best weights exist only in `best_path` and this cell never loads them back.
# Load the checkpoint before evaluation if the best model is what you want.
print("Best checkpoint:", best_path)



In [None]:
# Plot training curves: one point per completed epoch, train vs. validation loss.
plt.figure(figsize=(6,4))
plt.plot(train_losses, label="train")
plt.plot(val_losses, label="val")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train/Val Loss")
plt.legend()
plt.grid(True)
plt.show()



In [None]:
# Inference helpers

def ids_to_sentence(ids, vocab):
    """Convert token ids to a space-joined string.

    Decoding stops at the first EOS_IDX; SOS_IDX and PAD_IDX are skipped.

    Args:
        ids: iterable of integer token ids.
        vocab: object whose ``get_itos()`` returns the index-to-token list.

    Returns:
        The decoded tokens joined by single spaces ("" when nothing is left).
    """
    # Fetch the index-to-token table once rather than once per token.
    itos = vocab.get_itos()
    tokens = []
    for i in ids:
        if i == EOS_IDX:
            break
        if i not in (SOS_IDX, PAD_IDX):
            tokens.append(itos[i])
    return " ".join(tokens)


def encode_sentence(sentence, tokenizer, vocab):
    """Tokenize ``sentence`` and return its ids, wrapped in SOS/EOS, as a 1-D long tensor on ``device``."""
    tokens = list(tokenizer(sentence))
    body = [vocab[token] for token in tokens]
    return torch.tensor([SOS_IDX, *body, EOS_IDX], dtype=torch.long, device=device)


def greedy_decode(model, sentence, max_len=50):
    """Translate one English sentence by taking the top-scoring token at every step.

    Args:
        model: seq2seq model exposing ``encoder``, ``decoder``, ``make_mask``
            and ``use_attention``.
        sentence: raw English text.
        max_len: maximum number of decoding steps.

    Returns:
        The French output as a space-joined string; generation ends at the
        first EOS_IDX or after ``max_len`` tokens.
    """
    model.eval()
    with torch.no_grad():
        src_tensor = encode_sentence(sentence, tokenize_en, en_vocab).unsqueeze(1)
        src_len = torch.tensor([src_tensor.shape[0]], device=device)
        encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)
        mask = None
        if model.use_attention:
            mask = model.make_mask(src_tensor)

        step_input = torch.tensor([SOS_IDX], device=device)
        generated = []
        for _ in range(max_len):
            scores, hidden, cell, _ = model.decoder(step_input, hidden, cell, encoder_outputs, mask)
            # The argmax is both the emitted token and the next decoder input.
            step_input = scores.argmax(1)
            token_id = step_input.item()
            if token_id == EOS_IDX:
                break
            generated.append(token_id)
        return ids_to_sentence(generated, fr_vocab)


def beam_search_decode(model, sentence, beam_size=5, max_len=50):
    """Translate one English sentence with beam search.

    Hypotheses are scored by their summed token log-probabilities, with no
    length normalization.

    Args:
        model: seq2seq model exposing ``encoder``, ``decoder``, ``make_mask``
            and ``use_attention``.
        sentence: raw English text.
        beam_size: number of live hypotheses kept per step. It is clamped to
            the vocabulary size when expanding, so an oversized value no
            longer makes ``topk`` raise.
        max_len: maximum number of decoding steps.

    Returns:
        The best finished hypothesis as a space-joined string, or the best
        unfinished one if none produced EOS_IDX within ``max_len`` steps.
    """
    model.eval()
    with torch.no_grad():
        src_tensor = encode_sentence(sentence, tokenize_en, en_vocab).unsqueeze(1)
        src_len = torch.tensor([src_tensor.shape[0]], device=device)
        encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)
        mask = model.make_mask(src_tensor) if model.use_attention else None

        # Each live beam: (summed log-prob, token ids so far, hidden, cell).
        beams = [(0.0, [SOS_IDX], hidden, cell)]
        completed = []
        best_completed = float("-inf")
        for _ in range(max_len):
            new_beams = []
            for log_prob, seq, h, c in beams:
                inp = torch.tensor([seq[-1]], device=device)
                output, h_new, c_new, _ = model.decoder(inp, h, c, encoder_outputs, mask)
                log_probs = torch.log_softmax(output, dim=1)
                k = min(beam_size, log_probs.shape[1])
                topk_logp, topk_idx = log_probs.topk(k, dim=1)
                for token_logp, next_token in zip(topk_logp[0].tolist(), topk_idx[0].tolist()):
                    new_log_prob = log_prob + token_logp
                    new_seq = seq + [next_token]
                    if next_token == EOS_IDX:
                        completed.append((new_log_prob, new_seq))
                        best_completed = max(best_completed, new_log_prob)
                    else:
                        new_beams.append((new_log_prob, new_seq, h_new, c_new))
            beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:beam_size]
            if not beams:
                break
            # Log-probabilities are <= 0, so a hypothesis can only lose score as
            # it grows. Once the best finished hypothesis is at least as good
            # as the best live beam, no later completion can outrank it;
            # stopping here returns the same result as running to max_len.
            if best_completed >= beams[0][0]:
                break
        if completed:
            best = max(completed, key=lambda x: x[0])
        else:
            best = max(beams, key=lambda x: x[0])
        return ids_to_sentence(best[1], fr_vocab)


def translate(sentence, use_beam=False):
    """Translate ``sentence`` with the global ``model``.

    Uses beam search (BEAM_SIZE, MAX_LEN) when ``use_beam`` is true, otherwise
    greedy decoding (MAX_LEN).
    """
    if not use_beam:
        return greedy_decode(model, sentence, max_len=MAX_LEN)
    return beam_search_decode(model, sentence, beam_size=BEAM_SIZE, max_len=MAX_LEN)



In [None]:
# BLEU evaluation on test set

def compute_bleu(model, loader, n_samples=None, use_beam=False):
    """Return the mean smoothed sentence-level BLEU of ``model`` over ``loader``.

    Args:
        model: the model to evaluate. It is the model actually used for
            decoding; previously decoding went through ``translate`` and so
            always used the global ``model``.
        loader: yields ``(src, trg, src_lengths)`` batches laid out
            ``[seq_len, batch]``.
        n_samples: if truthy, score at most this many sentences. The limit is
            now applied per sentence rather than per finished batch.
        use_beam: decode with beam search (BEAM_SIZE) instead of greedy search.

    Returns:
        The average of the per-sentence BLEU scores, or 0.0 when nothing was
        scored (previously a ZeroDivisionError).
    """
    model.eval()
    smoothie = SmoothingFunction().method4
    skip_ids = {PAD_IDX, SOS_IDX, EOS_IDX}
    # Fetch the index-to-token tables once rather than once per token.
    en_itos = en_vocab.get_itos()
    fr_itos = fr_vocab.get_itos()
    scores = []
    with torch.no_grad():
        for src, trg, src_lengths in tqdm(loader, desc="Test", leave=False):
            # decode each sentence individually for simplicity
            for src_sent, trg_sent in zip(src.t().tolist(), trg.t().tolist()):
                if n_samples and len(scores) >= n_samples:
                    break
                # Rebuild the English text from ids so the decoders can re-encode it.
                # NOTE(review): out-of-vocabulary words come back as the literal
                # "<unk>" string and are tokenized again — verify that it still
                # maps to UNK_IDX after tokenization.
                src_text = " ".join(en_itos[idx] for idx in src_sent if idx not in skip_ids)
                if use_beam:
                    pred = beam_search_decode(model, src_text, beam_size=BEAM_SIZE, max_len=MAX_LEN)
                else:
                    pred = greedy_decode(model, src_text, max_len=MAX_LEN)
                ref_tokens = [fr_itos[idx] for idx in trg_sent if idx not in skip_ids]
                scores.append(sentence_bleu([ref_tokens], pred.split(), smoothing_function=smoothie))
            if n_samples and len(scores) >= n_samples:
                break
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# Compute BLEU (may take time). Uncomment to run after training.
# bleu_score = compute_bleu(model, test_loader, n_samples=200, use_beam=False)
# print("BLEU (greedy):", bleu_score)



In [None]:
# Quick demo translations (run after training/loading checkpoint)
# Five English example sentences for the commented-out demo loop below.
sentences = [
    "a man is riding a bicycle",
    "children are playing in the park",
    "a woman is sitting at a table",
    "two dogs are running on the beach",
    "the man holds a red umbrella",
]

# for s in sentences:
#     print("EN:", s)
#     print("FR (greedy):", translate(s, use_beam=False))
#     print("FR (beam):", translate(s, use_beam=True))
#     print("---")

