In [1]:
import re
import os
import time
import math
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from nltk.translate.bleu_score import sentence_bleu
train_en_path = "/kaggle/input/en-vi-ds/data/train.en"
train_vi_path = "/kaggle/input/en-vi-ds/data/train.vi"
test_en_path  = "/kaggle/input/en-vi-ds/data/tst2013.en"
test_vi_path  = "/kaggle/input/en-vi-ds/data/tst2013.vi"

def load_file(path):
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.readlines()]

train_en = load_file(train_en_path)
train_vi = load_file(train_vi_path)
test_en  = load_file(test_en_path)
test_vi  = load_file(test_vi_path)

print("TRAIN EN lines:", len(train_en))
print("TRAIN VI lines:", len(train_vi))
print("TEST  EN lines:", len(test_en))
print("TEST  VI lines:", len(test_vi))

if len(train_en) != len(train_vi):
    print("❌ Lỗi: train.en và train.vi không khớp số dòng!")
else:
    print("✅ Train dataset khớp số dòng!")

if len(test_en) != len(test_vi):
    print("❌ Lỗi: test.en và test.vi không khớp số dòng!")
else:
    print("✅ Test dataset khớp số dòng!")

# In thử 3 cặp đầu tiên để xem có đúng là song ngữ
print("\n--- SAMPLE TRAIN PAIRS ---")
for i in range(3):
    print(f"[EN] {train_en[i]}")
    print(f"[VI] {train_vi[i]}")
    print()

print("\n--- SAMPLE TEST PAIRS ---")
for i in range(3):
    print(f"[EN] {test_en[i]}")
    print(f"[VI] {test_vi[i]}")
    print()

TRAIN EN lines: 133317
TRAIN VI lines: 133317
TEST  EN lines: 1268
TEST  VI lines: 1268
✅ Train dataset khớp số dòng!
✅ Test dataset khớp số dòng!

--- SAMPLE TRAIN PAIRS ---
[EN] Rachel Pike : The science behind a climate headline
[VI] Khoa học đằng sau một tiêu đề về khí hậu

[EN] In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .
[VI] Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .

[EN] I &apos;d like to talk to you today about the scale of the scientific effort that goes into mak

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import math
import time
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu
import os

In [3]:
class Tokenizer:
    def __init__(self, texts, min_freq=2):
        self.word2idx = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
        self.idx2word = {v:k for k,v in self.word2idx.items()}
        self.build_vocab(texts, min_freq)

    def build_vocab(self, texts, min_freq):
        counter = Counter()
        for line in texts:
            counter.update(line.lower().strip().split())
        for word, freq in counter.items():
            if freq >= min_freq:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

    def encode(self, text):
        return [self.word2idx.get(tok, 3) for tok in text.lower().strip().split()]

    def decode(self, ids):
        words = []
        for i in ids:
            if i == 2: break      # stop at <eos>
            if i > 3: words.append(self.idx2word.get(i, "<unk>"))
        return " ".join(words)

In [4]:
class TranslationDataset(Dataset):
    def __init__(self, src, trg, tok_src, tok_trg):
        self.src = src
        self.trg = trg
        self.tok_src = tok_src
        self.tok_trg = tok_trg

    def __len__(self): return len(self.src)

    def __getitem__(self, idx):
        s = [1] + self.tok_src.encode(self.src[idx]) + [2]   # <sos> ... <eos>
        t = [1] + self.tok_trg.encode(self.trg[idx]) + [2]
        return torch.tensor(s), torch.tensor(t)


In [5]:
def collate_fn(batch, pad_idx=0):
    src, trg = zip(*batch)
    src = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=pad_idx)
    trg = nn.utils.rnn.pad_sequence(trg, batch_first=True, padding_value=pad_idx)
    return src, trg


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000)/d_model))
        pe[:,0::2] = torch.sin(pos*div)
        pe[:,1::2] = torch.cos(pos*div)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)


In [7]:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model=256, nhead=4, num_layers=3):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model)
        self.trg_emb = nn.Embedding(trg_vocab, d_model)
        self.pos = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg):
        src = self.pos(self.src_emb(src))
        trg = self.pos(self.trg_emb(trg))
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(trg.size(1)).to(trg.device)
        out = self.transformer(src, trg, tgt_mask=tgt_mask)
        return self.fc(out)


In [8]:
def train_model(model, train_loader, val_loader, opt, loss_fn, device, epochs=10):
    best = 999
    for ep in range(epochs):
        model.train()
        total = 0
        for src, trg in train_loader:
            src, trg = src.to(device), trg.to(device)
            opt.zero_grad()
            out = model(src, trg[:,:-1])
            loss = loss_fn(out.reshape(-1, out.size(-1)), trg[:,1:].reshape(-1))
            loss.backward()
            opt.step()
            total += loss.item()

        # VAL
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for src, trg in val_loader:
                src, trg = src.to(device), trg.to(device)
                out = model(src, trg[:,:-1])
                loss = loss_fn(out.reshape(-1, out.size(-1)), trg[:,1:].reshape(-1))
                val_loss += loss.item()

        print(f"Epoch {ep+1}/{epochs} | Train {total/len(train_loader):.4f} | Val {val_loss/len(val_loader):.4f}")

        if val_loss < best:
            best = val_loss
            torch.save(model.state_dict(), "best_model.pt")
            print("✔ Saved best model!")


In [9]:
def translate(model, text, tok_src, tok_trg, device, max_len=60):
    model.eval()
    src = [1] + tok_src.encode(text) + [2]
    src = torch.tensor(src).unsqueeze(0).to(device)

    trg = torch.tensor([[1]]).to(device)  # <sos>

    for _ in range(max_len):
        out = model(src, trg)
        next_tok = out[0, -1].argmax().item()
        trg = torch.cat([trg, torch.tensor([[next_tok]]).to(device)], dim=1)
        if next_tok == 2: break

    return tok_trg.decode(trg[0].tolist()[1:])


In [10]:
def evaluate_test_set(model, test_en, test_vi, tok_src, tok_trg, device, n=50):
    model.eval()
    total_bleu = 0
    for i in range(n):
        pred = translate(model, test_en[i], tok_src, tok_trg, device)
        bleu = sentence_bleu([test_vi[i].split()], pred.split())
        total_bleu += bleu
        print(f"\nEN: {test_en[i]}")
        print(f"GT: {test_vi[i]}")
        print(f"PR: {pred}")
        print(f"BLEU: {bleu:.4f}")

    print("\nAVERAGE BLEU =", total_bleu/n)


In [11]:
train_en = open("/kaggle/input/en-vi-ds/data/train.en").read().splitlines()
train_vi = open("/kaggle/input/en-vi-ds/data/train.vi").read().splitlines()
test_en  = open("/kaggle/input/en-vi-ds/data/tst2013.en").read().splitlines()
test_vi  = open("/kaggle/input/en-vi-ds/data/tst2013.vi").read().splitlines()


In [12]:
tok_src = Tokenizer(train_en)
tok_trg = Tokenizer(train_vi)

dataset = TranslationDataset(train_en, train_vi, tok_src, tok_trg)

train_len = int(0.9 * len(dataset))
val_len = len(dataset) - train_len
train_set, val_set = random_split(dataset, [train_len, val_len])

train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_set, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(len(tok_src.word2idx), len(tok_trg.word2idx)).to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

train_model(model, train_loader, val_loader, opt, loss_fn, device, epochs=20)


Epoch 1/20 | Train 4.4170 | Val 3.6909
Epoch 2/20 | Train 3.5186 | Val 3.2390
Epoch 3/20 | Train 3.1379 | Val 2.9518
Epoch 4/20 | Train 2.8715 | Val 2.7651
Epoch 5/20 | Train 2.6742 | Val 2.6175
Epoch 6/20 | Train 2.5227 | Val 2.5203
Epoch 7/20 | Train 2.4008 | Val 2.4464
Epoch 8/20 | Train 2.3007 | Val 2.3852
Epoch 9/20 | Train 2.2158 | Val 2.3500
Epoch 10/20 | Train 2.1440 | Val 2.3147
Epoch 11/20 | Train 2.0816 | Val 2.2908
Epoch 12/20 | Train 2.0252 | Val 2.2702
Epoch 13/20 | Train 1.9762 | Val 2.2517
Epoch 14/20 | Train 1.9317 | Val 2.2364
Epoch 15/20 | Train 1.8905 | Val 2.2258
Epoch 16/20 | Train 1.8552 | Val 2.2108
Epoch 17/20 | Train 1.8204 | Val 2.2111
Epoch 18/20 | Train 1.7893 | Val 2.2061
Epoch 19/20 | Train 1.7594 | Val 2.2050
Epoch 20/20 | Train 1.7332 | Val 2.1998


In [14]:
print(translate(model, "I love machine learning.", tok_src, tok_trg, device))
print(translate(model, "Today the weather is nice.", tok_src, tok_trg, device))
print(translate(model, "He is a very smart engineer.", tok_src, tok_trg, device))


tôi thích máy .
ngày nay thời tiết là thời tiết kiệm thời gian rảnh rỗi .
anh ta là một người rất thông minh .
