In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import math
import time
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu
import os

# -------------------------
# 1. Tokenizer & Vocab
# -------------------------
class Tokenizer:
    def __init__(self, texts, min_freq=2):
        self.word2idx = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
        self.idx2word = {0:"<pad>", 1:"<sos>", 2:"<eos>", 3:"<unk>"}
        self.build_vocab(texts, min_freq)
    
    def build_vocab(self, texts, min_freq):
        counter = Counter()
        for line in texts:
            tokens = line.strip().lower().split()
            counter.update(tokens)
        for word, freq in counter.items():
            if freq >= min_freq and word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word
    
    def encode(self, text):
        tokens = text.strip().lower().split()
        return [self.word2idx.get(tok, self.word2idx["<unk>"]) for tok in tokens]
    
    def decode(self, ids):
        return " ".join([self.idx2word.get(i, "<unk>") for i in ids])

# -------------------------
# 2. Dataset
# -------------------------
class TranslationDataset(Dataset):
    def __init__(self, src_texts, trg_texts, src_tokenizer, trg_tokenizer):
        self.src_texts = src_texts
        self.trg_texts = trg_texts
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
    
    def __len__(self):
        return len(self.src_texts)
    
    def __getitem__(self, idx):
        src = [self.src_tokenizer.word2idx["<sos>"]] + self.src_tokenizer.encode(self.src_texts[idx]) + [self.src_tokenizer.word2idx["<eos>"]]
        trg = [self.trg_tokenizer.word2idx["<sos>"]] + self.trg_tokenizer.encode(self.trg_texts[idx]) + [self.trg_tokenizer.word2idx["<eos>"]]
        return torch.tensor(src, dtype=torch.long), torch.tensor(trg, dtype=torch.long)

def collate_fn(batch, pad_idx=0):
    src_batch, trg_batch = zip(*batch)
    src_batch = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=pad_idx)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, batch_first=True, padding_value=pad_idx)
    return src_batch, trg_batch

# -------------------------
# 3. Positional Encoding
# -------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

# -------------------------
# 4. Transformer Model
# -------------------------
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, nhead=4, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.trg_embed = nn.Embedding(trg_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
    
    def forward(self, src, trg):
        src_emb = self.pos_enc(self.src_embed(src))
        trg_emb = self.pos_enc(self.trg_embed(trg))
        out = self.transformer(src_emb, trg_emb, 
                               src_key_padding_mask=(src==0), 
                               tgt_key_padding_mask=(trg==0), 
                               memory_key_padding_mask=(src==0))
        return self.fc_out(out)

# -------------------------
# 5. Train function
# -------------------------
def train_model(model, train_loader, val_loader, optimizer, criterion, pad_idx, device, epochs=10, save_path="best_model.pt"):
    model.to(device)
    scaler = torch.amp.GradScaler(device=device)
    best_val_loss = float("inf")
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        start_time = time.time()
        for src, trg in train_loader:
            src, trg = src.to(device), trg.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device.type):
                output = model(src, trg[:, :-1])
                loss = criterion(output.reshape(-1, output.size(-1)), trg[:, 1:].reshape(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        
        # validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for src, trg in val_loader:
                src, trg = src.to(device), trg.to(device)
                output = model(src, trg[:, :-1])
                loss = criterion(output.reshape(-1, output.size(-1)), trg[:, 1:].reshape(-1))
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Time: {time.time()-start_time:.2f}s")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            print("Model saved!")

# -------------------------
# 6. Translate single sentence
# -------------------------
def translate_sentence(model, src_sentence, src_tokenizer, trg_tokenizer, device, max_len=50):
    model.eval()
    src_ids = [src_tokenizer.word2idx["<sos>"]] + src_tokenizer.encode(src_sentence) + [src_tokenizer.word2idx["<eos>"]]
    src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
    trg_ids = [trg_tokenizer.word2idx["<sos>"]]

    for _ in range(max_len):
        trg_tensor = torch.tensor(trg_ids, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
        next_token = output[0, -1].argmax().item()
        trg_ids.append(next_token)
        if next_token == trg_tokenizer.word2idx["<eos>"]:
            break

    return trg_tokenizer.decode(trg_ids[1:-1])

# -------------------------
# 7. Evaluate on test set
# -------------------------
def evaluate_test_set(model, test_src, test_trg, src_tokenizer, trg_tokenizer, device, max_len=50, num_samples=None):
    model.eval()
    total_bleu = 0
    n = len(test_src) if num_samples is None else min(len(test_src), num_samples)
    for src_sentence, trg_sentence in zip(test_src[:n], test_trg[:n]):
        pred_sentence = translate_sentence(model, src_sentence, src_tokenizer, trg_tokenizer, device, max_len)
        bleu = sentence_bleu([trg_sentence.strip().lower().split()], pred_sentence.strip().lower().split())
        total_bleu += bleu
        print(f"Source: {src_sentence.strip()}")
        print(f"Target: {trg_sentence.strip()}")
        print(f"Predicted: {pred_sentence}")
        print(f"BLEU: {bleu:.4f}\n")
    avg_bleu = total_bleu / n
    print(f"Average BLEU score on test set: {avg_bleu:.4f}")

# -------------------------
# 8. Load data & prepare loaders
# -------------------------
with open("/kaggle/input/en-vi-ds/data/train.en", "r", encoding="utf-8") as f:
    train_en = f.readlines()
with open("/kaggle/input/en-vi-ds/data/train.vi", "r", encoding="utf-8") as f:
    train_vi = f.readlines()
with open("/kaggle/input/en-vi-ds/data/tst2013.en", "r", encoding="utf-8") as f:
    test_en = f.readlines()
with open("/kaggle/input/en-vi-ds/data/tst2013.vi", "r", encoding="utf-8") as f:
    test_vi = f.readlines()

src_tokenizer = Tokenizer(train_en)
trg_tokenizer = Tokenizer(train_vi)

dataset = TranslationDataset(train_en, train_vi, src_tokenizer, trg_tokenizer)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=lambda b: collate_fn(b, pad_idx=0))
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=lambda b: collate_fn(b, pad_idx=0))

# -------------------------
# 9. Train model
# -------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(len(src_tokenizer.word2idx), len(trg_tokenizer.word2idx))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)




In [2]:

train_model(model, train_loader, val_loader, optimizer, criterion, pad_idx=0, device=device, epochs=12, save_path="best_model.pt")


  output = torch._nested_tensor_from_mask(


Epoch 1/12 | Train Loss: 2.5535 | Val Loss: 0.7299 | Time: 202.64s
Model saved!
Epoch 2/12 | Train Loss: 0.5657 | Val Loss: 0.3120 | Time: 202.60s
Model saved!
Epoch 3/12 | Train Loss: 0.2891 | Val Loss: 0.1931 | Time: 202.62s
Model saved!
Epoch 4/12 | Train Loss: 0.1778 | Val Loss: 0.1366 | Time: 203.16s
Model saved!
Epoch 5/12 | Train Loss: 0.1177 | Val Loss: 0.1083 | Time: 201.94s
Model saved!
Epoch 6/12 | Train Loss: 0.0816 | Val Loss: 0.0905 | Time: 199.86s
Model saved!
Epoch 7/12 | Train Loss: 0.0593 | Val Loss: 0.0794 | Time: 197.88s
Model saved!
Epoch 8/12 | Train Loss: 0.0442 | Val Loss: 0.0692 | Time: 198.72s
Model saved!
Epoch 9/12 | Train Loss: 0.0343 | Val Loss: 0.0637 | Time: 199.60s
Model saved!
Epoch 10/12 | Train Loss: 0.0270 | Val Loss: 0.0598 | Time: 199.36s
Model saved!
Epoch 11/12 | Train Loss: 0.0221 | Val Loss: 0.0548 | Time: 201.27s
Model saved!
Epoch 12/12 | Train Loss: 0.0178 | Val Loss: 0.0524 | Time: 202.66s
Model saved!


In [3]:
# -------------------------
# 10. Load best model & evaluate on test
# -------------------------
model.load_state_dict(torch.load("best_model.pt"))
evaluate_test_set(model, test_en, test_vi, src_tokenizer, trg_tokenizer, device, num_samples=50)  # đánh giá 50 câu đầu để nhanh


Source: When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;
Target: Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;
Predicted: 7.200
BLEU: 0.0000

Source: And I was very proud .
Target: Tôi đã rất tự hào về đất nước tôi .
Predicted: 
BLEU: 0.0000

Source: In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about the outside world , except that America , South Korea , Japan are the enemies .
Target: Ở trường , chúng tôi dành rất nhiều thời gian để học về cuộc đời của chủ tịch Kim II- Sung , nhưng lại không học nhiều về thế giới bên ngoài , ngoại trừ việc Hoa Kỳ , Hàn Quốc và Nhật Bản là kẻ thù của chúng tôi .
Predicted: 
BLEU: 0.0000

Source: Although I often wondered about the outside world , I thought I would spend my entire life in North Korea , unti

In [4]:
sentence = "I love machine learning."
pred = translate_sentence(model, sentence, src_tokenizer, trg_tokenizer, device)
print(pred)





In [5]:
# -------------------------
# 11. Kiểm tra với câu mới
# -------------------------
example_sentences = [
    "I love programming in Python.",
    "The weather today is really nice.",
    "Machine learning is amazing."
]

for sent in example_sentences:
    translation = translate_sentence(model, sent, src_tokenizer, trg_tokenizer, device)
    print(f"Input: {sent}")
    print(f"Predicted translation: {translation}\n")


Input: I love programming in Python.
Predicted translation: 

Input: The weather today is really nice.
Predicted translation: 

Input: Machine learning is amazing.
Predicted translation: 7.200

