In [5]:
!pip install pyvi spacy
!python -m spacy download en_core_web_sm

import os
import torch
import torch.nn as nn
import math
import time
import tarfile
import html
import random
import numpy as np
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from google.colab import drive

from pyvi import ViTokenizer
import spacy

spacy_en = spacy.load('en_core_web_sm')

if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

PROJECT_PATH = '/content/drive/MyDrive/NLP_Phase6_Final'
CHECKPOINT_PATH = os.path.join(PROJECT_PATH, 'checkpoints')
DATA_SAVE_PATH = os.path.join(PROJECT_PATH, 'data')

if not os.path.exists(CHECKPOINT_PATH): os.makedirs(CHECKPOINT_PATH)
if not os.path.exists(DATA_SAVE_PATH): os.makedirs(DATA_SAVE_PATH)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on: {DEVICE}")

def download_and_clean():
    url = "https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz"
    tgz_path = os.path.join(DATA_SAVE_PATH, "train-en-vi.tgz")

    if not os.path.exists(os.path.join(DATA_SAVE_PATH, "train.vi")):
        print("Downloading data...")
        os.system(f"wget -q {url} -O {tgz_path}")
        with tarfile.open(tgz_path, "r:gz") as tar:
            tar.extractall(path=DATA_SAVE_PATH)

    def clean_data(path):
        with open(path, 'r', encoding='utf-8') as f:
            return [html.unescape(line).replace('\xa0', ' ').strip() for line in f]

    src_lines = clean_data(os.path.join(DATA_SAVE_PATH, 'train.vi'))
    trg_lines = clean_data(os.path.join(DATA_SAVE_PATH, 'train.en'))
    return src_lines, trg_lines

src_raw, trg_raw = download_and_clean()
print(f"Loaded {len(src_raw)} sentences.")

def tokenize_vi(text):
    return ViTokenizer.tokenize(text).lower().split()

def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_en.tokenizer(text)]
# -----------------------------

class Vocabulary:
    def __init__(self, freq_threshold=2, lang='vi'):
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.freq_threshold = freq_threshold
        self.lang = lang

    def __len__(self): return len(self.itos)

    def build_vocabulary(self, sentence_list):
        frequencies = Counter()
        for sentence in sentence_list:
            if self.lang == 'vi':
                tokens = tokenize_vi(sentence)
            else:
                tokens = tokenize_en(sentence)
            frequencies.update(tokens)

        idx = 4
        for word, freq in frequencies.items():
            if freq >= self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1

    def numericalize(self, text):
        if self.lang == 'vi':
            tokens = tokenize_vi(text)
        else:
            tokens = tokenize_en(text)
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in tokens]

src_vocab = Vocabulary(freq_threshold=2, lang='vi')
trg_vocab = Vocabulary(freq_threshold=2, lang='en')
src_vocab.build_vocabulary(src_raw)
trg_vocab.build_vocabulary(trg_raw)
print(f"Vocab Size: Vi={len(src_vocab)}, En={len(trg_vocab)}")

class TranslationDataset(Dataset):
    def __init__(self, src_lines, trg_lines, src_vocab, trg_vocab, max_len=100):
        self.src_lines = src_lines
        self.trg_lines = trg_lines
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.max_len = max_len

    def __len__(self): return len(self.src_lines)

    def __getitem__(self, idx):
        src_idx = [1] + self.src_vocab.numericalize(self.src_lines[idx]) + [2]
        trg_idx = [1] + self.trg_vocab.numericalize(self.trg_lines[idx]) + [2]
        return torch.tensor(src_idx[:self.max_len]), torch.tensor(trg_idx[:self.max_len])

def collate_fn(batch):
    src_batch, trg_batch = [], []
    for src, trg in batch:
        src_batch.append(src)
        trg_batch.append(trg)
    src_pad = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_pad = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=0, batch_first=True)
    return src_pad, trg_pad

train_loader = DataLoader(TranslationDataset(src_raw, trg_raw, src_vocab, trg_vocab),
                          batch_size=32, shuffle=True, collate_fn=collate_fn)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Running on: cuda
Loaded 133317 sentences.
Vocab Size: Vi=20724, En=28162


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.d_head = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]
        Q = self.w_q(query).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        K = self.w_k(key).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        V = self.w_v(value).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.d_head)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e9)

        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.fc_out(x)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        _src = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(_src))
        _src = self.ffn(src)
        src = self.norm2(src + self.dropout(_src))
        return src

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.cross_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg = self.self_attn(trg, trg, trg, trg_mask)
        trg = self.norm1(trg + self.dropout(_trg))
        _trg = self.cross_attn(trg, enc_src, enc_src, src_mask)
        trg = self.norm2(trg + self.dropout(_trg))
        _trg = self.ffn(trg)
        trg = self.norm3(trg + self.dropout(_trg))
        return trg

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, n_head=8, n_layer=3, d_ff=512, dropout=0.1, max_len=150):
        super(Transformer, self).__init__()
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)])

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = 0
        self.trg_pad_idx = 0

    def make_src_mask(self, src):
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()
        return trg_pad_mask & trg_sub_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        src = self.dropout(self.pos_encoding(self.src_embedding(src)))
        for layer in self.encoder:
            src = layer(src, src_mask)

        trg = self.dropout(self.pos_encoding(self.trg_embedding(trg)))
        for layer in self.decoder:
            trg = layer(trg, src, trg_mask, src_mask)

        return self.fc_out(trg)

In [11]:
import torch
import os

# 1. Khởi tạo lại Model và các thành phần y như cũ
D_MODEL = 256
model = Transformer(len(src_vocab), len(trg_vocab), d_model=D_MODEL, n_head=8, n_layer=3).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
scheduler = NoamScheduler(optimizer, d_model=D_MODEL, warmup_steps=4000)
criterion = LabelSmoothingLoss(len(trg_vocab), padding_idx=0, smoothing=0.1)

# 2. Đường dẫn file save
save_path = os.path.join(CHECKPOINT_PATH, 'phase6_safe_best.pt')

# 3. Load Checkpoint
if os.path.exists(save_path):
    print(f"--> Đang load checkpoint từ: {save_path}")
    state_dict = torch.load(save_path, map_location=DEVICE)
    model.load_state_dict(state_dict)
    print("--> Load model thành công!")
else:
    print("--> KHÔNG TÌM THẤY CHECKPOINT! Kiểm tra lại đường dẫn.")

START_EPOCH = 20
TOTAL_EPOCHS = 25


steps_finished = START_EPOCH * len(train_loader)
scheduler.step_num = steps_finished
print(f"--> Đã khôi phục Scheduler về step thứ: {steps_finished}")

print(f"Tiếp tục Training Phase 6 từ Epoch {START_EPOCH + 1}...")

for epoch in range(START_EPOCH, TOTAL_EPOCHS):
    model.train()
    epoch_loss = 0
    start_time = time.time()

    for src, trg in train_loader:
        src, trg = src.to(DEVICE), trg.to(DEVICE)

        trg_input = trg[:, :-1]
        trg_output = trg[:, 1:]

        optimizer.zero_grad()
        output = model(src, trg_input)

        output = output.contiguous().view(-1, output.shape[-1])
        trg_output = trg_output.contiguous().view(-1)

        loss = criterion(output, trg_output)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1:02} | Time: {(time.time()-start_time)/60:.1f}m | Loss: {epoch_loss/len(train_loader):.4f}")

    new_save_path = os.path.join(CHECKPOINT_PATH, 'phase6_epoch25.pt') # Tên file mới
    torch.save(model.state_dict(), new_save_path)
    print(f"--> Đã lưu checkpoint epoch {epoch+1} vào file mới")

--> Đang load checkpoint từ: /content/drive/MyDrive/NLP_Phase6_Final/checkpoints/phase6_safe_best.pt
--> Load model thành công!
--> Đã khôi phục Scheduler về step thứ: 83340
Tiếp tục Training Phase 6 từ Epoch 21...
Epoch 21 | Time: 6.2m | Loss: 1.3023
--> Đã lưu checkpoint epoch 21 vào file mới
Epoch 22 | Time: 6.2m | Loss: 1.2886
--> Đã lưu checkpoint epoch 22 vào file mới
Epoch 23 | Time: 6.1m | Loss: 1.2886
--> Đã lưu checkpoint epoch 23 vào file mới


KeyboardInterrupt: 

In [12]:
def beam_search_decode(model, src_sentence, src_vocab, trg_vocab, device, beam_width=3, max_len=50):
    model.eval()

    tokens = [1] + src_vocab.numericalize(src_sentence) + [2]
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        src_emb = model.dropout(model.pos_encoding(model.src_embedding(src_tensor)))
        enc_src = src_emb
        for layer in model.encoder:
            enc_src = layer(enc_src, src_mask)

    sequences = [([1], 0.0, False)]

    for _ in range(max_len):
        all_candidates = []

        for seq, score, finished in sequences:
            if finished:
                all_candidates.append((seq, score, True))
                continue

            trg_tensor = torch.LongTensor(seq).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)

            with torch.no_grad():
                trg_emb = model.dropout(model.pos_encoding(model.trg_embedding(trg_tensor)))
                out = trg_emb
                for layer in model.decoder:
                    out = layer(out, enc_src, trg_mask, src_mask)
                output = model.fc_out(out)

            prob = output[:, -1, :].log_softmax(dim=-1)

            topk_probs, topk_idxs = prob.topk(beam_width)

            for i in range(beam_width):
                token = topk_idxs[0][i].item()
                token_prob = topk_probs[0][i].item()

                new_seq = seq + [token]
                new_score = score + token_prob

                if token == 2: # 2 là <eos>
                    all_candidates.append((new_seq, new_score, True))
                else:
                    all_candidates.append((new_seq, new_score, False))

        ordered = sorted(all_candidates, key=lambda x: x[1] / (len(x[0])**0.7), reverse=True)
        sequences = ordered[:beam_width]

        if all([s[2] for s in sequences]):
            break

    best_seq = sequences[0][0]

    words = []
    for idx in best_seq:
        if idx not in [0, 1, 2, 3]:
            words.append(trg_vocab.itos[idx])

    decoded_sentence = " ".join(words)

    decoded_sentence = decoded_sentence.replace(" .", ".").replace(" ,", ",").replace(" ?", "?")

    return decoded_sentence

print("\n" + "="*50)
print("  KẾT QUẢ DỊCH THỬ NGHIỆM (BEAM SEARCH WIDTH=3)")
print("="*50)

test_sentences = [
    "Tôi là sinh viên",
    "Hôm nay trời đẹp",
    "Bạn có thích học lập trình không?",
    "Trí tuệ nhân tạo rất thú vị.",
    "Cảm ơn bạn đã giúp đỡ tôi.",
    "Tôi muốn đi du lịch."
]

model.eval()
for sent in test_sentences:
    pred = beam_search_decode(model, sent, src_vocab, trg_vocab, DEVICE, beam_width=3)
    print(f"Input : {sent}")
    print(f"Output: {pred}")
    print("-" * 50)


  KẾT QUẢ DỊCH THỬ NGHIỆM (BEAM SEARCH WIDTH=3)
Input : Tôi là sinh viên
Output: i ' m a student.
--------------------------------------------------
Input : Hôm nay trời đẹp
Output: this is beautiful today.
--------------------------------------------------
Input : Bạn có thích học lập trình không?
Output: do you like to learn to code?
--------------------------------------------------
Input : Trí tuệ nhân tạo rất thú vị.
Output: artificial intelligence is very interesting.
--------------------------------------------------
Input : Cảm ơn bạn đã giúp đỡ tôi.
Output: thank you for helping me.
--------------------------------------------------
Input : Tôi muốn đi du lịch.
Output: i want to travel.
--------------------------------------------------


In [14]:
import torch
import os
import sacrebleu
from tqdm import tqdm
import tarfile
import html

checkpoints_to_test = [
    {
        "name": "Model Epoch 20 (Safe Best)",
        "path": "/content/drive/MyDrive/NLP_Phase6_Final/checkpoints/phase6_safe_best.pt"
    },
    {
        "name": "Model Epoch 25 (Latest)",
        "path": "/content/drive/MyDrive/NLP_Phase6_Final/checkpoints/phase6_epoch25.pt"
    }
]

if not os.path.exists('tst2013.vi'):
    print("Downloading Test Data...")
    os.system("wget -q https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz -O test.tgz")
    with tarfile.open("test.tgz", "r:gz") as tar: tar.extractall()

def clean_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [html.unescape(line).strip() for line in f]

test_src = clean_file('tst2013.vi')
test_ref = clean_file('tst2013.en')

sample_sentences = [
    "Tôi là sinh viên",
    "Trí tuệ nhân tạo rất thú vị.",
    "Cảm ơn bạn đã giúp đỡ tôi."
]

print("="*60)
print(f"BẮT ĐẦU SO SÁNH {len(checkpoints_to_test)} MODEL")
print("="*60)

for ckpt in checkpoints_to_test:
    ckpt_name = ckpt["name"]
    ckpt_path = ckpt["path"]

    print(f"\n>>> Đang load: {ckpt_name}")

    if not os.path.exists(ckpt_path):
        print(f" Lỗi: Không tìm thấy file tại {ckpt_path}")
        continue

    try:
        state_dict = torch.load(ckpt_path, map_location=DEVICE)
        model.load_state_dict(state_dict)
        model.eval() # Chuyển sang chế độ đánh giá
    except Exception as e:
        print(f" Lỗi khi load model: {e}")
        continue

    print(f"--- Dịch thử mẫu ({ckpt_name}) ---")
    for sent in sample_sentences:
        pred = beam_search_decode(model, sent, src_vocab, trg_vocab, DEVICE, beam_width=3)
        print(f"   Input: {sent}")
        print(f"   Pred : {pred}")

    print(f"--- Đang tính BLEU Score trên {len(test_src)} câu... ---")
    hypotheses = []
    # Dùng tqdm để hiện thanh tiến trình
    for sent in tqdm(test_src, desc=f"Evaluating {ckpt_name}"):
        pred = beam_search_decode(model, sent, src_vocab, trg_vocab, DEVICE, beam_width=3)
        hypotheses.append(pred)

    bleu = sacrebleu.corpus_bleu(hypotheses, [test_ref], tokenize='13a')

    print("-" * 40)
    print(f" KẾT QUẢ {ckpt_name.upper()}")
    print(f" BLEU Score: {bleu.score:.2f}")
    print("-" * 40)

print("\n=== HOÀN TẤT SO SÁNH ===")

BẮT ĐẦU SO SÁNH 2 MODEL

>>> Đang load: Model Epoch 20 (Safe Best)
--- Dịch thử mẫu (Model Epoch 20 (Safe Best)) ---
   Input: Tôi là sinh viên
   Pred : i ' m a student.
   Input: Trí tuệ nhân tạo rất thú vị.
   Pred : artificial intelligence is very interesting.
   Input: Cảm ơn bạn đã giúp đỡ tôi.
   Pred : thank you for helping me.
--- Đang tính BLEU Score trên 1268 câu... ---


Evaluating Model Epoch 20 (Safe Best): 100%|██████████| 1268/1268 [05:51<00:00,  3.61it/s]


----------------------------------------
 KẾT QUẢ MODEL EPOCH 20 (SAFE BEST)
 BLEU Score: 15.99
----------------------------------------

>>> Đang load: Model Epoch 25 (Latest)
--- Dịch thử mẫu (Model Epoch 25 (Latest)) ---
   Input: Tôi là sinh viên
   Pred : i ' m students.
   Input: Trí tuệ nhân tạo rất thú vị.
   Pred : artificial intelligence is interesting.
   Input: Cảm ơn bạn đã giúp đỡ tôi.
   Pred : thank you for helping me.
--- Đang tính BLEU Score trên 1268 câu... ---


Evaluating Model Epoch 25 (Latest): 100%|██████████| 1268/1268 [05:58<00:00,  3.54it/s]


----------------------------------------
 KẾT QUẢ MODEL EPOCH 25 (LATEST)
 BLEU Score: 16.18
----------------------------------------

=== HOÀN TẤT SO SÁNH ===
