In [None]:
import torch
import torch.nn as nn
import math
import os
import html
from google.colab import drive

if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

PROJECT_PATH = '/content/drive/MyDrive/NLP_Assignment_2025(phase2)'
DATA_PATH = os.path.join(PROJECT_PATH, 'data')
CHECKPOINT_PATH = os.path.join(PROJECT_PATH, 'checkpoints')
MODEL_FILE = os.path.join(CHECKPOINT_PATH, 'transformer_phase2_best.pt')

print(f"Checking model path: {MODEL_FILE}")
if not os.path.exists(MODEL_FILE):
    raise FileNotFoundError("Chưa tìm thấy file model Phase 2. Hãy kiểm tra lại đường dẫn!")
else:
    print("--> Đã tìm thấy Model Phase 2!")

Checking model path: /content/drive/MyDrive/NLP_Assignment_2025(phase2)/checkpoints/transformer_phase2_best.pt
--> Đã tìm thấy Model Phase 2!


In [2]:
# --- 1. CLEAN TEXT FUNCTION ---
def clean_text(text):
    text = html.unescape(text)
    text = text.replace('\xa0', ' ')
    return text.strip()

# --- 2. VOCABULARY ---
class Vocabulary:
    def __init__(self, freq_threshold=2):
        self.itos = {0: "<unk>", 1: "<pad>", 2: "<sos>", 3: "<eos>"}
        self.stoi = {"<unk>": 0, "<pad>": 1, "<sos>": 2, "<eos>": 3}
        self.freq_threshold = freq_threshold
    def __len__(self): return len(self.itos)
    def numericalize(self, text):
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in text.lower().strip().split()]

# --- 3. MODEL ARCHITECTURE ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.d_head = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)
    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]
        Q = self.w_q(query).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        K = self.w_k(key).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        V = self.w_v(value).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.d_head)
        if mask is not None: energy = energy.masked_fill(mask == 0, -1e9)
        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.fc_out(x)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    def forward(self, x): return self.fc2(self.relu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        _src = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(_src))
        _src = self.ffn(src)
        src = self.norm2(src + self.dropout(_src))
        return src

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.cross_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg = self.self_attn(trg, trg, trg, trg_mask)
        trg = self.norm1(trg + self.dropout(_trg))
        _trg = self.cross_attn(trg, enc_src, enc_src, src_mask)
        trg = self.norm2(trg + self.dropout(_trg))
        _trg = self.ffn(trg)
        trg = self.norm3(trg + self.dropout(_trg))
        return trg

class Encoder(nn.Module):
    def __init__(self, input_dim, d_model, n_layer, n_head, d_ff, dropout, max_len):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)])
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        src = self.dropout(self.pos_encoding(self.embedding(src)))
        for layer in self.layers: src = layer(src, src_mask)
        return src

class Decoder(nn.Module):
    def __init__(self, output_dim, d_model, n_layer, n_head, d_ff, dropout, max_len):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)])
        self.fc_out = nn.Linear(d_model, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.dropout(self.pos_encoding(self.embedding(trg)))
        for layer in self.layers: trg = layer(trg, enc_src, trg_mask, src_mask)
        return self.fc_out(trg)

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, n_head=8, n_layer=3, d_ff=512, dropout=0.1, max_len=100, src_pad_idx=1, trg_pad_idx=1):
        super(Transformer, self).__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.encoder = Encoder(src_vocab_size, d_model, n_layer, n_head, d_ff, dropout, max_len)
        self.decoder = Decoder(trg_vocab_size, d_model, n_layer, n_head, d_ff, dropout, max_len)
    def make_src_mask(self, src): return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()
        return trg_pad_mask & trg_sub_mask
    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

In [3]:
# Constants
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()

    sentence = clean_text(sentence)

    tokens = [token.lower() for token in sentence.split()]
    tokens = [SOS_IDX] + [src_vocab.stoi.get(token, UNK_IDX) for token in tokens] + [EOS_IDX]
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
        trg_indices = [SOS_IDX]
        for i in range(max_len):
            trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)
            output = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
            pred_token = output.argmax(2)[:,-1].item()
            trg_indices.append(pred_token)
            if pred_token == EOS_IDX: break

    trg_tokens = [trg_vocab.itos[i] for i in trg_indices]
    result = []
    for token in trg_tokens:
        if token not in ["<sos>", "<eos>", "<pad>"]:
            result.append(token)
    return " ".join(result)

print("Loading Phase 2 Resources...")
try:
    src_vocab = torch.load(os.path.join(DATA_PATH, 'src_vocab.pth'), weights_only=False)
    trg_vocab = torch.load(os.path.join(DATA_PATH, 'trg_vocab.pth'), weights_only=False)
    print(f"Vocab loaded. Vi size: {len(src_vocab)}, En size: {len(trg_vocab)}")

    model = Transformer(len(src_vocab), len(trg_vocab), 256, 8, 3, 512, 0.1, 150, PAD_IDX, PAD_IDX)

    state_dict = torch.load(MODEL_FILE, map_location=DEVICE, weights_only=False)
    model.load_state_dict(state_dict)
    model = model.to(DEVICE)
    print("Model Phase 2 loaded successfully!")

except Exception as e:
    print(f"Lỗi load model: {e}")

Loading Phase 2 Resources...
Vocab loaded. Vi size: 12517, En size: 29345
Model Phase 2 loaded successfully!


In [5]:
!pip install -q sacrebleu tqdm
import sacrebleu
import tarfile
import urllib.request
from tqdm import tqdm

# Cau hinh duong dan
TEST_DATA_DIR = os.path.join(PROJECT_PATH, 'data', 'test_2013')
tgz_path = os.path.join(TEST_DATA_DIR, "test-2013-en-vi.tgz")

# 1. Tai va giai nen du lieu
if not os.path.exists(TEST_DATA_DIR):
    os.makedirs(TEST_DATA_DIR)

if not os.path.exists(tgz_path):
    print("Downloading and extracting test data...")
    url = "https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz"
    try:
        urllib.request.urlretrieve(url, tgz_path)
        with tarfile.open(tgz_path, "r:gz") as tar:
            tar.extractall(path=TEST_DATA_DIR)
    except Exception as e:
        print(f"Error downloading data: {e}")
        if os.path.exists(tgz_path): os.remove(tgz_path)

# 2. Doc va lam sach du lieu
try:
    with open(os.path.join(TEST_DATA_DIR, 'tst2013.vi'), 'r', encoding='utf-8') as f:
        test_src = [clean_text(line) for line in f.readlines()]

    with open(os.path.join(TEST_DATA_DIR, 'tst2013.en'), 'r', encoding='utf-8') as f:
        test_trg = [clean_text(line) for line in f.readlines()]

    # 3. Chay Inference
    model.eval()
    hypotheses = []
    references = [test_trg]

    print(f"Evaluating Phase 2 on {len(test_src)} sentences...")
    for src_sent in tqdm(test_src):
        pred = translate_sentence(src_sent, src_vocab, trg_vocab, model, DEVICE)
        hypotheses.append(pred)

    # 4. Tinh BLEU
    bleu = sacrebleu.corpus_bleu(hypotheses, references, tokenize='13a')

    print("\n" + "="*30)
    print(f"PHASE 2 BLEU SCORE: {bleu.score:.2f}")
    print("="*30)

    # Kiem tra loi HTML
    errors = [s for s in hypotheses if "&apos;" in s or "&quot;" in s]
    if len(errors) == 0:
        print("HTML entities check: PASS (Clean output)")
    else:
        print(f"HTML entities check: FAIL ({len(errors)} errors found)")

    # In vi du so sanh
    print("\n--- EXAMPLES ---")
    for i in range(min(3, len(test_src))):
        print(f"SRC : {test_src[i]}")
        print(f"REF : {test_trg[i]}")
        print(f"PRED: {hypotheses[i]}")
        print("-" * 20)

except FileNotFoundError:
    print("Error: Test data files not found. Please check directory path.")

Downloading and extracting test data...


  tar.extractall(path=TEST_DATA_DIR)


Evaluating Phase 2 on 1268 sentences...


100%|██████████| 1268/1268 [01:58<00:00, 10.74it/s]



PHASE 2 BLEU SCORE: 12.23
HTML entities check: PASS (Clean output)

--- EXAMPLES ---
SRC : Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài " Chúng ta chẳng có gì phải ghen tị . "
REF : When I was little , I thought my country was the best on the planet , and I grew up singing a song called " Nothing To Envy . "
PRED: when i was a little bit , i think , the first <unk> country in the world and i usually sing , " we don 't have to have a lot of chicken . "
--------------------
SRC : Tôi đã rất tự hào về đất nước tôi .
REF : And I was very proud .
PRED: i was very proud of my country .
--------------------
SRC : Ở trường , chúng tôi dành rất nhiều thời gian để học về cuộc đời của chủ tịch Kim II- Sung , nhưng lại không học nhiều về thế giới bên ngoài , ngoại trừ việc Hoa Kỳ , Hàn Quốc và Nhật Bản là kẻ thù của chúng tôi .
REF : In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about th