In [None]:
import torch
import torch.nn as nn
import math
import os
from google.colab import drive

# Mount Google Drive unless the mount point already exists in this runtime.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Locations inside the Drive project folder used by the rest of the notebook.
PROJECT_PATH = '/content/drive/MyDrive/NLP_Assignment_2025'
DATA_PATH = os.path.join(PROJECT_PATH, 'data')
CHECKPOINT_PATH = os.path.join(PROJECT_PATH, 'checkpoints')
MODEL_FILE = os.path.join(CHECKPOINT_PATH, 'transformer_best.pt')

# Report whether the checkpoint file is reachable; execution continues either way.
if os.path.exists(MODEL_FILE):
    print(f"Tim thay model tai: {MODEL_FILE}")
else:
    print(f"ERROR: Khong tim thay file model tai {MODEL_FILE}")


class Vocabulary:
    """Token <-> index lookup tables pre-seeded with four special symbols.

    ``itos`` maps index -> token and ``stoi`` maps token -> index. Both start
    with ``<unk>``=0, ``<pad>``=1, ``<sos>``=2 and ``<eos>``=3.
    """

    def __init__(self, freq_threshold=2):
        specials = ("<unk>", "<pad>", "<sos>", "<eos>")
        self.itos = dict(enumerate(specials))
        self.stoi = {symbol: index for index, symbol in enumerate(specials)}
        # Stored on the instance only; nothing in this class reads it.
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of entries in the index -> token table."""
        return len(self.itos)

    def numericalize(self, text):
        """Lower-case ``text``, split it on whitespace and map tokens to ids.

        Tokens missing from ``stoi`` are mapped to the id of ``<unk>``.
        """
        ids = []
        for word in text.lower().strip().split():
            ids.append(self.stoi.get(word, self.stoi["<unk>"]))
        return ids

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the buffer ``pe``. Even feature columns hold ``sin`` values and odd
    columns hold ``cos`` values of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Without this guard the addition fails with an opaque broadcast
            # error (or silently broadcasts when the table has one row).
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``n_head`` heads.

    Args:
        d_model: Feature size of the inputs and of the output.
        n_head: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``n_head``.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        if d_model % n_head != 0:
            # Otherwise construction succeeds and the reshape in forward()
            # fails later with a far less helpful message.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )
        self.d_head = d_model // n_head
        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query, key, value: Tensors with batch on dimension 0 and
                ``d_model`` features on the last dimension.
            mask: Optional tensor broadcastable to the
                ``(batch, n_head, query_len, key_len)`` score tensor; positions
                where it equals 0 are suppressed before the softmax.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        batch_size = query.shape[0]
        # Project, then reshape to (batch, n_head, length, d_head).
        Q = self.w_q(query).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        K = self.w_k(key).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)
        V = self.w_v(value).view(batch_size, -1, self.n_head, self.d_head).permute(0, 2, 1, 3)

        # Similarity scores scaled by sqrt(d_head).
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.d_head)
        if mask is not None:
            # A large negative score drives the softmax weight to zero.
            energy = energy.masked_fill(mask == 0, -1e9)

        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)
        # Merge the heads back into a single d_model-sized feature axis.
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.fc_out(x)

class PositionwiseFeedForward(nn.Module):
    """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply ``fc1``, then ReLU, then ``fc2`` to ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# --- Class Transformer Architecture ---
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward sub-layer.

    Each sub-layer output passes through dropout, is added back to its input
    and is then layer-normalised.
    """

    def __init__(self, d_model, n_head, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Run ``src`` through the block, masking attention with ``src_mask``."""
        attended = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(attended))
        transformed = self.ffn(src)
        return self.norm2(src + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layer outputs passes through dropout, is added back
    to its input and is then layer-normalised.
    """

    def __init__(self, d_model, n_head, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.cross_attn = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Update ``trg`` using itself and the encoder output ``enc_src``.

        ``trg_mask`` restricts the self-attention; ``src_mask`` restricts the
        attention over ``enc_src``.
        """
        # Target positions attend to each other.
        self_out = self.self_attn(trg, trg, trg, trg_mask)
        trg = self.norm1(trg + self.dropout(self_out))
        # Target positions query the encoder output.
        cross_out = self.cross_attn(trg, enc_src, enc_src, src_mask)
        trg = self.norm2(trg + self.dropout(cross_out))
        ffn_out = self.ffn(trg)
        return self.norm3(trg + self.dropout(ffn_out))

class Encoder(nn.Module):
    """Token embedding + positional encoding followed by ``n_layer`` encoder blocks."""

    def __init__(self, input_dim, d_model, n_layer, n_head, d_ff, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Embed the token ids in ``src`` and pass them through every layer."""
        embedded = self.embedding(src)
        hidden = self.dropout(self.pos_encoding(embedded))
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

class Decoder(nn.Module):
    """Embedding, positional encoding, ``n_layer`` decoder blocks and an output projection."""

    def __init__(self, output_dim, d_model, n_layer, n_head, d_ff, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            DecoderLayer(d_model, n_head, d_ff, dropout) for _ in range(n_layer)
        )
        self.fc_out = nn.Linear(d_model, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Return ``output_dim``-sized scores for every position of ``trg``."""
        embedded = self.embedding(trg)
        hidden = self.dropout(self.pos_encoding(embedded))
        for block in self.layers:
            hidden = block(hidden, enc_src, trg_mask, src_mask)
        return self.fc_out(hidden)

class Transformer(nn.Module):
    """Encoder-decoder model that derives its attention masks from token ids."""

    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256, n_head=8, n_layer=3, d_ff=512, dropout=0.1, max_len=100, src_pad_idx=1, trg_pad_idx=1):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        # Encoder and decoder share every hyper-parameter except vocabulary size.
        shared = (d_model, n_layer, n_head, d_ff, dropout, max_len)
        self.encoder = Encoder(src_vocab_size, *shared)
        self.decoder = Decoder(trg_vocab_size, *shared)

    def make_src_mask(self, src):
        """Return a boolean mask that is False wherever ``src`` equals the pad index.

        Two singleton axes are inserted after the batch axis.
        """
        not_pad = src != self.src_pad_idx
        return not_pad.unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Combine the target padding mask with a lower-triangular mask.

        The triangular part is square with side ``trg.shape[1]``.
        """
        not_pad = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        size = trg.shape[1]
        lower_tri = torch.ones((size, size), device=trg.device).tril().bool()
        return not_pad & lower_tri

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` against it; return the decoder output."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        memory = self.encoder(src, src_mask)
        return self.decoder(trg, memory, trg_mask, src_mask)

# Indices of the special tokens, in the same order as the Vocabulary tables.
UNK_IDX = 0
PAD_IDX = 1
SOS_IDX = 2
EOS_IDX = 3
# Use the GPU when one is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    """Translate one sentence by greedy decoding.

    The sentence is split on whitespace, lower-cased, mapped through
    ``src_vocab.stoi`` (unknown tokens become ``UNK_IDX``) and wrapped in
    ``SOS_IDX``/``EOS_IDX``. The decoder is then run repeatedly, each time
    appending the highest-scoring next token, until ``EOS_IDX`` is produced or
    ``max_len`` tokens have been generated.

    Returns:
        The generated tokens joined with spaces, with ``<sos>``, ``<eos>`` and
        ``<pad>`` removed.
    """
    model.eval()

    src_ids = [SOS_IDX]
    src_ids.extend(src_vocab.stoi.get(word.lower(), UNK_IDX) for word in sentence.split())
    src_ids.append(EOS_IDX)
    src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    decoded = [SOS_IDX]
    with torch.no_grad():
        # The source is encoded once and reused at every decoding step.
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor(decoded).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)

            logits = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

            # Only the prediction at the last position extends the sequence.
            next_id = logits[:, -1, :].argmax(dim=-1).item()
            decoded.append(next_id)

            if next_id == EOS_IDX:
                break

    # Drop the special markers from the printed result.
    specials = ("<sos>", "<eos>", "<pad>")
    words = [trg_vocab.itos[idx] for idx in decoded]
    return " ".join(word for word in words if word not in specials)

print("Dang load Vocab va Model...")
try:
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects; only use it on files you created yourself.
    src_vocab = torch.load(os.path.join(DATA_PATH, 'src_vocab.pth'), weights_only=False)
    trg_vocab = torch.load(os.path.join(DATA_PATH, 'trg_vocab.pth'), weights_only=False)

    INPUT_DIM = len(src_vocab)
    OUTPUT_DIM = len(trg_vocab)

    # Hyper-parameters passed to the Transformer constructor below.
    D_MODEL, N_HEAD, N_LAYER, D_FF = 256, 8, 3, 512
    DROPOUT = 0.1
    MAX_LEN = 150

    model = Transformer(
        INPUT_DIM,
        OUTPUT_DIM,
        D_MODEL,
        N_HEAD,
        N_LAYER,
        D_FF,
        DROPOUT,
        MAX_LEN,
        PAD_IDX,
        PAD_IDX,
    )

    # DEVICE already holds cuda-if-available-else-cpu, so reuse it here.
    map_location = DEVICE

    state_dict = torch.load(MODEL_FILE, map_location=map_location, weights_only=False)
    model.load_state_dict(state_dict)

    model = model.to(DEVICE)
    print("Load Model thanh cong!")

    sentences = [
        "tôi là sinh viên",
        "hôm nay trời đẹp",
        "cảm ơn bạn rất nhiều",
        "tôi đi học bằng xe buýt",
    ]

    print("\n--- KET QUA DICH THU ---")
    separator = "-" * 30
    for s in sentences:
        translated = translate_sentence(s, src_vocab, trg_vocab, model, DEVICE)
        print(f"Input : {s}")
        print(f"Output: {translated}")
        print(separator)

except Exception as e:
    # Any failure in loading or translating is reported here, not re-raised.
    print(f"Loi xay ra: {e}")
    print("Hay kiem tra lai duong dan file trong Drive.")

Tim thay model tai: /content/drive/MyDrive/NLP_Assignment_2025/checkpoints/transformer_best.pt
Dang load Vocab va Model...
Load Model thanh cong!

--- KET QUA DICH THU ---
Input : tôi là sinh viên
Output: i was a student .
------------------------------
Input : hôm nay trời đẹp
Output: today &apos;s the sun .
------------------------------
Input : cảm ơn bạn rất nhiều
Output: thank you very much .
------------------------------
Input : tôi đi học bằng xe buýt
Output: i went to the bus .
------------------------------


In [None]:
import os
import tarfile

TEST_DATA_DIR = os.path.join(PROJECT_PATH, 'data', 'test_2013')
if not os.path.exists(TEST_DATA_DIR):
    os.makedirs(TEST_DATA_DIR)

url = "https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz"
tgz_path = os.path.join(TEST_DATA_DIR, "test-2013-en-vi.tgz")

print("Đang tải tập test IWSLT 2013...")
!wget -q {url} -O {tgz_path}

print("Đang giải nén...")
with tarfile.open(tgz_path, "r:gz") as tar:
    tar.extractall(path=TEST_DATA_DIR)

print(f"Xong! Dữ liệu nằm tại: {TEST_DATA_DIR}")
print("Các file có trong thư mục:")
!ls {TEST_DATA_DIR}

Đang tải tập test IWSLT 2013...
Đang giải nén...
Xong! Dữ liệu nằm tại: /content/drive/MyDrive/NLP_Assignment_2025/data/test_2013
Các file có trong thư mục:
test-2013-en-vi.tgz  tst2013.en  tst2013.vi


  tar.extractall(path=TEST_DATA_DIR)


In [None]:
!pip install -q sacrebleu tqdm

import sacrebleu
from tqdm import tqdm

# Vietnamese is the source side and English the reference side.
TEST_SRC_FILE = os.path.join(TEST_DATA_DIR, 'tst2013.vi')
TEST_TRG_FILE = os.path.join(TEST_DATA_DIR, 'tst2013.en')

# --- 1. READ THE DATA ---
print("Đang đọc dữ liệu test...")
with open(TEST_SRC_FILE, 'r', encoding='utf-8') as f:
    test_src_sentences = [line.strip() for line in f]

with open(TEST_TRG_FILE, 'r', encoding='utf-8') as f:
    test_trg_sentences = [line.strip() for line in f]

# Fail before the slow decoding loop if the two files are not line-aligned;
# otherwise the mismatch only surfaces after every sentence has been decoded.
if len(test_src_sentences) != len(test_trg_sentences):
    raise ValueError(
        f"Source/reference line counts differ: "
        f"{len(test_src_sentences)} vs {len(test_trg_sentences)}"
    )

print(f"Số lượng câu test: {len(test_src_sentences)}")

# --- 2. RUN INFERENCE (GREEDY SEARCH) ---
model.eval()

hypotheses = []
# sacrebleu expects a list of reference streams; there is a single one here.
references = [test_trg_sentences]

print("Bắt đầu dịch (Phase 1 - Baseline)...")
for src_text in tqdm(test_src_sentences):
    pred = translate_sentence(src_text, src_vocab, trg_vocab, model, DEVICE)
    hypotheses.append(pred)

# --- 3. COMPUTE THE BLEU SCORE ---
# NOTE(review): the sample predictions are all lower-case while the references
# keep their original casing, and this comparison is case-sensitive. Consider
# lowercase=True if a case-insensitive score is the intended protocol.
bleu = sacrebleu.corpus_bleu(hypotheses, references, tokenize='13a')

print("\n" + "="*40)
print("KET QUA PHASE 1 (BASELINE)")
print(f"BLEU SCORE: {bleu.score:.2f}")
print("="*40)

print("\n--- PHÂN TÍCH LỖI (ERROR ANALYSIS) ---")
print("Tìm các câu chứa ký tự lỗi HTML (&apos;, &quot;,...):\n")

# Counts printed samples (the first five sentences are always shown), not
# only the sentences that contain an HTML entity.
count_errors = 0
for i, hyp in enumerate(hypotheses):
    has_apos = "&apos;" in hyp
    if has_apos or "&quot;" in hyp or i < 5:
        print(f"ID  : {i}")
        print(f"Input : {test_src_sentences[i]}")
        print(f"Ref   : {test_trg_sentences[i]}")
        print(f"Pred  : {hyp}")
        if has_apos:
            print("-> PHÁT HIỆN LỖI: &apos;")
        print("-" * 30)
        count_errors += 1
        if count_errors >= 10:
            break

Đang đọc dữ liệu test...
Số lượng câu test: 1268
Bắt đầu dịch (Phase 1 - Baseline)...


100%|██████████| 1268/1268 [02:04<00:00, 10.19it/s]



KET QUA PHASE 1 (BASELINE)
BLEU SCORE: 11.69

--- PHÂN TÍCH LỖI (ERROR ANALYSIS) ---
Tìm các câu chứa ký tự lỗi HTML (&apos;, &quot;,...):

ID  : 0
Input : Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;
Ref   : When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;
Pred  : when i was a little bit old , i think that the best world is the best nation in the world , and i &apos;m always not afraid that we should have to worry about war .
-> PHÁT HIỆN LỖI: &apos;
------------------------------
ID  : 1
Input : Tôi đã rất tự hào về đất nước tôi .
Ref   : And I was very proud .
Pred  : i was so proud of my country .
------------------------------
ID  : 2
Input : Ở trường , chúng tôi dành rất nhiều thời gian để học về cuộc đời của chủ tịch Kim II- Sung , nhưng lại không học nhiều về thế giới bên ngoài , ngo