#### Bài 2: Xây dựng kiến trúc Encoder-Decoder gồm 3 lớp LSTM cho module encoder và 3 lớp LSTM cho module decoder, với hidden size là 256, cho bài toán dịch máy từ tiếng Anh sang tiếng Việt. Module decoder được trang bị kỹ thuật attention theo mô tả của nghiên cứu "[Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)". Huấn luyện mô hình này trên bộ dữ liệu PhoMT sử dụng Adam làm phương thức tối ưu tham số. Đánh giá độ hiệu quả của mô hình sử dụn độ đo ROUGE-L.

In [None]:
# Cài đặt gdown (nếu chưa có)
!pip install gdown --upgrade

import gdown

folder_id = "186OAOuSEYEDVcry7WP5UBdqECXo26QAb"
gdown.download_folder(id=folder_id, quiet=False, use_cookies=False)



Retrieving folder contents


Processing file 1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc small-dev.json
Processing file 1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn small-test.json
Processing file 1-eG6FeF-v__rsf77iWurddahXbyjTYh5 small-train.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc
To: /content/small-PhoMT/small-dev.json
100%|██████████| 594k/594k [00:00<00:00, 121MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn
To: /content/small-PhoMT/small-test.json
100%|██████████| 669k/669k [00:00<00:00, 96.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-eG6FeF-v__rsf77iWurddahXbyjTYh5
To: /content/small-PhoMT/small-train.json
100%|██████████| 5.68M/5.68M [00:00<00:00, 282MB/s]
Download completed


['/content/small-PhoMT/small-dev.json',
 '/content/small-PhoMT/small-test.json',
 '/content/small-PhoMT/small-train.json']

In [30]:
!pip install rouge_score



In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from typing import Tuple
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer


In [32]:
class Vocab:
    def __init__(self):
        self.pad_idx = 0
        self.bos_idx = 1
        self.eos_idx = 2
        self.src_word2idx = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
        self.tgt_word2idx = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
        self.src_idx2word = {0: '<pad>', 1: '<bos>', 2: '<eos>'}
        self.tgt_idx2word = {0: '<pad>', 1: '<bos>', 2: '<eos>'}

    def build(self, pairs):
        src_id = 3
        tgt_id = 3
        for en, vi in pairs:
            for w in en.split():
                if w not in self.src_word2idx:
                    self.src_word2idx[w] = src_id
                    self.src_idx2word[src_id] = w
                    src_id += 1
            for w in vi.split():
                if w not in self.tgt_word2idx:
                    self.tgt_word2idx[w] = tgt_id
                    self.tgt_idx2word[tgt_id] = w
                    tgt_id += 1
        self.total_src_tokens = src_id
        self.total_tgt_tokens = tgt_id

In [33]:
class TranslationDataset(Dataset):
    def __init__(self, pairs, vocab):
        self.pairs = pairs
        self.vocab = vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        en, vi = self.pairs[idx]
        en_ids = [self.vocab.bos_idx] + [self.vocab.src_word2idx.get(w, 0) for w in en.split()] + [self.vocab.eos_idx]
        vi_ids = [self.vocab.bos_idx] + [self.vocab.tgt_word2idx.get(w, 0) for w in vi.split()] + [self.vocab.eos_idx]
        return torch.tensor(en_ids), torch.tensor(vi_ids)

def collate_fn(batch):
    src, tgt = zip(*batch)
    src = nn.utils.rnn.pad_sequence(src, padding_value=0, batch_first=True)
    tgt = nn.utils.rnn.pad_sequence(tgt, padding_value=0, batch_first=True)
    return src, tgt

In [34]:
class Seq2seqLSTM(nn.Module):
    def __init__(self, d_model: int, n_layers: int, dropout: float, vocab: Vocab):
        super().__init__()
        self.vocab = vocab
        self.n_layers = n_layers
        self.d_model = d_model
        self.dim = 2 * d_model
        self.src_embedding = nn.Embedding(vocab.total_src_tokens, d_model, vocab.pad_idx)
        self.encoder = nn.LSTM(d_model, d_model, n_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.tgt_embedding = nn.Embedding(vocab.total_tgt_tokens, self.dim, vocab.pad_idx)
        self.decoder = nn.LSTM(self.dim, self.dim, n_layers, batch_first=True, dropout=dropout, bidirectional=False)
        self.attn_Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.attn_Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.attn_v = nn.Linear(self.dim, 1, bias=False)
        self.output_head = nn.Linear(self.dim * 2, vocab.total_tgt_tokens)
        self.loss = nn.CrossEntropyLoss(ignore_index=vocab.pad_idx)

    def get_encoder_states(self, embedded_x):
        enc_outputs, (enc_h, enc_c) = self.encoder(embedded_x)
        bs = enc_outputs.shape[0]
        enc_h = enc_h.reshape(self.n_layers, 2, bs, self.d_model).permute(0, 2, 1, 3).reshape(self.n_layers, bs, self.dim)
        enc_c = enc_c.reshape(self.n_layers, 2, bs, self.d_model).permute(0, 2, 1, 3).reshape(self.n_layers, bs, self.dim)
        return enc_outputs, enc_h, enc_c

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        self.train()
        embedded_x = self.src_embedding(x)
        enc_outputs, enc_h, enc_c = self.get_encoder_states(embedded_x)
        bs, tgt_len = y.shape
        dec_hidden = enc_h
        dec_cell = enc_c
        logits = []
        for ith in range(tgt_len):
            y_ith = y[:, ith].unsqueeze(1)
            embedded_y = self.tgt_embedding(y_ith)
            dec_output, (dec_hidden, dec_cell) = self.decoder(embedded_y, (dec_hidden, dec_cell))
            query = dec_output.squeeze(1)
            context = self.aligning(query, enc_outputs)
            combined = torch.cat([query, context], dim=-1)
            logit = self.output_head(combined)
            logits.append(logit.unsqueeze(1))
        logits = torch.cat(logits, dim=1)
        loss = self.loss(logits.reshape(-1, self.vocab.total_tgt_tokens), y.reshape(-1))
        return loss

    def aligning(self, query: torch.Tensor, k_v: torch.Tensor):
        query = self.attn_Wq(query).unsqueeze(1)  # [bs, 1, dim]
        keys = self.attn_Wk(k_v)  # [bs, seq_len, dim]
        energy = torch.tanh(query + keys)  # [bs, seq_len, dim]
        scores = self.attn_v(energy).squeeze(2)  # [bs, seq_len]
        a = nn.functional.softmax(scores, dim=1)
        context = torch.bmm(a.unsqueeze(1), k_v).squeeze(1)
        return context

    def predict(self, x: torch.Tensor, max_len=100):
        self.eval()
        embedded_x = self.src_embedding(x)
        enc_outputs, enc_h, enc_c = self.get_encoder_states(embedded_x)
        bs = x.shape[0]
        dec_hidden = enc_h
        dec_cell = enc_c
        y_ith = torch.full((bs, 1), self.vocab.bos_idx, dtype=torch.long, device=x.device)
        outputs = []
        for _ in range(max_len):
            embedded_y = self.tgt_embedding(y_ith)
            dec_output, (dec_hidden, dec_cell) = self.decoder(embedded_y, (dec_hidden, dec_cell))
            query = dec_output.squeeze(1)
            context = self.aligning(query, enc_outputs)
            combined = torch.cat([query, context], dim=-1)
            logit = self.output_head(combined)
            y_ith = logit.argmax(dim=-1).unsqueeze(1)
            outputs.append(y_ith)
            if all(y_ith.squeeze(1) == self.vocab.eos_idx):
                break
        outputs = torch.cat(outputs, dim=1)
        return outputs

In [35]:
# Load data from JSON files
def load_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    pairs = [(item["english"], item["vietnamese"]) for item in data]
    return pairs

train_pairs = load_data("/content/small-PhoMT/small-train.json")
dev_pairs = load_data("/content/small-PhoMT/small-dev.json")
test_pairs = load_data("/content/small-PhoMT/small-test.json")

print(f"Train: {len(train_pairs)} câu")
print(f"Dev: {len(dev_pairs)} câu")
print(f"Test: {len(test_pairs)} câu")

Train: 20000 câu
Dev: 2000 câu
Test: 2000 câu


In [36]:
# Xây dựng vocab
vocab = Vocab()
vocab.build(train_pairs)

# Datasets và dataloaders
train_dataset = TranslationDataset(train_pairs, vocab)
dev_dataset = TranslationDataset(dev_pairs, vocab)
test_dataset = TranslationDataset(test_pairs, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [37]:
# Khởi tạo mô hình
d_model = 128  # Để hidden size decoder = 256
n_layers = 5
dropout = 0.3
model = Seq2seqLSTM(d_model, n_layers, dropout, vocab)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = optim.Adam(model.parameters())

# Huấn luyện
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')



Epoch 1, Loss: 5.5989
Epoch 2, Loss: 3.6233
Epoch 3, Loss: 3.0000
Epoch 4, Loss: 2.7244
Epoch 5, Loss: 2.5474
Epoch 6, Loss: 2.4223
Epoch 7, Loss: 2.3173
Epoch 8, Loss: 2.2308
Epoch 9, Loss: 2.1361
Epoch 10, Loss: 2.0527


In [38]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [39]:
# Đánh giá
def decode(ids, idx2word):
    sentence = []
    for id in ids:
        if id == 0: continue
        if id == 2: break
        sentence.append(idx2word.get(id, '<unk>'))
    return ' '.join(sentence)

bleu1, bleu2, bleu3, bleu4 = [], [], [], []
rouge1, rouge2, rougel = [], [], []
meteors = []

smoother = SmoothingFunction()
r_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

model.eval()
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        preds = model.predict(x)
        for pred, ref in zip(preds.tolist(), y.tolist()):
            pred_str = decode(pred, vocab.tgt_idx2word)
            ref_str = decode(ref[1:], vocab.tgt_idx2word)  # Bỏ <bos>
            ref_list = ref_str.split()
            pred_list = pred_str.split()
            bleu1.append(sentence_bleu([ref_list], pred_list, weights=(1, 0, 0, 0), smoothing_function=smoother.method1))
            bleu2.append(sentence_bleu([ref_list], pred_list, weights=(0.5, 0.5, 0, 0), smoothing_function=smoother.method1))
            bleu3.append(sentence_bleu([ref_list], pred_list, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smoother.method1))
            bleu4.append(sentence_bleu([ref_list], pred_list, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoother.method1))
            scores = r_scorer.score(ref_str, pred_str)
            rouge1.append(scores['rouge1'].fmeasure)
            rouge2.append(scores['rouge2'].fmeasure)
            rougel.append(scores['rougeL'].fmeasure)
            meteors.append(meteor_score([ref_list], pred_list))

print(f"BLEU@1: {np.mean(bleu1):.4f}")
print(f"BLEU@2: {np.mean(bleu2):.4f}")
print(f"BLEU@3: {np.mean(bleu3):.4f}")
print(f"BLEU@4: {np.mean(bleu4):.4f}")
print(f"ROUGE-1: {np.mean(rouge1):.4f}")
print(f"ROUGE-2: {np.mean(rouge2):.4f}")
print(f"ROUGE-L: {np.mean(rougel):.4f}")
print(f"Meteor: {np.mean(meteors):.4f}")

BLEU@1: 0.0006
BLEU@2: 0.0002
BLEU@3: 0.0001
BLEU@4: 0.0001
ROUGE-1: 0.0276
ROUGE-2: 0.0012
ROUGE-L: 0.0231
Meteor: 0.0027
