# TẢI VÀ SETUP BỘ DỮ LIỆU small-PhoMT

In [1]:
!gdown --folder https://drive.google.com/drive/folders/186OAOuSEYEDVcry7WP5UBdqECXo26QAb -O PhoMT_data
!pip install torchmetrics

Retrieving folder contents
Processing file 1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc small-dev.json
Processing file 1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn small-test.json
Processing file 1-eG6FeF-v__rsf77iWurddahXbyjTYh5 small-train.json
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1hoTd2hFwjSeFThlPm6YpN0NW5ePXS3Jc
To: /content/PhoMT_data/small-dev.json
100% 594k/594k [00:00<00:00, 32.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3L25SH1_jaEfOjpmpgnfMik4N3MxSyn
To: /content/PhoMT_data/small-test.json
100% 669k/669k [00:00<00:00, 36.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-eG6FeF-v__rsf77iWurddahXbyjTYh5
To: /content/PhoMT_data/small-train.json
100% 5.68M/5.68M [00:00<00:00, 194MB/s]
Download completed
Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Down

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import json
import os
from collections import Counter
from torchmetrics.text.rouge import ROUGEScore
from tqdm import tqdm
import time

# Thiết lập device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## XỬ LÝ DỮ LIỆU

In [4]:
def tokenize(text):
    return text.lower().split()

class Vocabulary:
    def __init__(self, freq_threshold=2):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    def build_vocabulary(self, sentence_list):
        frequencies = Counter()
        idx = 4
        for sentence in sentence_list:
            for word in tokenize(sentence):
                frequencies[word] += 1
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = tokenize(text)
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in tokenized_text]

class TranslationDataset(Dataset):
    def __init__(self, json_file, src_vocab=None, tgt_vocab=None, is_train=True):
        with open(json_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Sử dụng key chính xác: 'english' và 'vietnamese'
        self.src_data = [item['english'] for item in self.data]
        self.tgt_data = [item['vietnamese'] for item in self.data]

        if is_train:
            self.src_vocab = Vocabulary(freq_threshold=2)
            self.src_vocab.build_vocabulary(self.src_data)
            self.tgt_vocab = Vocabulary(freq_threshold=2)
            self.tgt_vocab.build_vocabulary(self.tgt_data)
        else:
            self.src_vocab = src_vocab
            self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        src_text = self.src_data[index]
        tgt_text = self.tgt_data[index]

        src_indices = [self.src_vocab.stoi["<SOS>"]] + self.src_vocab.numericalize(src_text) + [self.src_vocab.stoi["<EOS>"]]
        tgt_indices = [self.tgt_vocab.stoi["<SOS>"]] + self.tgt_vocab.numericalize(tgt_text) + [self.tgt_vocab.stoi["<EOS>"]]
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    def __call__(self, batch):
        src = [item[0] for item in batch]
        tgt = [item[1] for item in batch]
        src = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=self.pad_idx) # batch_first=True cho dễ nhìn
        tgt = nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=self.pad_idx)
        return src, tgt

# Setup Loaders
train_path = 'PhoMT_data/small-train.json'
test_path = 'PhoMT_data/small-test.json'

train_dataset = TranslationDataset(train_path, is_train=True)
test_dataset = TranslationDataset(test_path, src_vocab=train_dataset.src_vocab, tgt_vocab=train_dataset.tgt_vocab, is_train=False)

BATCH_SIZE = 64
pad_idx = train_dataset.src_vocab.stoi["<PAD>"]
loader_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=Collate(pad_idx))
loader_test = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=Collate(pad_idx))

print("Dữ liệu đã sẵn sàng!")

Dữ liệu đã sẵn sàng!


# **BÀI 1**

## XÂY DỰNG MÔ HÌNH ENCODER - DECODER 3 LỚP LSTM

In [5]:
# Định nghĩa Model
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch, seq_len]
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input: [batch]
        input = input.unsqueeze(1) # [batch, 1]
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)

        input = tgt[:, 0] # <SOS>

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            top1 = output.argmax(1)
            input = tgt[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Khởi tạo Model
INPUT_DIM = len(train_dataset.src_vocab)
OUTPUT_DIM = len(train_dataset.tgt_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 256
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

## HUẤN LUYỆN MÔ HÌNH

In [8]:
def train_fn(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(iterator, desc="Training", unit="batch")

    for i, (src, tgt) in enumerate(progress_bar):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)

        output_dim = output.shape[-1]
        output = output[:, 1:, :].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    return epoch_loss / len(iterator)

N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_fn(model, loader_train, optimizer, criterion, 1)
    end_time = time.time()
    mins = int((end_time - start_time) / 60)
    secs = int((end_time - start_time) % 60)
    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s | Train Loss: {train_loss:.3f}')

=== BẮT ĐẦU TRAIN BÀI 1 ===


Training: 100%|██████████| 313/313 [01:34<00:00,  3.32batch/s, loss=6.14]


Epoch: 01 | Time: 1m 34s | Train Loss: 6.203


Training: 100%|██████████| 313/313 [01:30<00:00,  3.47batch/s, loss=5.99]


Epoch: 02 | Time: 1m 30s | Train Loss: 5.976


Training: 100%|██████████| 313/313 [01:33<00:00,  3.34batch/s, loss=5.82]


Epoch: 03 | Time: 1m 33s | Train Loss: 5.857


Training: 100%|██████████| 313/313 [01:32<00:00,  3.38batch/s, loss=5.81]


Epoch: 04 | Time: 1m 32s | Train Loss: 5.764


Training: 100%|██████████| 313/313 [01:32<00:00,  3.38batch/s, loss=5.72]

Epoch: 05 | Time: 1m 32s | Train Loss: 5.665





## ĐÁNH GIÁ MÔ HÌNH

In [9]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, device, max_length=50):
    model.eval()

    if isinstance(sentence, str):
        tokens = tokenize(sentence)
    else:
        tokens = sentence

    text_to_indices = [src_vocab.stoi["<SOS>"]] + [src_vocab.stoi.get(token, src_vocab.stoi["<UNK>"]) for token in tokens] + [src_vocab.stoi["<EOS>"]]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [tgt_vocab.stoi["<SOS>"]]
    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)
        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        if best_guess == tgt_vocab.stoi["<EOS>"]:
            break
        outputs.append(best_guess)

    # Chuyển indices ngược lại thành từ
    translated_sentence = [tgt_vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:] # Bỏ <SOS>

# Đánh giá ROUGE
def evaluate_rouge(model, iterator, src_vocab, tgt_vocab):
    model.eval()
    rouge = ROUGEScore()
    preds, targets = [], []

    with torch.no_grad():
        for src, tgt in tqdm(iterator, desc="Evaluating"):
            src = src.to(device)
            hidden, cell = model.encoder(src)
            batch_preds = []

            # Greedy Decode
            input = torch.tensor([tgt_vocab.stoi["<SOS>"]] * src.shape[0]).to(device)
            for _ in range(50): # Max length
                output, hidden, cell = model.decoder(input, hidden, cell)
                input = output.argmax(1)
                batch_preds.append(input.unsqueeze(1))

            batch_preds = torch.cat(batch_preds, dim=1) # [batch, len]

            # Convert to string
            for i in range(src.shape[0]):
                pred_tokens = [tgt_vocab.itos[idx.item()] for idx in batch_preds[i]]
                target_tokens = [tgt_vocab.itos[idx.item()] for idx in tgt[i] if idx.item() not in [0,1,2]] # Bỏ pad, sos, eos

                # Cắt ở EOS đầu tiên cho pred
                if "<EOS>" in pred_tokens:
                    pred_tokens = pred_tokens[:pred_tokens.index("<EOS>")]

                preds.append(" ".join(pred_tokens))
                targets.append(" ".join(target_tokens))

    scores = rouge(preds, targets)
    return scores

rouge_scores = evaluate_rouge(model, loader_test, train_dataset.src_vocab, train_dataset.tgt_vocab)
print(f"ROUGE-L Fmeasure: {rouge_scores['rougeL_fmeasure'].item():.4f}")

Đang tính ROUGE cho Bài 1...


Evaluating: 100%|██████████| 32/32 [00:02<00:00, 10.67it/s]


ROUGE-L Fmeasure: 0.1880


# **BÀI 2**

## XÂY DỰNG MÔ HÌNH ENCODER (3 LSTM) - DECODER (3LSTM), ATTENTION Bahdanau

In [13]:
class EncoderAttention(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # batch_first=True
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch, seq_len]
        embedded = self.dropout(self.embedding(src))

        outputs, (hidden, cell) = self.rnn(embedded)

        return outputs, hidden, cell

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W_q = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W_k = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        # decoder_hidden: [batch, hidden] (top layer)
        # encoder_outputs: [batch, src_len, hidden]
        query = self.W_q(decoder_hidden).unsqueeze(1)
        keys = self.W_k(encoder_outputs)
        energy = torch.tanh(query + keys)
        scores = self.v(energy).squeeze(2)
        attention = torch.softmax(scores, dim=1)
        context = torch.bmm(attention.unsqueeze(1), encoder_outputs)
        return context

class DecoderAttention(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim) # Concat output + context
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Dùng hidden state lớp cuối cùng của decoder để tính attention
        context = self.attention(hidden[-1], encoder_outputs)

        combined = torch.cat((output, context), dim=2)
        prediction = self.fc_out(combined.squeeze(1))

        return prediction, hidden, cell

class Seq2SeqAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        input = tgt[:, 0]

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            top1 = output.argmax(1)
            input = tgt[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Hyperparameters
INPUT_DIM = len(train_dataset.src_vocab)
OUTPUT_DIM = len(train_dataset.tgt_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 256
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = BahdanauAttention(HIDDEN_DIM)
enc = EncoderAttention(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
dec = DecoderAttention(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT, attn)

model = Seq2SeqAttention(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

## HUẤN LUYỆN MÔ HÌNH

In [15]:
def train_fn_attn(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(iterator, desc="Training Attention", unit="batch")

    for i, (src, tgt) in enumerate(progress_bar):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)

        output_dim = output.shape[-1]
        output = output[:, 1:, :].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    return epoch_loss / len(iterator)

N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_fn_attn(model, loader_train, optimizer, criterion, 1)
    end_time = time.time()
    mins = int((end_time - start_time) / 60)
    secs = int((end_time - start_time) % 60)
    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s | Train Loss: {train_loss:.3f}')

Training Attention: 100%|██████████| 313/313 [02:06<00:00,  2.48batch/s, loss=5.91]


Epoch: 01 | Time: 2m 6s | Train Loss: 6.136


Training Attention: 100%|██████████| 313/313 [02:09<00:00,  2.41batch/s, loss=5.82]


Epoch: 02 | Time: 2m 9s | Train Loss: 5.869


Training Attention: 100%|██████████| 313/313 [02:13<00:00,  2.34batch/s, loss=5.35]


Epoch: 03 | Time: 2m 13s | Train Loss: 5.621


Training Attention: 100%|██████████| 313/313 [02:08<00:00,  2.43batch/s, loss=5.51]


Epoch: 04 | Time: 2m 8s | Train Loss: 5.410


Training Attention: 100%|██████████| 313/313 [02:09<00:00,  2.43batch/s, loss=5.22]

Epoch: 05 | Time: 2m 9s | Train Loss: 5.239





## ĐÁNH GIÁ MÔ HÌNH

In [16]:
def evaluate_rouge_attn(model, iterator, src_vocab, tgt_vocab):
    model.eval()
    rouge = ROUGEScore()
    preds, targets = [], []

    with torch.no_grad():
        for src, tgt in tqdm(iterator, desc="Evaluating"):
            src = src.to(device)
            # Encoder trả về thêm encoder_outputs
            encoder_outputs, hidden, cell = model.encoder(src)
            batch_preds = []

            input = torch.tensor([tgt_vocab.stoi["<SOS>"]] * src.shape[0]).to(device)
            for _ in range(50):
                # Decoder nhận thêm encoder_outputs
                output, hidden, cell = model.decoder(input, hidden, cell, encoder_outputs)
                input = output.argmax(1)
                batch_preds.append(input.unsqueeze(1))

            batch_preds = torch.cat(batch_preds, dim=1)

            for i in range(src.shape[0]):
                pred_tokens = [tgt_vocab.itos[idx.item()] for idx in batch_preds[i]]
                target_tokens = [tgt_vocab.itos[idx.item()] for idx in tgt[i] if idx.item() not in [0,1,2]]

                if "<EOS>" in pred_tokens:
                    pred_tokens = pred_tokens[:pred_tokens.index("<EOS>")]

                preds.append(" ".join(pred_tokens))
                targets.append(" ".join(target_tokens))

    scores = rouge(preds, targets)
    return scores

rouge_scores = evaluate_rouge_attn(model, loader_test, train_dataset.src_vocab, train_dataset.tgt_vocab)
print(f"ROUGE-L Fmeasure: {rouge_scores['rougeL_fmeasure'].item():.4f}")

Evaluating: 100%|██████████| 32/32 [00:02<00:00, 11.36it/s]


ROUGE-L Fmeasure: 0.2922


# **BÀI 3**

## XÂY DỰNG MÔ HÌNH ENCODER - DECODER, ATTENTION Luong

In [17]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch, src len]
        embedded = self.dropout(self.embedding(src))

        # outputs: [batch, src len, hidden_dim]
        # hidden, cell: [n_layers, batch, hidden_dim]
        outputs, (hidden, cell) = self.rnn(embedded)

        return outputs, hidden, cell

class LuongAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        # Luong Attention "General": score(h_t, h_s) = h_t^T * W * h_s
        self.W = nn.Linear(hidden_dim, hidden_dim, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        # decoder_hidden: [batch, hidden_dim] (Trạng thái hiện tại của decoder)
        # encoder_outputs: [batch, src_len, hidden_dim]

        # query: [batch, 1, hidden_dim]
        query = self.W(decoder_hidden).unsqueeze(1)

        # Alignment Scores
        # [batch, 1, hidden] * [batch, src_len, hidden] (transpose) -> [batch, 1, src_len]
        scores = torch.bmm(query, encoder_outputs.transpose(1, 2))

        attention_weights = torch.softmax(scores, dim=-1) # [batch, 1, src_len]

        # Context Vector (Weighted Sum)
        # [batch, 1, src_len] * [batch, src_len, hidden] -> [batch, 1, hidden]
        context = torch.bmm(attention_weights, encoder_outputs)

        return context, attention_weights

class DecoderLuong(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)

        # Layer kết hợp Context và Hidden state (Concat Layer)
        # Input: [Hidden Decoder; Context Vector] -> Output: Hidden Size
        self.wc = nn.Linear(hidden_dim * 2, hidden_dim)

        # Layer dự đoán từ cuối cùng
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        # input: [batch]
        # hidden, cell: [n_layers, batch, hidden]
        # encoder_outputs: [batch, src_len, hidden]

        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))

        # rnn_output: [batch, 1, hidden]
        rnn_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # rnn_output đang có dạng [batch, 1, hidden] -> squeeze -> [batch, hidden]
        context, attn_weights = self.attention(rnn_output.squeeze(1), encoder_outputs)
        # context: [batch, 1, hidden]

        # Luong Attention
        # cat: [batch, 1, hidden * 2]
        combined = torch.cat((rnn_output, context), dim=2)

        # Chiếu về hidden dim và qua Tanh
        h_tilde = torch.tanh(self.wc(combined)) # [batch, 1, hidden]

        prediction = self.fc_out(h_tilde.squeeze(1)) # [batch, output_dim]

        return prediction, hidden, cell

class Seq2SeqLuong(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        # src: [batch, src len]
        # tgt: [batch, tgt len]

        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        # Encoder
        encoder_outputs, hidden, cell = self.encoder(src)

        # Decoder Input đầu tiên (<SOS>)
        input = tgt[:, 0]

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)

            outputs[:, t, :] = output

            # Teacher Forcing
            top1 = output.argmax(1)
            input = tgt[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Hyperparameters
INPUT_DIM = len(train_dataset.src_vocab)
OUTPUT_DIM = len(train_dataset.tgt_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 256
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = LuongAttention(HIDDEN_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
dec = DecoderLuong(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT, attn)

model = Seq2SeqLuong(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
pad_idx = train_dataset.tgt_vocab.stoi["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

print(f'Mô hình Luong Attention có {sum(p.numel() for p in model.parameters() if p.requires_grad):,} tham số.')

Mô hình Luong Attention có 8,148,440 tham số.


## HUẤN LUYỆN MÔ HÌNH

In [20]:
def train_fn(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(iterator, desc="Training Luong", unit="batch")

    for i, (src, tgt) in enumerate(progress_bar):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        output = model(src, tgt)

        output_dim = output.shape[-1]
        output = output[:, 1:, :].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    return epoch_loss / len(iterator)

N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_fn(model, loader_train, optimizer, criterion, CLIP)
    end_time = time.time()
    mins = int((end_time - start_time) / 60)
    secs = int((end_time - start_time) % 60)
    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s | Train Loss: {train_loss:.3f}')

Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.80batch/s, loss=5.6]


Epoch: 01 | Time: 1m 51s | Train Loss: 5.532


Training Luong: 100%|██████████| 313/313 [01:50<00:00,  2.84batch/s, loss=5.35]


Epoch: 02 | Time: 1m 50s | Train Loss: 5.390


Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.80batch/s, loss=5.26]


Epoch: 03 | Time: 1m 51s | Train Loss: 5.291


Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.82batch/s, loss=4.94]


Epoch: 04 | Time: 1m 51s | Train Loss: 5.189


Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.81batch/s, loss=5.06]


Epoch: 05 | Time: 1m 51s | Train Loss: 5.100


Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.81batch/s, loss=5.1]


Epoch: 06 | Time: 1m 51s | Train Loss: 5.018


Training Luong: 100%|██████████| 313/313 [01:51<00:00,  2.82batch/s, loss=4.78]


Epoch: 07 | Time: 1m 51s | Train Loss: 4.938


Training Luong: 100%|██████████| 313/313 [01:52<00:00,  2.78batch/s, loss=4.89]


Epoch: 08 | Time: 1m 52s | Train Loss: 4.869


Training Luong: 100%|██████████| 313/313 [01:54<00:00,  2.74batch/s, loss=4.81]


Epoch: 09 | Time: 1m 54s | Train Loss: 4.807


Training Luong: 100%|██████████| 313/313 [01:52<00:00,  2.79batch/s, loss=4.68]

Epoch: 10 | Time: 1m 52s | Train Loss: 4.742





## ĐÁNH GIÁ MÔ HÌNH

In [21]:
def evaluate_rouge(model, iterator, src_vocab, tgt_vocab):
    model.eval()
    rouge = ROUGEScore()
    preds, targets = [], []

    with torch.no_grad():
        for src, tgt in tqdm(iterator, desc="Evaluating"):
            src = src.to(device)
            encoder_outputs, hidden, cell = model.encoder(src)
            batch_preds = []

            input = torch.tensor([tgt_vocab.stoi["<SOS>"]] * src.shape[0]).to(device)

            for _ in range(50):
                output, hidden, cell = model.decoder(input, hidden, cell, encoder_outputs)
                input = output.argmax(1)
                batch_preds.append(input.unsqueeze(1))

            batch_preds = torch.cat(batch_preds, dim=1)

            for i in range(src.shape[0]):
                pred_tokens = [tgt_vocab.itos[idx.item()] for idx in batch_preds[i]]
                if "<EOS>" in pred_tokens:
                    pred_tokens = pred_tokens[:pred_tokens.index("<EOS>")]

                target_tokens = [tgt_vocab.itos[idx.item()] for idx in tgt[i] if idx.item() not in [0, 1, 2]]

                preds.append(" ".join(pred_tokens))
                targets.append(" ".join(target_tokens))

    scores = rouge(preds, targets)
    return scores

rouge_scores = evaluate_rouge(model, loader_test, train_dataset.src_vocab, train_dataset.tgt_vocab)
print(f"ROUGE-L Fmeasure: {rouge_scores['rougeL_fmeasure'].item():.4f}")

Evaluating: 100%|██████████| 32/32 [00:02<00:00, 11.98it/s]


ROUGE-L Fmeasure: 0.3431
