## Import libraries and model configuration

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math, time, os, re, random
from collections import Counter
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pickle
import os

# Config
MODELNAME = "NMT_transformer.model"
EPOCH = 20
BATCHSIZE = 32
LR = 0.0005
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
D_MODEL = 256 
NHEAD = 8
NUM_ENCODER_LAYERS = 4 
NUM_DECODER_LAYERS = 4 
DIM_FEEDFORWARD = 1024
DROPOUT = 0.1 
MAX_SEQ_LENGTH = 100
LABEL_SMOOTHING = 0.1
WARMUP_STEPS = 4000

print(f"Using device: {DEVICE}")


Using device: cuda


## Load data

In [8]:
def load_file(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip().split() for line in f]

# Load datasets
train_en = load_file("./IWSLT/train.en")
train_vi = load_file("./IWSLT/train.vi")
test_en  = load_file("./IWSLT/test.en")
test_vi  = load_file("./IWSLT/test.vi")

print(f"Data loaded: {len(train_en)} training pairs, {len(test_en)} test pairs")



Data loaded: 133317 training pairs, 1268 test pairs


## Tạo vocab và tiền xử lý dữ liệu

In [9]:
def make_vocab(train_data, min_freq):
    vocab = Counter(token for tokens in train_data for token in tokens)
    vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
    vocabidx = {tok: i for i, (tok, _) in enumerate(vocablist)}

    for token, freq in vocab.items():
        if freq >= min_freq:
            idx = len(vocablist)
            vocablist.append((token, freq))
            vocabidx[token] = idx
    return vocablist, vocabidx

vocablist_en, vocabidx_en = make_vocab(train_en, 3)
vocablist_vi, vocabidx_vi = make_vocab(train_vi, 3)

VOCAB_DIR = "saved_vocab"

os.makedirs(VOCAB_DIR, exist_ok=True)
with open(os.path.join(VOCAB_DIR, "vocab_en.pkl"), "wb") as f:
    pickle.dump({"list": vocablist_en, "idx": vocabidx_en}, f)
with open(os.path.join(VOCAB_DIR, "vocab_vi.pkl"), "wb") as f:
    pickle.dump({"list": vocablist_vi, "idx": vocabidx_vi}, f)

print(f"Vocab EN & VI saved to {VOCAB_DIR}")

def preprocess(data, vocabidx):
    return [['<cls>'] + [tok if tok in vocabidx else '<unk>' for tok in sent] + ['<eos>'] for sent in data]

train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep  = preprocess(test_en, vocabidx_en)


Vocab EN & VI saved to saved_vocab


## Chuẩn bị batch dữ liệu

In [10]:
train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key=lambda x: (len(x[0]), len(x[1])))
test_data = list(zip(test_en_prep, test_en, test_vi))

def padding_batch(batch):
    maxlen = max(len(seq) for seq in batch)
    for seq in batch:
        seq.extend(['<pad>'] * (maxlen - len(seq)))

for ben, bvi in train_data:
    padding_batch([ben])
    padding_batch([bvi])

train_data = [
    ([vocabidx_en[t] for t in ben], [vocabidx_vi[t] for t in bvi])
    for ben, bvi in train_data
]

test_data = [
    ([vocabidx_en[t] for t in enprep], en, vi)
    for enprep, en, vi in test_data
]


## Positional encoding và Label Smoothing

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2], pe[:, 1::2] = torch.sin(position * div_term), torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        return self.dropout(x + self.pe[:x.size(0)])

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1, ignore_index=None):
        super().__init__()
        self.confidence, self.smoothing, self.cls, self.dim, self.ignore_index = 1.0 - smoothing, smoothing, classes, dim, ignore_index

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        if self.smoothing == 0:
            return F.nll_loss(pred, target, ignore_index=self.ignore_index)
        with torch.no_grad():
            true_dist = torch.full_like(pred, self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
            if self.ignore_index is not None:
                true_dist.masked_fill_(target.eq(self.ignore_index).unsqueeze(1), 0)
        loss = torch.sum(-true_dist * pred, dim=self.dim)
        if self.ignore_index is not None:
            non_pad = (~target.eq(self.ignore_index)).sum()
            return loss.sum() / non_pad
        return loss.mean()


## Transformer model

In [12]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=8,
                 num_encoder_layers=4, num_decoder_layers=4, 
                 dim_feedforward=1024, dropout=0.1, max_len=1000):
        super().__init__()
        self.d_model = d_model
        self.src_embed = nn.Embedding(src_vocab_size, d_model, padding_idx=1)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model, padding_idx=1)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len)
        enc_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation='gelu', norm_first=True)
        dec_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation='gelu', norm_first=True)
        self.encoder = nn.TransformerEncoder(enc_layer, num_encoder_layers)
        self.decoder = nn.TransformerDecoder(dec_layer, num_decoder_layers)
        self.layer_norm = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self._init_weights()

    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src = self.pos_encoder(self.src_embed(src) * math.sqrt(self.d_model))
        tgt = self.pos_encoder(self.tgt_embed(tgt) * math.sqrt(self.d_model))
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=src_key_padding_mask)
        return self.fc_out(self.layer_norm(output))

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz), diagonal=1).bool()


## Warmup Scheduler

In [13]:
class WarmupScheduler:
    def __init__(self, optimizer, d_model, warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self._step = 0

    def step(self):
        self._step += 1
        lr = self._get_lr()
        for p in self.optimizer.param_groups:
            p['lr'] = lr
        self.optimizer.step()

    def _get_lr(self):
        return (self.d_model ** -0.5) * min(
            self._step ** -0.5,
            self._step * (self.warmup_steps ** -1.5)
        )


## Tính BLEU Score

In [14]:
def evaluate_bleu(model, test_data, vocabidx_en, vocabidx_vi, vocablist_vi, max_len=50):
    model.eval()
    predictions, references = [], []
    inv_vocab_vi = {i: t for i, (t, _) in enumerate(vocablist_vi)}

    with torch.no_grad():
        for en_idx, en_tokens, vi_tokens in tqdm(test_data, desc="Evaluating BLEU"):
            src = torch.tensor(en_idx).unsqueeze(1).to(DEVICE)
            src_mask = None
            src_key_padding_mask = src.eq(vocabidx_en['<pad>']).T

            memory = model.encoder(model.pos_encoder(model.src_embed(src) * math.sqrt(model.d_model)),
                                   mask=src_mask, src_key_padding_mask=src_key_padding_mask)

            tgt_tokens = [vocabidx_vi['<cls>']]
            for _ in range(max_len):
                tgt = torch.tensor(tgt_tokens).unsqueeze(1).to(DEVICE)
                tgt_mask = model.generate_square_subsequent_mask(tgt.size(0)).to(DEVICE)
                out = model.decoder(model.pos_encoder(model.tgt_embed(tgt) * math.sqrt(model.d_model)),
                                    memory, tgt_mask=tgt_mask,
                                    memory_key_padding_mask=src_key_padding_mask)
                prob = model.fc_out(model.layer_norm(out))
                next_token = prob[-1].argmax(dim=-1).item()
                tgt_tokens.append(next_token)
                if next_token == vocabidx_vi['<eos>']:
                    break

            pred_sentence = [inv_vocab_vi[idx] for idx in tgt_tokens[1:-1]]
            predictions.append(pred_sentence)
            references.append([vi_tokens])

    smoothie = SmoothingFunction().method4
    bleu = corpus_bleu(references, predictions, smoothing_function=smoothie)
    return bleu, predictions


## Training loop

In [89]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch, pad_idx_src, pad_idx_tgt):
    src_batch, tgt_batch = [], []
    for src, tgt in batch:
        src_batch.append(torch.tensor(src, dtype=torch.long))
        tgt_batch.append(torch.tensor(tgt, dtype=torch.long))
    src_batch = pad_sequence(src_batch, padding_value=pad_idx_src)
    tgt_batch = pad_sequence(tgt_batch, padding_value=pad_idx_tgt)
    return src_batch, tgt_batch



def train_model(model, train_data, test_data, optimizer, scheduler, criterion, epochs):
    best_bleu = 0.0
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        random.shuffle(train_data)

        for i in tqdm(range(0, len(train_data), BATCHSIZE), desc=f"Epoch {epoch}"):
            batch = train_data[i:i+BATCHSIZE]
            src_batch, tgt_batch = collate_batch(
                batch, vocabidx_en['<pad>'], vocabidx_vi['<pad>']
            )

            src_batch, tgt_batch = src_batch.to(DEVICE), tgt_batch.to(DEVICE)
            tgt_input = tgt_batch[:-1, :]
            tgt_output = tgt_batch[1:, :]

            src_mask = None
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(0)).to(DEVICE)

            src_key_padding_mask = src_batch.eq(vocabidx_en['<pad>']).T
            tgt_key_padding_mask = tgt_input.eq(vocabidx_vi['<pad>']).T

            optimizer.zero_grad()
            output = model(src_batch, tgt_input, src_mask, tgt_mask,
                           src_key_padding_mask, tgt_key_padding_mask)
            output = output.reshape(-1, output.size(-1))
            tgt_output = tgt_output.reshape(-1)

            loss = criterion(output, tgt_output)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / (len(train_data) / BATCHSIZE)
        bleu, _ = evaluate_bleu(model, test_data, vocabidx_en, vocabidx_vi, vocablist_vi)
        print(f"Epoch {epoch}: Loss={avg_loss:.4f}, BLEU={bleu*100:.2f}")

        if bleu > best_bleu:
            best_bleu = bleu
            torch.save({
                "model_state": model.state_dict(),
                "config": {
                    "src_vocab_size": len(vocablist_en),
                    "tgt_vocab_size": len(vocablist_vi),
                    "d_model": D_MODEL,
                    "nhead": NHEAD,
                    "num_encoder_layers": NUM_ENCODER_LAYERS,
                    "num_decoder_layers": NUM_DECODER_LAYERS,
                    "dim_feedforward": DIM_FEEDFORWARD,
                    "dropout": DROPOUT
                }
            }, MODELNAME)
            print(f"New best BLEU: {best_bleu*100:.2f}, best model saved!")

## Khởi tạo mô hình và Huấn luyện

In [15]:
model = Transformer(len(vocablist_en), len(vocablist_vi), D_MODEL, NHEAD,
                    NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                    DIM_FEEDFORWARD, DROPOUT).to(DEVICE)



In [None]:
optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
scheduler = WarmupScheduler(optimizer, D_MODEL, WARMUP_STEPS)
criterion = LabelSmoothingLoss(len(vocablist_vi), smoothing=LABEL_SMOOTHING,
                               ignore_index=vocabidx_vi['<pad>'])

train_model(model, train_data, test_data, optimizer, scheduler, criterion, EPOCH)


Epoch 1: 100%|██████████| 4167/4167 [08:52<00:00,  7.83it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [01:58<00:00, 10.71it/s]


Epoch 1: Loss=5.2593, BLEU=13.17
New best BLEU: 13.17, best model saved!


Epoch 2: 100%|██████████| 4167/4167 [08:37<00:00,  8.06it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:03<00:00, 10.27it/s]


Epoch 2: Loss=3.8291, BLEU=20.74
New best BLEU: 20.74, best model saved!


Epoch 3: 100%|██████████| 4167/4167 [08:36<00:00,  8.06it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:00<00:00, 10.50it/s]


Epoch 3: Loss=3.4488, BLEU=22.92
New best BLEU: 22.92, best model saved!


Epoch 4: 100%|██████████| 4167/4167 [09:02<00:00,  7.68it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:07<00:00,  9.93it/s]


Epoch 4: Loss=3.2710, BLEU=24.12
New best BLEU: 24.12, best model saved!


Epoch 5: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:09<00:00,  9.78it/s]


Epoch 5: Loss=3.1579, BLEU=25.05
New best BLEU: 25.05, best model saved!


Epoch 6: 100%|██████████| 4167/4167 [09:14<00:00,  7.52it/s] 
Evaluating BLEU: 100%|██████████| 1268/1268 [02:13<00:00,  9.53it/s]


Epoch 6: Loss=3.0777, BLEU=25.38
New best BLEU: 25.38, best model saved!


Epoch 7: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:14<00:00,  9.43it/s]


Epoch 7: Loss=3.0141, BLEU=25.79
New best BLEU: 25.79, best model saved!


Epoch 8: 100%|██████████| 4167/4167 [09:11<00:00,  7.56it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:11<00:00,  9.66it/s]


Epoch 8: Loss=2.9643, BLEU=25.87
New best BLEU: 25.87, best model saved!


Epoch 9: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:09<00:00,  9.78it/s]


Epoch 9: Loss=2.9214, BLEU=25.77


Epoch 10: 100%|██████████| 4167/4167 [09:12<00:00,  7.54it/s] 
Evaluating BLEU: 100%|██████████| 1268/1268 [02:13<00:00,  9.51it/s]


Epoch 10: Loss=2.8834, BLEU=26.38
New best BLEU: 26.38, best model saved!


Epoch 11: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:10<00:00,  9.74it/s]


Epoch 11: Loss=2.8519, BLEU=26.31


Epoch 12: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s] 
Evaluating BLEU: 100%|██████████| 1268/1268 [02:12<00:00,  9.53it/s]


Epoch 12: Loss=2.8237, BLEU=26.12


Epoch 13: 100%|██████████| 4167/4167 [09:14<00:00,  7.51it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:10<00:00,  9.75it/s]


Epoch 13: Loss=2.7981, BLEU=26.35


Epoch 14: 100%|██████████| 4167/4167 [09:11<00:00,  7.55it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:15<00:00,  9.35it/s]


Epoch 14: Loss=2.7743, BLEU=26.46
New best BLEU: 26.46, best model saved!


Epoch 15: 100%|██████████| 4167/4167 [09:11<00:00,  7.55it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:13<00:00,  9.49it/s]


Epoch 15: Loss=2.7523, BLEU=26.19


Epoch 16: 100%|██████████| 4167/4167 [09:13<00:00,  7.53it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:16<00:00,  9.27it/s]


Epoch 16: Loss=2.7322, BLEU=26.27


Epoch 17: 100%|██████████| 4167/4167 [09:12<00:00,  7.54it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:13<00:00,  9.47it/s]


Epoch 17: Loss=2.7138, BLEU=26.40


Epoch 18: 100%|██████████| 4167/4167 [09:11<00:00,  7.55it/s]  
Evaluating BLEU: 100%|██████████| 1268/1268 [02:13<00:00,  9.52it/s]


Epoch 18: Loss=2.6971, BLEU=26.48
New best BLEU: 26.48, best model saved!


Epoch 19: 100%|██████████| 4167/4167 [09:13<00:00,  7.52it/s] 
Evaluating BLEU: 100%|██████████| 1268/1268 [02:09<00:00,  9.77it/s]


Epoch 19: Loss=2.6812, BLEU=25.93


Epoch 20: 100%|██████████| 4167/4167 [09:12<00:00,  7.54it/s] 
Evaluating BLEU: 100%|██████████| 1268/1268 [02:08<00:00,  9.86it/s]

Epoch 20: Loss=2.6652, BLEU=26.43





## Kết quả cuối cùng

#### Kết quả BLEU score trên tập IWSLT

In [None]:
checkpoint = torch.load("NMT_transformer.model")
model.load_state_dict(checkpoint['model_state'])
bleu, predictions = evaluate_bleu(model, test_data, vocabidx_en, vocabidx_vi, vocablist_vi)

Evaluating BLEU: 100%|██████████| 1268/1268 [01:52<00:00, 11.27it/s]

Final BLEU Score: 26.48





In [20]:
print(f"Final BLEU Score on IWSLT data: {bleu*100:.2f}")

Final BLEU Score on IWSLT data: 26.48


#### Kết quả BLEU score trên tập data của thầy để so sánh với mô hình sau finetune

In [7]:
test_en_path  = "./Released Corpus/test.en.txt"
test_vi_path  = "./Released Corpus/test.vi.txt"

test_en_new  = load_file(test_en_path)
test_vi_new  = load_file(test_vi_path)

test_en_prep_new  = preprocess(test_en_new, vocabidx_en)
test_vi_prep_new  = preprocess(test_vi_new, vocabidx_vi)

test_data_new = [
    (
        [vocabidx_en[token] for token in en_tokens],
        en_original,
        vi_original
    )
    for en_tokens, en_original, vi_original in zip(test_en_prep_new, test_en_new, test_vi_new)
]

print(f"Loaded {len(test_data_new)} new test pairs for evaluation.")

Loaded 3000 new test pairs for evaluation.


In [21]:
model.load_state_dict(checkpoint['model_state'])
bleu, predictions = evaluate_bleu(model, test_data_new, vocabidx_en, vocabidx_vi, vocablist_vi)
print(f"Final BLEU Score on Released Corpus data: {bleu*100:.2f}")

Evaluating BLEU: 100%|██████████| 3000/3000 [05:14<00:00,  9.53it/s]


Final BLEU Score on Released Corpus data: 8.49
