## Import libraries and model configuration

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
import random
import pickle
import os
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import Counter


In [3]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

D_MODEL = 256 
NHEAD = 8
NUM_ENCODER_LAYERS = 4 
NUM_DECODER_LAYERS = 4 
DIM_FEEDFORWARD = 1024
DROPOUT = 0.1 

BATCHSIZE = 16
LR = 5e-5
EPOCH = 10
WARMUP_STEPS = 1000

VOCAB_DIR = "saved_vocab"
MODELNAME = "NMT_transformer.model"
FINETUNEMODELNAME = "NMT_transformer_finetune.model"

TRAIN_EN_PATH = "./Released Corpus/train.en.txt"
TRAIN_VI_PATH = "./Released Corpus/train.vi.txt"
TEST_EN_PATH  = "./Released Corpus/test.en.txt"
TEST_VI_PATH  = "./Released Corpus/test.vi.txt"


Using device: cuda


## Positional encoding và Label Smoothing

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2], pe[:, 1::2] = torch.sin(position * div_term), torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        return self.dropout(x + self.pe[:x.size(0)])

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1, ignore_index=None):
        super().__init__()
        self.confidence, self.smoothing, self.cls, self.dim, self.ignore_index = 1.0 - smoothing, smoothing, classes, dim, ignore_index

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        if self.smoothing == 0:
            return F.nll_loss(pred, target, ignore_index=self.ignore_index)
        with torch.no_grad():
            true_dist = torch.full_like(pred, self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
            if self.ignore_index is not None:
                true_dist.masked_fill_(target.eq(self.ignore_index).unsqueeze(1), 0)
        loss = torch.sum(-true_dist * pred, dim=self.dim)
        if self.ignore_index is not None:
            non_pad = (~target.eq(self.ignore_index)).sum()
            return loss.sum() / non_pad
        return loss.mean()


## Transformer model

In [11]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=8,
                 num_encoder_layers=4, num_decoder_layers=4, 
                 dim_feedforward=1024, dropout=0.1, max_len=1000):
        super().__init__()
        self.d_model = d_model
        self.src_embed = nn.Embedding(src_vocab_size, d_model, padding_idx=1)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model, padding_idx=1)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len)
        enc_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation='gelu', norm_first=True)
        dec_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation='gelu', norm_first=True)
        self.encoder = nn.TransformerEncoder(enc_layer, num_encoder_layers)
        self.decoder = nn.TransformerDecoder(dec_layer, num_decoder_layers)
        self.layer_norm = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self._init_weights()

    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src = self.pos_encoder(self.src_embed(src) * math.sqrt(self.d_model))
        tgt = self.pos_encoder(self.tgt_embed(tgt) * math.sqrt(self.d_model))
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=src_key_padding_mask)
        return self.fc_out(self.layer_norm(output))

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz), diagonal=1).bool()

## Warmup Scheduler

In [6]:
class WarmupScheduler:
    def __init__(self, optimizer, d_model, warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self._step = 0

    def step(self):
        self._step += 1
        lr = self._get_lr()
        for p in self.optimizer.param_groups:
            p['lr'] = lr
        self.optimizer.step()

    def _get_lr(self):
        return (self.d_model ** -0.5) * min(
            self._step ** -0.5,
            self._step * (self.warmup_steps ** -1.5)
        )


## Load vocab và tiền xử lý dữ liệu

In [6]:
def load_vocab(vocab_dir, file_name_en, file_name_vi):
    with open(os.path.join(vocab_dir, file_name_en), "rb") as f:
        vocab_en_data = pickle.load(f)
        vocablist_en, vocabidx_en = vocab_en_data["list"], vocab_en_data["idx"]
    with open(os.path.join(vocab_dir, file_name_vi), "rb") as f:
        vocab_vi_data = pickle.load(f)
        vocablist_vi, vocabidx_vi = vocab_vi_data["list"], vocab_vi_data["idx"]
    return vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi

def save_vocab(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi, vocab_dir):
    os.makedirs(vocab_dir, exist_ok=True)
    with open(os.path.join(vocab_dir, "vocab_en_new.pkl"), "wb") as f:
        pickle.dump({"list": vocablist_en, "idx": vocabidx_en}, f)
    with open(os.path.join(vocab_dir, "vocab_vi_new.pkl"), "wb") as f:
        pickle.dump({"list": vocablist_vi, "idx": vocabidx_vi}, f)
    print("Saved updated vocab to", vocab_dir)

In [4]:
def load_file(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip().split() for line in f]

def collect_new_tokens_topk(train_en_new, train_vi_new, test_en_new, test_vi_new,
                            vocabidx_en, vocabidx_vi, top_k_en=5000, top_k_vi=5000):
    def count_tokens(data_lists):
        counter = Counter()
        for data in data_lists:
            for sent in data:
                counter.update(sent)
        return counter

    freq_en = count_tokens([train_en_new, test_en_new])
    freq_vi = count_tokens([train_vi_new, test_vi_new])

    new_en_tokens = [token for token in freq_en if token not in vocabidx_en]
    new_vi_tokens = [token for token in freq_vi if token not in vocabidx_vi]

    new_en_tokens_topk = sorted(new_en_tokens, key=lambda t: freq_en[t], reverse=True)[:top_k_en]
    new_vi_tokens_topk = sorted(new_vi_tokens, key=lambda t: freq_vi[t], reverse=True)[:top_k_vi]

    print(f"Selected top-{top_k_en} new English tokens:", len(new_en_tokens_topk))
    print(f"Selected top-{top_k_vi} new Vietnamese tokens:", len(new_vi_tokens_topk))
    return new_en_tokens_topk, new_vi_tokens_topk

def preprocess_with_special_tokens(data, vocabidx):
    rr = []
    for tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append(tkl)
    return rr

In [9]:
def padding_train_data(train_data):
    batched = []
    for i in range(0, len(train_data), BATCHSIZE):
        batch = train_data[i:i+BATCHSIZE]
        en_batch = [x[0] for x in batch]
        vi_batch = [x[1] for x in batch]

        maxlen_en = max(len(seq) for seq in en_batch)
        maxlen_vi = max(len(seq) for seq in vi_batch)

        en_batch = [seq + [vocabidx_en['<pad>']]*(maxlen_en - len(seq)) for seq in en_batch]
        vi_batch = [seq + [vocabidx_vi['<pad>']]*(maxlen_vi - len(seq)) for seq in vi_batch]

        batched.append((en_batch, vi_batch))
    return batched

In [10]:
def init_new_embeddings_with_avg(old_embedding, new_vocab_size):
    old_num_embeddings, embedding_dim = old_embedding.weight.size()
    device = old_embedding.weight.device  

    new_embedding = nn.Embedding(new_vocab_size, embedding_dim).to(device)
    new_embedding.weight.data[:old_num_embeddings] = old_embedding.weight.data

    # Tính mean embedding của các token cũ (trừ <pad>)
    mask = torch.arange(old_num_embeddings, device=device) != 0
    mean_vec = old_embedding.weight.data[mask].mean(dim=0)

    # Khởi tạo token mới bằng mean_vec + noise
    for i in range(old_num_embeddings, new_vocab_size):
        new_embedding.weight.data[i] = mean_vec + torch.randn(embedding_dim, device=device) * (embedding_dim ** -0.5)

    return new_embedding


In [11]:
def expand_output_layer_smart(old_fc, new_vocab_size, d_model):
    old_out_features, old_in_features = old_fc.weight.size()
    device = old_fc.weight.device  

    new_fc = nn.Linear(d_model, new_vocab_size).to(device)
    new_fc.weight.data[:old_out_features] = old_fc.weight.data
    new_fc.bias.data[:old_out_features] = old_fc.bias.data

    mean_w = old_fc.weight.data.mean(dim=0)
    for i in range(old_out_features, new_vocab_size):
        new_fc.weight.data[i] = mean_w + torch.randn(d_model, device=device) * (d_model ** -0.5)
        new_fc.bias.data[i] = 0.0

    return new_fc


## Hàm greedy decoding và đánh giá BLEU

In [18]:
def greedy_decode(model, src, src_mask, src_key_padding_mask,
                  max_len=60, start_symbol=None, end_symbol=None, device=DEVICE):
    model.eval()
    with torch.no_grad():
        # Encode
        src_emb = model.pos_encoder(model.src_embed(src) * math.sqrt(model.d_model))
        memory = model.encoder(src_emb, mask=src_mask, src_key_padding_mask=src_key_padding_mask)

        ys = torch.tensor([[start_symbol]], dtype=torch.long, device=device) 
        for i in range(max_len - 1):
            tgt_emb = model.pos_encoder(model.tgt_embed(ys) * math.sqrt(model.d_model))
            tgt_mask = model.generate_square_subsequent_mask(tgt_emb.size(0)).to(device)
            out = model.decoder(tgt_emb, memory, tgt_mask=tgt_mask,
                                tgt_key_padding_mask=None,
                                memory_key_padding_mask=src_key_padding_mask)
            out = model.fc_out(model.layer_norm(out)) 
            prob = F.log_softmax(out[-1, 0], dim=-1)  
            next_token = torch.argmax(prob).unsqueeze(0).unsqueeze(0)  
            ys = torch.cat([ys, next_token], dim=0)
            if next_token.item() == end_symbol:
                break

    return ys.squeeze(1).tolist() 


def evaluate_bleu_greedy(model, test_data, vocabidx_en, vocabidx_vi,
                         max_len=60, device=DEVICE, print_samples=False, num_samples=5):
    refs = []
    hyps = []
    idx2word_vi = {v: k for k, v in vocabidx_vi.items()}
    idx2word_en = {v: k for k, v in vocabidx_en.items()}

    start_symbol = vocabidx_vi.get('<sos>', vocabidx_vi.get('<cls>', None))
    end_symbol = vocabidx_vi.get('<eos>', vocabidx_vi.get('</s>', vocabidx_vi.get('<eos>', None)))
    pad_vi = vocabidx_vi.get('<pad>', None)
    pad_en = vocabidx_en.get('<pad>', None)

    assert start_symbol is not None and end_symbol is not None, "Start or end token not found in target vocab."

    sample_outputs = []

    for idx, item in enumerate(test_data):
        if len(item) == 2:
            en_idx, vi_idx = item
            vi_orig_tokens = [idx2word_vi[t] for t in vi_idx if t not in (start_symbol, end_symbol, pad_vi)]
            en_orig_tokens = [idx2word_en[t] for t in en_idx if t != pad_en]
        else:
            en_idx, en_orig_tokens, vi_orig_tokens = item

        src = torch.tensor(en_idx, dtype=torch.long, device=device).unsqueeze(1)
        src_mask = torch.zeros((src.size(0), src.size(0)), device=device).bool()
        src_key_padding_mask = (src.transpose(0, 1) == pad_en)

        out_tokens = greedy_decode(model, src, src_mask, src_key_padding_mask,
                                   max_len=max_len, start_symbol=start_symbol, end_symbol=end_symbol, device=device)
        hyp_tokens = [idx2word_vi[t] for t in out_tokens if t not in (start_symbol, end_symbol, pad_vi)]

        refs.append([vi_orig_tokens])
        hyps.append(hyp_tokens)

        if print_samples and idx < num_samples:
            sample_outputs.append((
                " ".join(en_orig_tokens),
                " ".join(hyp_tokens),
                " ".join(vi_orig_tokens)
            ))

    smoothie = SmoothingFunction().method4
    bleu_score = corpus_bleu(refs, hyps, smoothing_function=smoothie)

    if print_samples:
        print("\n--- Sample Translations ---")
        for i, (src_sent, hyp_sent, ref_sent) in enumerate(sample_outputs, 1):
            print(f"[{i}] EN : {src_sent}")
            print(f"    PRED: {hyp_sent}")
            print(f"    REF : {ref_sent}")
            print()

    return bleu_score, [" ".join(h) for h in hyps]



## Hàm finetune

In [13]:
def finetune(model, train_batches, test_data, optimizer, scheduler, criterion,
             vocabidx_en, vocabidx_vi,
             epochs, save_path,
             freeze_epochs, max_decoding_len):

    device = next(model.parameters()).device
    best_bleu = 0.0
    init_lr = optimizer.defaults.get('lr', 5e-5)

    # Freeze all except embeddings, fc_out, layernorm
    for p in model.parameters():
        p.requires_grad = False
    model.src_embed.weight.requires_grad = True
    model.tgt_embed.weight.requires_grad = True
    model.fc_out.weight.requires_grad = True
    model.fc_out.bias.requires_grad = True
    for name, p in model.named_parameters():
        if "layer_norm" in name or "layernorm" in name.lower():
            p.requires_grad = True

    # Rebuild optimizer for trainable params
    optimizer = optim.Adam((p for p in model.parameters() if p.requires_grad),
                           lr=init_lr, betas=(0.9, 0.98), eps=1e-9)
    scheduler = WarmupScheduler(optimizer, model.d_model,
                                getattr(scheduler, 'warmup_steps', WARMUP_STEPS))

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0

        for en_batch, vi_batch in tqdm(train_batches, desc=f"Epoch {epoch}"):
            src = torch.tensor(en_batch, dtype=torch.long, device=device).transpose(0, 1)
            tgt = torch.tensor(vi_batch, dtype=torch.long, device=device).transpose(0, 1)

            tgt_input = tgt[:-1]
            tgt_output = tgt[1:]

            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(0)).to(device)
            src_key_padding_mask = (src.transpose(0, 1) == vocabidx_en['<pad>'])
            tgt_key_padding_mask = (tgt_input.transpose(0, 1) == vocabidx_vi['<pad>'])

            optimizer.zero_grad()
            output = model(src, tgt_input, tgt_mask=tgt_mask,
                           src_key_padding_mask=src_key_padding_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask)

            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        print(f"Epoch {epoch} - Loss: {total_loss/len(train_batches):.4f}")

        # Unfreeze all after freeze_epochs
        if epoch == freeze_epochs:
            print("=> Unfreezing all parameters")
            for p in model.parameters():
                p.requires_grad = True
            optimizer = optim.Adam(model.parameters(), lr=max(1e-6, init_lr/2),
                                   betas=(0.9, 0.98), eps=1e-9)
            scheduler = WarmupScheduler(optimizer, model.d_model,
                                        max(200, getattr(scheduler, 'warmup_steps', WARMUP_STEPS)//2))

        # Evaluate BLEU 
        bleu, _ = evaluate_bleu_greedy(
            model, test_data, vocabidx_en, vocabidx_vi,
            max_len=max_decoding_len, device=device
        )
        print(f"BLEU: {bleu*100:.2f}")

        if bleu > best_bleu:
            best_bleu = bleu
            torch.save({
                "model_state": model.state_dict(),
                "config": {
                    "src_vocab_size": len(vocabidx_en),
                    "tgt_vocab_size": len(vocabidx_vi),
                    "d_model": model.d_model,
                    "nhead": NHEAD,
                    "num_encoder_layers": NUM_ENCODER_LAYERS,
                    "num_decoder_layers": NUM_DECODER_LAYERS,
                    "dim_feedforward": DIM_FEEDFORWARD,
                    "dropout": DROPOUT
                }
            }, save_path)
            print(f"Saved best model with BLEU={best_bleu*100:.2f}")

## Thực hiện load data, vocab và xử lý

In [14]:
vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi = load_vocab(
    VOCAB_DIR, "vocab_en.pkl", "vocab_vi.pkl"
)

print("Loaded vocab sizes: EN =", len(vocablist_en), "VI =", len(vocablist_vi))

# load released corpus
train_en_new = load_file(TRAIN_EN_PATH)
train_vi_new = load_file(TRAIN_VI_PATH)
test_en_new  = load_file(TEST_EN_PATH)
test_vi_new  = load_file(TEST_VI_PATH)

# collect new tokens top-k
new_en_tokens, new_vi_tokens = collect_new_tokens_topk(
    train_en_new, train_vi_new, test_en_new, test_vi_new,
    vocabidx_en, vocabidx_vi,
    top_k_en=5000, top_k_vi=5000
)

for token in new_en_tokens:
    vocabidx_en[token] = len(vocablist_en)
    vocablist_en.append(token)
for token in new_vi_tokens:
    vocabidx_vi[token] = len(vocablist_vi)
    vocablist_vi.append(token)

print("After expansion: EN =", len(vocablist_en), "VI =", len(vocablist_vi))
save_vocab(vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi, VOCAB_DIR)

# preprocess to tokens with special tokens
train_en_prep_new = preprocess_with_special_tokens(train_en_new, vocabidx_en)
train_vi_prep_new = preprocess_with_special_tokens(train_vi_new, vocabidx_vi)
test_en_prep_new  = preprocess_with_special_tokens(test_en_new, vocabidx_en)
test_vi_prep_new  = preprocess_with_special_tokens(test_vi_new, vocabidx_vi)

# convert to indices 
train_data_new = [
    (
        [vocabidx_en[token] for token in en_tokens],
        [vocabidx_vi[token] for token in vi_tokens]
    )
    for en_tokens, vi_tokens in zip(train_en_prep_new, train_vi_prep_new)
]
print("Loaded", len(train_data_new), "training pairs")

# convert to indices and keep original tokens for reference
test_data_new = [
    (
        [vocabidx_en[token] for token in en_tokens],
        en_original,
        vi_original
    )
    for en_tokens, en_original, vi_original in zip(test_en_prep_new, test_en_new, test_vi_new)
]
print("Loaded", len(test_data_new), "test pairs")

train_batches_new = padding_train_data(train_data_new)
print("Padded into", len(train_batches_new), "batches")

Loaded vocab sizes: EN = 24420 VI = 10666
Selected top-5000 new English tokens: 5000
Selected top-5000 new Vietnamese tokens: 5000
After expansion: EN = 29420 VI = 15666
Saved updated vocab to saved_vocab
Loaded 500000 training pairs
Loaded 3000 test pairs
Padded into 31250 batches


## Load model đã pretrain và finetune

In [None]:
checkpoint = torch.load(MODELNAME, map_location=DEVICE)
config = checkpoint.get("config", None)

model = Transformer(config["src_vocab_size"], config["tgt_vocab_size"], config["d_model"], config["nhead"],
                    config["num_encoder_layers"], config["num_decoder_layers"],
                    config["dim_feedforward"], config["dropout"]).to(DEVICE)

state = checkpoint.get("model_state", checkpoint)
model.load_state_dict(state, strict=False)
print("Loaded pretrained model (strict=False)")

# expand embeddings, output layer if needed using smart init
old_src_vocab_size = model.src_embed.num_embeddings
old_tgt_vocab_size = model.tgt_embed.num_embeddings

if len(vocablist_en) > old_src_vocab_size:
    print(f"Expanding source embedding: {old_src_vocab_size} -> {len(vocablist_en)}")
    model.src_embed = init_new_embeddings_with_avg(model.src_embed, len(vocablist_en)).to(DEVICE)

if len(vocablist_vi) > old_tgt_vocab_size:
    print(f"Expanding target embedding: {old_tgt_vocab_size} -> {len(vocablist_vi)}")
    model.tgt_embed = init_new_embeddings_with_avg(model.tgt_embed, len(vocablist_vi)).to(DEVICE)

# expand output layer smartly
old_fc_out_weight = model.fc_out.weight.data
old_out_features, embedding_dim = old_fc_out_weight.size()
if len(vocablist_vi) > old_out_features:
    print(f"Expanding fc_out: {old_out_features} -> {len(vocablist_vi)}")
    model.fc_out = expand_output_layer_smart(model.fc_out, len(vocablist_vi), D_MODEL).to(DEVICE)

# optimizer, scheduler, criterion for finetune
FINETUNE_LR = 5e-5
FINETUNE_WARMUP = 1000
FREEZE_EPOCHS = 5
MAX_DECODE_LEN = 60

# create a dummy optimizer/scheduler to pass structure to finetune
_dummy_optimizer = optim.Adam(model.parameters(), lr=FINETUNE_LR, betas=(0.9, 0.98), eps=1e-9)
_dummy_scheduler = WarmupScheduler(_dummy_optimizer, D_MODEL, FINETUNE_WARMUP)
criterion = LabelSmoothingLoss(len(vocablist_vi), smoothing=0.05, ignore_index=vocabidx_vi['<pad>'])





Loaded pretrained model (strict=False)
Expanding source embedding: 24420 -> 29420
Expanding target embedding: 10666 -> 15666
Expanding fc_out: 10666 -> 15666


In [32]:
finetune(model, train_batches_new, test_data_new, _dummy_optimizer, _dummy_scheduler, criterion,
             vocabidx_en, vocabidx_vi,
             EPOCH, FINETUNEMODELNAME,
             FREEZE_EPOCHS, MAX_DECODE_LEN)


Epoch 1: 100%|██████████| 31250/31250 [15:59<00:00, 32.56it/s]  


Epoch 1 - Loss: 3.4822
BLEU: 23.60
Saved best model with BLEU=23.60


Epoch 2: 100%|██████████| 31250/31250 [14:41<00:00, 35.44it/s]


Epoch 2 - Loss: 3.1838
BLEU: 24.95
Saved best model with BLEU=24.95


Epoch 3: 100%|██████████| 31250/31250 [14:48<00:00, 35.17it/s]


Epoch 3 - Loss: 3.1132
BLEU: 25.36
Saved best model with BLEU=25.36


Epoch 4: 100%|██████████| 31250/31250 [14:43<00:00, 35.35it/s]


Epoch 4 - Loss: 3.0765
BLEU: 25.30


Epoch 5: 100%|██████████| 31250/31250 [14:44<00:00, 35.33it/s]


Epoch 5 - Loss: 3.0543
=> Unfreezing all parameters
BLEU: 25.59
Saved best model with BLEU=25.59


Epoch 6: 100%|██████████| 31250/31250 [18:04<00:00, 28.82it/s]


Epoch 6 - Loss: 3.4589
BLEU: 22.37


Epoch 7: 100%|██████████| 31250/31250 [18:04<00:00, 28.82it/s]


Epoch 7 - Loss: 3.0875
BLEU: 24.51


Epoch 8: 100%|██████████| 31250/31250 [18:05<00:00, 28.80it/s]


Epoch 8 - Loss: 2.9816
BLEU: 25.88
Saved best model with BLEU=25.88


Epoch 9: 100%|██████████| 31250/31250 [18:07<00:00, 28.74it/s]


Epoch 9 - Loss: 2.9190
BLEU: 26.66
Saved best model with BLEU=26.66


Epoch 10: 100%|██████████| 31250/31250 [18:04<00:00, 28.81it/s]


Epoch 10 - Loss: 2.8834
BLEU: 26.91
Saved best model with BLEU=26.91


## Kết quả cuối cùng

In [21]:
checkpoint = torch.load("NMT_transformer_finetune.model")
model.load_state_dict(checkpoint['model_state'])
bleu, _ = evaluate_bleu_greedy(
    model, test_data_new, vocabidx_en, vocabidx_vi,
    max_len=60, device=DEVICE, print_samples=True, num_samples=10
)
print(f"BLEU: {bleu*100:.2f}")



--- Sample Translations ---
[1] EN : Knowledge, practices in public health service utilization among health insurance card’s holders and influencing factors in Vientiane, Lao
    PRED: Kiến thức, thực hành về sử dụng dịch vụ y tế công cộng của người dân có tham gia bảo hiểm y tế và các yếu tố ảnh hưởng đến kiến thức, thực hành của người dân tại tỉnh dân <unk>
    REF : Thực trạng kiến thức và thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố ảnh hưởng tại tỉnh Viêng Chăn, CHDCND Lào, năm 2017

[2] EN : Describe knowledge, practices in public health service utilization among health insurance card's holders and influencing factors in Vientiane, Lao PDR, 2017.
    PRED: Mô tả kiến thức, thực hành về sử dụng dịch vụ y tế công cộng của người dân có tham gia bảo hiểm y tế và các yếu tố ảnh hưởng đến kiến thức, thực hành của người dân tại tỉnh <unk> năm 2017.
    REF : Mô tả thực trạng kiến thức, thực hành của người có thẻ bả

In [25]:
# Load datasets
test_en  = load_file("./IWSLT/test.en")
test_vi  = load_file("./IWSLT/test.vi")

print(f"Data loaded: {len(test_en)} test pairs")

test_en_prep  = preprocess_with_special_tokens(test_en, vocabidx_en)

test_data = [
    (
        [vocabidx_en[token] for token in en_tokens],
        en_original,
        vi_original
    )
    for en_tokens, en_original, vi_original in zip(test_en_prep, test_en, test_vi)
]

vocablist_en, vocabidx_en, vocablist_vi, vocabidx_vi = load_vocab(
    VOCAB_DIR, "vocab_en_new.pkl", "vocab_vi_new.pkl"
)

Data loaded: 1268 test pairs


In [26]:
checkpoint = torch.load(FINETUNEMODELNAME, map_location=DEVICE)
config = checkpoint.get("config", None)
model = Transformer(config["src_vocab_size"], config["tgt_vocab_size"], config["d_model"], config["nhead"],
                    config["num_encoder_layers"], config["num_decoder_layers"],
                    config["dim_feedforward"], config["dropout"]).to(DEVICE)

In [27]:
model.load_state_dict(checkpoint['model_state'])
bleu, _ = evaluate_bleu_greedy(
    model, test_data, vocabidx_en, vocabidx_vi,
    max_len=60, device=DEVICE, print_samples=True, num_samples=10
)
print(f"Final BLEU Score on Released Corpus data: {bleu*100:.2f}")


--- Sample Translations ---
[1] EN : When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;
    PRED: Khi I ít gặp nhất là <unk> I được cho là tốt nhất trên toàn thế giới và <unk> I tăng lên một bài báo được gọi là <unk> <unk> <unk>
    REF : Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;

[2] EN : And I was very proud .
    PRED: Và có thể rất dễ bị trầm cảm.
    REF : Tôi đã rất tự hào về đất nước tôi .

[3] EN : In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about the outside world , except that America , South Korea , Japan are the enemies .
    PRED: Trong trường học, chúng tôi đã dành nhiều thời gian nghiên cứu tiền sử của <unk> tuy nhiên chúng tôi chưa bao giờ học về bên ngoài thế giới ngoài thế giới và ngoại trừ <unk> <unk> <unk> <unk> là n