1. Setup & Imports

In [1]:
!pip install -q tokenizers sacrebleu

import os
import time
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from tqdm import tqdm
import sacrebleu

# Setup Device & Seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


2. CONFIGURATION

In [2]:
class Config:
    # --- Paths ---
    TRAIN_EN = '/kaggle/input/train-and-test/train.en.txt'
    TRAIN_VI = '/kaggle/input/train-and-test/train.vi.txt'
    TEST_EN = '/kaggle/input/train-and-test/public_test.en.txt'
    TEST_VI = '/kaggle/input/train-and-test/public_test.vi.txt'
    
    # --- Checkpoint Settings ---
    # Đặt True nếu bạn muốn load model cũ để train tiếp
    RESUME = True 
    # Đường dẫn file checkpoint cũ (nếu RESUME=True). 
    # Ví dụ: '/kaggle/input/my-previous-run/vlsp_checkpoint_last.pth'
    RESUME_PATH = '/kaggle/input/checkpoint-1-vlsp/vlsp_checkpoint_last.pth' 
    
    # Nơi lưu checkpoint mới
    SAVE_PATH = '/kaggle/working/vlsp_checkpoint_last.pth'
    BEST_MODEL_PATH = '/kaggle/working/vlsp_best_model.pth'

    # --- Model Args ---
    SRC_VOCAB_SIZE = 32000 
    TGT_VOCAB_SIZE = 32000
    D_MODEL = 512
    N_LAYERS = 6 
    HEADS = 8
    FF_DIM = 2048
    DROPOUT = 0.1
    MAX_LEN = 128 
    
    # --- Training Args ---
    BATCH_SIZE = 32 
    EPOCHS = 5      # Tổng số epoch muốn chạy
    LABEL_SMOOTHING = 0.1
    LR = 0.0001

config = Config()

3. DATA PROCESSING & TOKENIZER

In [3]:
def load_text_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def train_bpe_tokenizer(files, vocab_size=30000, model_name="bpe"):
    # Kiểm tra nếu đã có file tokenizer thì load lại đỡ phải train (tùy chọn)
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, 
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        show_progress=True
    )
    tokenizer.train(files, trainer)
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer

print("Loading data...")
train_src = load_text_file(config.TRAIN_EN)
train_tgt = load_text_file(config.TRAIN_VI)
test_src = load_text_file(config.TEST_EN)
test_tgt = load_text_file(config.TEST_VI)

# Train Tokenizers (Mỗi lần chạy lại nên train lại để đảm bảo khớp vocab, 
# trừ khi bạn lưu tokenizer ra file json riêng)
tokenizer_src = train_bpe_tokenizer([config.TRAIN_EN], config.SRC_VOCAB_SIZE, "src")
tokenizer_tgt = train_bpe_tokenizer([config.TRAIN_VI], config.TGT_VOCAB_SIZE, "tgt")

SOS_IDX = tokenizer_tgt.token_to_id("[SOS]")
EOS_IDX = tokenizer_tgt.token_to_id("[EOS]")
PAD_IDX = tokenizer_tgt.token_to_id("[PAD]")

class MedicalTranslationDataset(Dataset):
    def __init__(self, src_sents, tgt_sents, tok_src, tok_tgt, max_len):
        self.src_sents = src_sents
        self.tgt_sents = tgt_sents
        self.tok_src = tok_src
        self.tok_tgt = tok_tgt
        self.max_len = max_len

    def __len__(self): return len(self.src_sents)

    def __getitem__(self, idx):
        src_ids = self.tok_src.encode(self.src_sents[idx]).ids[:self.max_len-2]
        tgt_ids = self.tok_tgt.encode(self.tgt_sents[idx]).ids[:self.max_len-2]
        
        src_tensor = torch.tensor([SOS_IDX] + src_ids + [EOS_IDX], dtype=torch.long)
        tgt_tensor = torch.tensor([SOS_IDX] + tgt_ids + [EOS_IDX], dtype=torch.long)
        return src_tensor, tgt_tensor

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

train_dataset = MedicalTranslationDataset(train_src, train_tgt, tokenizer_src, tokenizer_tgt, config.MAX_LEN)
# Split Validation
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_set, val_set = torch.utils.data.random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_set, batch_size=config.BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader = DataLoader(val_set, batch_size=config.BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)

Loading data...








4. TRANSFORMER MODEL ARCHITECTURE (Optimized)

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.d_model, self.heads, self.head_dim = d_model, heads, d_model // heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        k = self.k_linear(k).view(bs, -1, self.heads, self.head_dim).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.heads, self.head_dim).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None: scores = scores.masked_fill(mask == 0, -1e9)
        attention = self.dropout(torch.softmax(scores, dim=-1))
        output = torch.matmul(attention, v).transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out_linear(output)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, ff_dim, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, ff_dim), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(ff_dim, d_model)
        )
    def forward(self, x): return self.net(x)

class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, ff_dim, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, heads, dropout)
        self.ff = PositionwiseFeedForward(d_model, ff_dim, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.attn(x, x, x, mask)))
        return self.norm2(x + self.dropout(self.ff(x)))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, ff_dim, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, heads, dropout)
        self.ff = PositionwiseFeedForward(d_model, ff_dim, dropout)
        self.norm1, self.norm2, self.norm3 = [nn.LayerNorm(d_model) for _ in range(3)]
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_out, src_mask, trg_mask):
        x = self.norm1(x + self.dropout(self.attn(x, x, x, trg_mask)))
        x = self.norm2(x + self.dropout(self.cross_attn(x, enc_out, enc_out, src_mask)))
        return self.norm3(x + self.dropout(self.ff(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x): return self.dropout(x + self.pe[:, :x.size(1)])

class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, N, heads, ff_dim, dropout, max_len):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len, dropout)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads, ff_dim, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads, ff_dim, dropout) for _ in range(N)])
        self.fc = nn.Linear(d_model, tgt_vocab)
        self.d_model = d_model
        self._init_weights()

    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)

    def make_src_mask(self, src): return (src != PAD_IDX).unsqueeze(1).unsqueeze(2)
    
    def make_trg_mask(self, trg):
        mask = (trg != PAD_IDX).unsqueeze(1).unsqueeze(2)
        seq_len = trg.size(1)
        subsequent_mask = torch.tril(torch.ones((seq_len, seq_len), device=trg.device)).bool()
        return mask & subsequent_mask

    def forward(self, src, trg):
        src_mask, trg_mask = self.make_src_mask(src), self.make_trg_mask(trg)
        enc = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        for layer in self.encoder: enc = layer(enc, src_mask)
        dec = self.pos_encoder(self.tgt_embedding(trg) * math.sqrt(self.d_model))
        for layer in self.decoder: dec = layer(dec, enc, src_mask, trg_mask)
        return self.fc(dec)

5. SAVE & LOAD CHECKPOINT UTILS

In [5]:
def save_checkpoint(state, filename):
    print(f"=> Saving checkpoint to {filename}")
    torch.save(state, filename)

def load_checkpoint(checkpoint_path, model, optimizer):
    print(f"=> Loading checkpoint from {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch'] + 1
    best_loss = checkpoint.get('best_loss', float('inf'))
    print(f"=> Loaded checkpoint (epoch {checkpoint['epoch']})")
    return start_epoch, best_loss

6. TRAINING LOOP WITH RESUME SUPPORT

In [6]:
model = Transformer(
    config.SRC_VOCAB_SIZE, config.TGT_VOCAB_SIZE, config.D_MODEL, 
    config.N_LAYERS, config.HEADS, config.FF_DIM, config.DROPOUT, config.MAX_LEN
).to(device)

optimizer = optim.Adam(model.parameters(), lr=config.LR, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=config.LABEL_SMOOTHING)

# Variables for tracking
start_epoch = 0
best_valid_loss = float('inf')

# Logic resume
if config.RESUME:
    if os.path.isfile(config.RESUME_PATH):
        start_epoch, best_valid_loss = load_checkpoint(config.RESUME_PATH, model, optimizer)
    else:
        print(f"=> No checkpoint found at '{config.RESUME_PATH}', starting from scratch")

# Functions train/eval
def train_step(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for src, tgt in tqdm(loader, desc="Train", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_in)
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def eval_step(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]
            output = model(src, tgt_in)
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

print(f"Starting training from epoch {start_epoch} to {config.EPOCHS}...")

for epoch in range(start_epoch, config.EPOCHS):
    start_time = time.time()
    train_loss = train_step(model, train_loader, optimizer, criterion)
    valid_loss = eval_step(model, val_loader, criterion)
    
    # Save checkpoint cho việc resume sau này (lưu đè lên file cũ để đỡ tốn dung lượng)
    # Lưu mọi thứ cần thiết
    checkpoint_state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'best_loss': best_valid_loss
    }
    save_checkpoint(checkpoint_state, config.SAVE_PATH)
    
    # Lưu best model riêng (chỉ cần weights để inference cho nhẹ)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), config.BEST_MODEL_PATH)
        print(f"--> NEW BEST MODEL: {valid_loss:.4f}")
    
    print(f"Epoch {epoch+1}/{config.EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {valid_loss:.4f} | Time: {(time.time()-start_time)/60:.1f}m")

=> Loading checkpoint from /kaggle/input/checkpoint-1-vlsp/vlsp_checkpoint_last.pth
=> Loaded checkpoint (epoch 0)
Starting training from epoch 1 to 5...


Train:   0%|          | 0/14063 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. D

=> Saving checkpoint to /kaggle/working/vlsp_checkpoint_last.pth
--> NEW BEST MODEL: 3.2254
Epoch 2/5 | Train Loss: 3.6096 | Val Loss: 3.2254 | Time: 109.2m


Train:   0%|          | 0/14063 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. D

=> Saving checkpoint to /kaggle/working/vlsp_checkpoint_last.pth
--> NEW BEST MODEL: 3.0012
Epoch 3/5 | Train Loss: 3.2164 | Val Loss: 3.0012 | Time: 109.2m


Train:   0%|          | 0/14063 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. D

=> Saving checkpoint to /kaggle/working/vlsp_checkpoint_last.pth
--> NEW BEST MODEL: 2.8888
Epoch 4/5 | Train Loss: 3.0313 | Val Loss: 2.8888 | Time: 109.3m


Train:   0%|          | 0/14063 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. D

=> Saving checkpoint to /kaggle/working/vlsp_checkpoint_last.pth
--> NEW BEST MODEL: 2.8102
Epoch 5/5 | Train Loss: 2.9172 | Val Loss: 2.8102 | Time: 109.2m


7. BEAM SEARCH & EVALUATION

In [7]:
def beam_search(model, src, beam_size=5, max_len=128):
    model.eval()
    src = src.unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src)
    with torch.no_grad():
        enc = model.pos_encoder(model.src_embedding(src) * math.sqrt(model.d_model))
        for layer in model.encoder: enc = layer(enc, src_mask)
    
    beam = [(0.0, [SOS_IDX])]
    for _ in range(max_len):
        candidates = []
        for score, seq in beam:
            if seq[-1] == EOS_IDX:
                candidates.append((score, seq))
                continue
            trg = torch.tensor([seq], device=device)
            trg_mask = model.make_trg_mask(trg)
            dec = model.pos_encoder(model.tgt_embedding(trg) * math.sqrt(model.d_model))
            for layer in model.decoder: dec = layer(dec, enc, src_mask, trg_mask)
            prob = torch.log_softmax(model.fc(dec)[:, -1, :], dim=-1).squeeze(0)
            topk_prob, topk_idx = torch.topk(prob, beam_size)
            for i in range(beam_size):
                candidates.append((score + topk_prob[i].item(), seq + [topk_idx[i].item()]))
        beam = sorted(candidates, key=lambda x: x[0], reverse=True)[:beam_size]
        if beam[0][1][-1] == EOS_IDX: break
    return beam[0][1]

def translate(text):
    ids = tokenizer_src.encode(text).ids
    src_tensor = torch.tensor([SOS_IDX] + ids + [EOS_IDX], dtype=torch.long)
    out_ids = beam_search(model, src_tensor)
    return tokenizer_tgt.decode([i for i in out_ids if i not in [SOS_IDX, EOS_IDX, PAD_IDX]])

# Load best model để evaluate
print("\nLoading Best Model for Evaluation...")
model.load_state_dict(torch.load(config.BEST_MODEL_PATH))

refs, hyps = [], []
print("Evaluating on Public Test (First 10 examples)...")
for i in range(10):
    src = test_src[i]
    tgt = test_tgt[i]
    pred = translate(src)
    refs.append([tgt])
    hyps.append(pred)
    print(f"Src: {src}\nRef: {tgt}\nHyp: {pred}\n{'-'*30}")

bleu = sacrebleu.corpus_bleu(hyps, refs)
print(f"BLEU Score: {bleu.score}")


Loading Best Model for Evaluation...
Evaluating on Public Test (First 10 examples)...
Src: Knowledge, practices in public health service utilization among health insurance card’s holders and influencing factors in Vientiane, Lao
Ref: Thực trạng kiến thức và thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố ảnh hưởng tại tỉnh Viêng Chăn, CHDCND Lào, năm 2017
Hyp: Kiến thức, thực hành về sử dụng dịch vụ y tế công cộng trong sử dụng bảo hiểm y tế và các yếu tố ảnh hưởng tại các tỉnh miền núi phía Bắc
------------------------------
Src: Describe knowledge, practices in public health service utilization among health insurance card's holders and influencing factors in Vientiane, Lao PDR, 2017.
Ref: Mô tả thực trạng kiến thức, thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố liên quan tại tỉnh Viêng Chăn, Cộng hoà Dân chủ Nhân dân Lào năm 2017.
Hyp: Mô tả kiế