In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm  # <--- THAY ƒê·ªîI QUAN TR·ªåNG
import math
import time
import os
from tqdm import tqdm

# --- C·∫§U H√åNH ƒê∆Ø·ªúNG D·∫™N (S·ª¨A L·∫†I CH·ªñ N√ÄY) ---
# Th∆∞ m·ª•c ch·ª©a c√°c file: train.bpe.en, train.bpe.vi, iwslt_bpe.model
INPUT_DIR = "/kaggle/input/iwslt-dataprocessing" 
BPE_MODEL_PATH = '/kaggle/input/tokenizer-iwslt/iwslt_bpe.model'

# C√°c file d·ªØ li·ªáu (ƒê√£ tokenize BPE ·ªü b∆∞·ªõc tr∆∞·ªõc)
TRAIN_SRC_FILE = '/kaggle/input/tokenizer-iwslt/train.bpe.vi'
TRAIN_TRG_FILE = '/kaggle/input/tokenizer-iwslt/train.bpe.en'
# (T·∫°m th·ªùi d√πng train l√†m valid n·∫øu b·∫°n ch∆∞a t·∫°o file valid.bpe ri√™ng)
VAL_SRC_FILE =  '/kaggle/input/tokenizer-iwslt/valid.bpe.vi'
VAL_TRG_FILE = '/kaggle/input/tokenizer-iwslt/valid.bpe.en'

# --- HYPERPARAMETERS ---
MAX_LEN = 256      # Gi·∫£m xu·ªëng ch√∫t cho nh·∫π n·∫øu c·∫ßn
BATCH_SIZE = 128
N_EPOCHS = 40
LEARNING_RATE = 0.0005
CLIP = 1

# Load SentencePiece ƒë·ªÉ l·∫•y th√¥ng s·ªë Vocab
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

INPUT_DIM = sp.get_piece_size() # L·∫•y t·ª± ƒë·ªông t·ª´ file model (kho·∫£ng 16000)
OUTPUT_DIM = sp.get_piece_size()
D_MODEL = 256
N_HEAD = 4
D_FF = 1024
N_LAYERS = 4
DROP_PROB = 0.3
PATIENCE = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Vocab Size loaded: {INPUT_DIM}")
print(f"Device: {device}")

Vocab Size loaded: 16000
Device: cuda


In [2]:
class TranslationDataset(Dataset):
    def __init__(self, src_path, trg_path, sp_model_path):
        # Load SentencePiece Model
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)
        
        # L·∫•y ID c·ªßa c√°c token ƒë·∫∑c bi·ªát
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.pad_id = self.sp.pad_id()
        
        print(f"‚è≥ ƒêang ƒë·ªçc d·ªØ li·ªáu t·ª´: {os.path.basename(src_path)} & {os.path.basename(trg_path)}...")
        with open(src_path, 'r', encoding='utf-8') as f:
            self.src_data = f.readlines()
        with open(trg_path, 'r', encoding='utf-8') as f:
            self.trg_data = f.readlines()
            
    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        # 1. L·∫•y d√≤ng text BPE (VD: "_hello _world")
        src_line = self.src_data[idx].strip()
        trg_line = self.trg_data[idx].strip()
        
        # 2. Chuy·ªÉn th√†nh List tokens (String) -> List IDs (Int)
        # L∆∞u √Ω: D√πng piece_to_id v√¨ text ƒë√£ ƒë∆∞·ª£c tokenize s·∫µn
        src_ids = self.sp.piece_to_id(src_line.split())
        trg_ids = self.sp.piece_to_id(trg_line.split())
        
        # 3. Th√™m BOS v√† EOS v√†o ƒë·∫ßu cu·ªëi
        src_tensor = torch.tensor([self.bos_id] + src_ids + [self.eos_id])
        trg_tensor = torch.tensor([self.bos_id] + trg_ids + [self.eos_id])
        
        return src_tensor, trg_tensor

In [3]:
# --- H√ÄM COLLATE (GOM BATCH) ---
class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        src, trg = zip(*batch)
        # Pad ƒë·ªÉ c√°c c√¢u trong batch b·∫±ng nhau
        src = pad_sequence(src, padding_value=self.pad_idx, batch_first=True)
        trg = pad_sequence(trg, padding_value=self.pad_idx, batch_first=True)
        return src, trg

In [4]:
# --- KH·ªûI T·∫†O DATASET & DATALOADER ---
# L·∫•y PAD ID t·ª´ model ƒë√£ load ·ªü cell 1
PAD_IDX = sp.pad_id() 

print("üîÑ ƒêang kh·ªüi t·∫°o Train Loader...")
train_ds = TranslationDataset(TRAIN_SRC_FILE, TRAIN_TRG_FILE, BPE_MODEL_PATH)
train_loader = DataLoader(
    train_ds, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=Collate(PAD_IDX), 
    num_workers=2
)

print("üîÑ ƒêang kh·ªüi t·∫°o Valid Loader...")
# L∆∞u √Ω: N·∫øu b·∫°n ch∆∞a c√≥ file valid ri√™ng, d√πng t·∫°m file train (ho·∫∑c split ra)
# ·ªû ƒë√¢y m√¨nh demo d√πng lu√¥n file config ·ªü cell 1
val_ds = TranslationDataset(VAL_SRC_FILE, VAL_TRG_FILE, BPE_MODEL_PATH)
valid_loader = DataLoader(
    val_ds, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=Collate(PAD_IDX), 
    num_workers=2
)

print(f"‚úÖ ƒê√£ s·∫µn s√†ng! Train size: {len(train_ds)}, Valid size: {len(val_ds)}")

üîÑ ƒêang kh·ªüi t·∫°o Train Loader...
‚è≥ ƒêang ƒë·ªçc d·ªØ li·ªáu t·ª´: train.bpe.vi & train.bpe.en...
üîÑ ƒêang kh·ªüi t·∫°o Valid Loader...
‚è≥ ƒêang ƒë·ªçc d·ªØ li·ªáu t·ª´: valid.bpe.vi & valid.bpe.en...
‚úÖ ƒê√£ s·∫µn s√†ng! Train size: 128068, Valid size: 1553


In [5]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)
            trg_input, trg_label = trg[:, :-1], trg[:, 1:]
            
            output = model(src, trg_input)
            output = output.contiguous().view(-1, output.shape[-1])
            trg_label = trg_label.contiguous().view(-1)
            
            loss = criterion(output, trg_label)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [6]:
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        emb = self.tok_emb(x) * math.sqrt(self.d_model)
        pos = self.pe[:, :x.size(1)]
        return self.dropout(emb + pos)

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.head_dim = d_model // n_head
        
        # ƒê·∫£m b·∫£o d_model chia h·∫øt cho s·ªë head
        assert self.head_dim * n_head == d_model, "d_model ph·∫£i chia h·∫øt cho n_head"

        # 1. C√°c l·ªõp Linear ƒë·ªÉ chi·∫øu Q, K, V
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        
        # L·ªõp Linear cu·ªëi c√πng sau khi n·ªëi c√°c head l·∫°i
        self.w_o = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        q, k, v shape: [Batch_Size, Seq_Len, d_model]
        mask shape: [Batch_Size, 1, 1, Seq_Len] ho·∫∑c [Batch_Size, 1, Seq_Len, Seq_Len]
        """
        batch_size = q.size(0)

        # 1. Chi·∫øu Q, K, V qua Linear layer
        # Sau ƒë√≥ t√°ch th√†nh n_head: [Batch, Seq, Head, Dim] -> [Batch, Head, Seq, Dim]
        # Transpose ƒë·ªÉ ƒë∆∞a chi·ªÅu Head l√™n tr∆∞·ªõc chi·ªÅu Seq -> ƒê·ªÉ nh√¢n ma tr·∫≠n song song c√°c head
        Q = self.w_q(q).view(batch_size, -1, self.n_head, self.head_dim).transpose(1, 2)
        K = self.w_k(k).view(batch_size, -1, self.n_head, self.head_dim).transpose(1, 2)
        V = self.w_v(v).view(batch_size, -1, self.n_head, self.head_dim).transpose(1, 2)

        # 2. T√≠nh Scaled Dot-Product Attention
        # Score = (Q * K^T) / sqrt(d_k)
        # K.transpose(-2, -1) l√† chuy·ªÉn v·ªã 2 chi·ªÅu cu·ªëi (Seq, Dim) -> (Dim, Seq)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        # 3. √Åp d·ª•ng Mask (N·∫øu c√≥)
        # Mask th∆∞·ªùng ch·ª©a 0 (che) v√† 1 (gi·ªØ). Ta thay v·ªã tr√≠ 0 b·∫±ng s·ªë √¢m v√¥ c√πng (-1e9)
        # ƒë·ªÉ khi qua Softmax n√≥ bi·∫øn th√†nh 0.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        # 4. Softmax ƒë·ªÉ ra x√°c su·∫•t
        attention_weights = torch.softmax(scores, dim=-1)
        
        # 5. Nh√¢n v·ªõi V
        # Output: [Batch, Head, Seq, Dim]
        output = torch.matmul(attention_weights, V)
        
        # 6. Gom c√°c head l·∫°i (Concatenate)
        # [Batch, Head, Seq, Dim] -> [Batch, Seq, Head, Dim] -> [Batch, Seq, d_model]
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        # 7. ƒêi qua l·ªõp Linear cu·ªëi c√πng
        return self.w_o(output)

In [8]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, drop_prob=0.1):
        super().__init__()
        # d_ff th∆∞·ªùng l·ªõn g·∫•p 4 l·∫ßn d_model (v√≠ d·ª•: 512 -> 2048)
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        # x: [Batch, Seq_Len, d_model]
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [9]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, drop_prob=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        
        self.ffn = PositionwiseFeedForward(d_model, d_ff, drop_prob)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, mask=None):
        # 1. Sub-layer 1: Self Attention
        # L∆∞u l·∫°i x ban ƒë·∫ßu ƒë·ªÉ c·ªông (Residual Connection)
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=mask) # Self-Attention: q=k=v=x
        x = self.dropout1(x)
        x = self.norm1(x + _x) # Add & Norm
        
        # 2. Sub-layer 2: Feed Forward
        _x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + _x) # Add & Norm
        
        return x

In [10]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, d_ff, n_layer, max_len, drop_prob, device):
        super().__init__()
        self.device = device
        
        # Embedding + Positional Encoding (ƒê√£ code ·ªü b√†i tr∆∞·ªõc)
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len, drop_prob)
        
        # Ch·ªìng N l·ªõp EncoderLayer
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_head, d_ff, drop_prob) 
            for _ in range(n_layer)
        ])
        
    def forward(self, src, mask=None):
        # src: [Batch, Seq_Len]
        x = self.embedding(src)
        
        # Cho ƒëi qua l·∫ßn l∆∞·ª£t t·ª´ng l·ªõp Encoder
        for layer in self.layers:
            x = layer(x, mask)
        
        return x

In [11]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_head, d_ff, drop_prob=0.1):
        super().__init__()
        
        # 1. Self Attention (C√≥ Mask che t∆∞∆°ng lai)
        self.self_attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        
        # 2. Cross Attention (Quan tr·ªçng: L·∫•y Key, Value t·ª´ Encoder)
        self.cross_attention = MultiHeadAttention(d_model, n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)
        
        # 3. Feed Forward
        self.ffn = PositionwiseFeedForward(d_model, d_ff, drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(drop_prob)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """
        trg: Input c·ªßa Decoder (c√¢u ti·∫øng Vi·ªát ƒëang d·ªãch d·ªü)
        enc_src: Output t·ª´ Encoder (c√¢u ti·∫øng Anh ƒë√£ hi·ªÉu xong)
        trg_mask: Mask che t∆∞∆°ng lai cho trg
        src_mask: Mask che padding cho src
        """
        # --- Block 1: Masked Self-Attention ---
        # Decoder t·ª± nh√¨n l·∫°i ch√≠nh n√≥ (nh∆∞ng kh√¥ng ƒë∆∞·ª£c nh√¨n t∆∞∆°ng lai)
        _trg = trg
        # Quan tr·ªçng: trg_mask d√πng ·ªü ƒë√¢y
        trg = self.self_attention(q=trg, k=trg, v=trg, mask=trg_mask)
        trg = self.dropout1(trg)
        trg = self.norm1(trg + _trg) # Add & Norm

        # --- Block 2: Cross-Attention (Encoder-Decoder Attention) ---
        # Decoder l·∫•y th√¥ng tin t·ª´ Encoder
        # Query (Q) ƒë·∫øn t·ª´ Decoder (trg)
        # Key (K) v√† Value (V) ƒë·∫øn t·ª´ Encoder (enc_src)
        _trg = trg
        # Quan tr·ªçng: src_mask d√πng ·ªü ƒë√¢y (ƒë·ªÉ kh√¥ng nh√¨n v√†o padding c·ªßa ti·∫øng Anh)
        trg = self.cross_attention(q=trg, k=enc_src, v=enc_src, mask=src_mask)
        trg = self.dropout2(trg)
        trg = self.norm2(trg + _trg)

        # --- Block 3: Feed Forward ---
        _trg = trg
        trg = self.ffn(trg)
        trg = self.dropout3(trg)
        trg = self.norm3(trg + _trg)

        return trg

In [12]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, d_ff, n_layer, max_len, drop_prob, device):
        super().__init__()
        self.device = device
        
        # Embedding ri√™ng cho Decoder (Ti·∫øng Vi·ªát)
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len, drop_prob)
        
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_head, d_ff, drop_prob)
            for _ in range(n_layer)
        ])
        
        # L·ªõp Linear cu·ªëi c√πng ƒë·ªÉ d·ª± ƒëo√°n t·ª´ ti·∫øp theo
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg: [Batch, Seq_Len]
        trg = self.embedding(trg)
        
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)
            
        # Output: [Batch, Seq_Len, Vocab_Size]
        output = self.fc_out(trg)
        return output

In [13]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        # src shape: [Batch, Src_Len]
        
        # T·∫°o mask cho v·ªã tr√≠ padding (True n·∫øu != pad, False n·∫øu == pad)
        # Ho·∫∑c ng∆∞·ª£c l·∫°i t√πy quy ∆∞·ªõc, ·ªü ƒë√¢y ta d√πng quy ∆∞·ªõc: 1 l√† gi·ªØ, 0 l√† che
        # unsqueeze(1) v√† (2) ƒë·ªÉ m·ªü r·ªông chi·ªÅu cho kh·ªõp v·ªõi Attention Heads
        # Shape mong mu·ªën: [Batch, 1, 1, Src_Len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        # trg shape: [Batch, Trg_Len]
        
        # 1. Padding Mask: Che c√°c v·ªã tr√≠ pad trong c√¢u ƒë√≠ch
        # Shape: [Batch, 1, 1, Trg_Len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        # 2. Look-ahead Mask: Ma tr·∫≠n tam gi√°c
        trg_len = trg.shape[1]
        # torch.tril t·∫°o ma tr·∫≠n tam gi√°c d∆∞·ªõi (s·ªë 1 ·ªü d∆∞·ªõi ƒë∆∞·ªùng ch√©o, s·ªë 0 ·ªü tr√™n)
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        
        # 3. K·∫øt h·ª£p c·∫£ 2: V·ª´a ph·∫£i kh√¥ng ph·∫£i pad, v·ª´a ph·∫£i n·∫±m trong tam gi√°c d∆∞·ªõi
        # Shape: [Batch, 1, Trg_Len, Trg_Len]
        trg_mask = trg_pad_mask & trg_sub_mask
        
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        """
        src: [Batch, Src_Len]
        trg: [Batch, Trg_Len] (L∆∞u √Ω: trg n√†y l√† Input cho Decoder, ƒë√£ b·ªè token cu·ªëi)
        """
        # 1. T·∫°o Mask
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        # 2. Ch·∫°y qua Encoder
        enc_src = self.encoder(src, src_mask)
        
        # 3. Ch·∫°y qua Decoder
        # L∆∞u √Ω: Decoder c·∫ßn c·∫£ src_mask ƒë·ªÉ tr√°nh Cross-Attention nh√¨n v√†o padding c·ªßa src
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        return output

In [14]:
def initialize_weights(m):
    """
    H√†m kh·ªüi t·∫°o tr·ªçng s·ªë Xavier (Glorot) Uniform.
    R·∫•t quan tr·ªçng ƒë·ªÉ Transformer h·ªôi t·ª• nhanh khi train t·ª´ ƒë·∫ßu.
    """
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

print("üöÄ Kh·ªüi t·∫°o Model M·ªöI (Train from scratch)...")

# 1. Kh·ªüi t·∫°o c√°c th√†nh ph·∫ßn
enc = Encoder(INPUT_DIM, D_MODEL, N_HEAD, D_FF, N_LAYERS, MAX_LEN, DROP_PROB, device)
dec = Decoder(OUTPUT_DIM, D_MODEL, N_HEAD, D_FF, N_LAYERS, MAX_LEN, DROP_PROB, device)

# 2. T·∫°o Model t·ªïng
# L∆∞u √Ω: PAD_IDX l·∫•y t·ª´ bi·∫øn sp.pad_id() ·ªü cell tr√™n
model = Transformer(enc, dec, PAD_IDX, PAD_IDX, device).to(device)

# # 3. √Åp d·ª•ng kh·ªüi t·∫°o tr·ªçng s·ªë (QUAN TR·ªåNG)
# model.apply(initialize_weights)
# print("‚ú® ƒê√£ kh·ªüi t·∫°o tham s·ªë ng·∫´u nhi√™n (Xavier Init).")

# 4. Ki·ªÉm tra s·ªë l∆∞·ª£ng tham s·ªë
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"üìä T·ªïng s·ªë tham s·ªë (Trainable Parameters): {count_parameters(model):,}")

üöÄ Kh·ªüi t·∫°o Model M·ªöI (Train from scratch)...
üìä T·ªïng s·ªë tham s·ªë (Trainable Parameters): 19,676,800


In [15]:
import math
import time

# --- C·∫§U H√åNH TRAIN TI·∫æP (PHASE 2) ---
PRETRAINED_PATH = '/kaggle/input/transformer-training-vi2en/transformer_best_en2vi_finetunedV2.pt'  # File g·ªëc (Epoch 40)
NEW_SAVE_PATH = 'transformer_small_vi2en_v2.pt'      # File m·ªõi (Epoch 50)
EXTRA_EPOCHS = 10

# 1. Load tr·ªçng s·ªë c≈©
print(f"üîÑ Loading weights from {PRETRAINED_PATH}...")
model.load_state_dict(torch.load(PRETRAINED_PATH, map_location=device))

# 2. Optimizer LR th·∫•p & c·ªë ƒë·ªãnh (cho Fine-tuning)
optimizer = optim.Adam(
    model.parameters(), 
    lr=0.0001,  # LR nh·ªè ƒë·ªÉ nh√≠ch t·ª´ t·ª´
    betas=(0.9, 0.98), 
    eps=1e-9,
    weight_decay=1e-4
)

criterion = nn.CrossEntropyLoss(
    ignore_index=PAD_IDX, 
    label_smoothing=0.1
)

# 3. Reset Best Loss (L·∫•y m·ªëc c·ªßa model c≈©, v√≠ d·ª• 3.4 ho·∫∑c 3.5)
best_valid_loss = 3.5 

print(f"üöÄ B·∫Øt ƒë·∫ßu Train th√™m {EXTRA_EPOCHS} epochs (No Progress Bar)...")
print(f"{'Epoch':^5} | {'Train Loss':^10} | {'Val Loss':^10} | {'Val PPL':^10} | {'Time':^10}")
print("-" * 55)

for epoch in range(EXTRA_EPOCHS):
    start_time = time.time()
    
    # --- TRAIN ---
    model.train()
    train_loss = 0
    
    # Loop tr·ª±c ti·∫øp qua loader, kh√¥ng d√πng tqdm
    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)
        trg_input, trg_label = trg[:, :-1], trg[:, 1:]
        
        optimizer.zero_grad()
        output = model(src, trg_input)
        output = output.contiguous().view(-1, output.shape[-1])
        trg_label = trg_label.contiguous().view(-1)
        
        loss = criterion(output, trg_label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        
        train_loss += loss.item()

    # --- EVALUATE ---
    # (ƒê·∫£m b·∫£o b·∫°n c√≥ h√†m evaluate tr·∫£ v·ªÅ loss trung b√¨nh)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    # T√≠nh to√°n ch·ªâ s·ªë
    avg_train_loss = train_loss / len(train_loader)
    valid_ppl = math.exp(valid_loss) if valid_loss < 100 else float('inf')
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    # --- IN K·∫æT QU·∫¢ ---
    print(f"{epoch+1:^5} | {avg_train_loss:^10.3f} | {valid_loss:^10.3f} | {valid_ppl:^10.3f} | {int(epoch_mins)}m {int(epoch_secs)}s")
    
    # --- SAVE ---
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), NEW_SAVE_PATH)
        print(f"      --> üíæ Saved Best V2 (Loss: {valid_loss:.3f})")

print("‚úÖ DONE FINE-TUNING!")

üîÑ Loading weights from /kaggle/input/transformer-training-vi2en/transformer_best_en2vi_finetunedV2.pt...
üöÄ B·∫Øt ƒë·∫ßu Train th√™m 10 epochs (No Progress Bar)...
Epoch | Train Loss |  Val Loss  |  Val PPL   |    Time   
-------------------------------------------------------
  1   |   3.377    |   3.422    |   30.615   | 3m 41s
      --> üíæ Saved Best V2 (Loss: 3.422)
  2   |   3.343    |   3.420    |   30.564   | 3m 41s
      --> üíæ Saved Best V2 (Loss: 3.420)
  3   |   3.330    |   3.408    |   30.210   | 3m 40s
      --> üíæ Saved Best V2 (Loss: 3.408)
  4   |   3.324    |   3.408    |   30.214   | 3m 41s
  5   |   3.319    |   3.404    |   30.073   | 3m 41s
      --> üíæ Saved Best V2 (Loss: 3.404)
  6   |   3.318    |   3.407    |   30.178   | 3m 40s
  7   |   3.318    |   3.405    |   30.110   | 3m 40s
  8   |   3.317    |   3.404    |   30.087   | 3m 40s
  9   |   3.318    |   3.408    |   30.190   | 3m 41s
 10   |   3.319    |   3.407    |   30.171   | 3m 41s
‚úÖ D