In [1]:
!pip install -q pyvi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# Config

In [2]:
# Đường dẫn dữ liệu
import torch
import os

data_path = '/kaggle/input/iwslt15-englishvietnamese/IWSLT\'15 en-vi/'
train_data_path = '/kaggle/input/iwslt15-englishvietnamese/IWSLT\'15 en-vi/'
saved_model_path = '/kaggle/working/'
saved_tokenizer_path = '/kaggle/working/'
test_data_path = 'data/test_data/'

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_SEQ_LEN = 60  # Độ dài tối đa của câu

seed = 42

# Huấn luyện mô hình
NUM_LAYERS = 6
D_MODEL = 512
D_FF = 2048
EPS = 0.1
BATCH_SIZE = 164 #---GA params---
NUM_HEADS = 8
EPOCHS = 30
DROPOUT = 0.2 #---GA params---
CLIP = 1.0
BATCH_PRINT = 100

#Learning rate
LEARNING_RATE = 1e-4 #---GA params---
DECAY_RATE = [1.3, 0.95]
DECAY_STEP = [3600]
DECAY_INTERVAL = 390
WEIGHT_DECAY = 1e-4 #---GA params---

UNKNOWN_TOKEN = '<unk>'
PAD_TOKEN = '<pad>'
START_TOKEN = '<start>'
END_TOKEN = '<end>'


PAD_TOKEN_POS = 0

#output
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

JSON_LOG_PATH = os.path.join(OUTPUT_DIR, "train.json")
CSV_LOG_PATH  = os.path.join(OUTPUT_DIR, "train.csv")

In [3]:
def log_epoch(record, json_path, csv_path):
    # ---- JSON ----
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            logs = json.load(f)
    else:
        logs = []

    logs.append(record)

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(logs, f, indent=2, ensure_ascii=False)

    # ---- CSV ----
    write_header = not os.path.exists(csv_path)
    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=record.keys())
        if write_header:
            writer.writeheader()
        writer.writerow(record)

# Loading & Preprocessing Data

In [4]:
from pyvi.ViTokenizer import ViTokenizer
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences

# Đọc dữ liệu từ tệp
def load_data(en_file, vi_file):
    with open(en_file, 'r', encoding='utf-8') as f:
        en_data = f.read().strip().split("\n")
    with open(vi_file, 'r', encoding='utf-8') as f:
        vi_data = f.read().strip().split("\n")
    return en_data, vi_data

def get_tokenize(data, add_start_end=False):
    # Khởi tạo Tokenizer
    tokenizer = Tokenizer(filters='', oov_token=UNKNOWN_TOKEN)
    if (add_start_end):
        tokenizer.fit_on_texts([START_TOKEN, END_TOKEN] + data)
    else:
        tokenizer.fit_on_texts(data)
    return data, tokenizer

def get_tokenize_seq(en_data, vi_data, en_tokenizer, vi_tokenizer, max_sequence_length):
    en_data = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in en_data]
    en_sequences = en_tokenizer.texts_to_sequences(en_data)

    vi_data = [ViTokenizer.tokenize(sentence) for sentence in vi_data]
    vi_sequences = vi_tokenizer.texts_to_sequences(vi_data)

    filtered_en = []
    filtered_vi = []
    # Giữ lại những câu có số từ <= max_sequence_length
    for i in range(len(en_sequences)):
        if (len(en_sequences[i]) <= max_sequence_length) and (len(vi_sequences[i]) <= max_sequence_length):
            filtered_en.append(en_sequences[i])
            filtered_vi.append(vi_sequences[i])

    filtered_en = torch.tensor(pad_sequences(filtered_en, maxlen=max_sequence_length, padding='post'), dtype=torch.long)
    filtered_vi = torch.tensor(pad_sequences(filtered_vi, maxlen=max_sequence_length, padding='post'), dtype=torch.long)

    return filtered_en, filtered_vi

# Tiền xử lý dữ liệu
def preprocess_tokenizer(en_data, vi_data):
    en_data, en_tokenizer = get_tokenize(en_data, add_start_end=True)

    vi_data = [ViTokenizer.tokenize(sentence) for sentence in vi_data]
    vi_data, vi_tokenizer = get_tokenize(vi_data)

    return en_tokenizer, vi_tokenizer

def preprocess_data(train_src_path, train_trg_path, val_src_path, val_trg_path):
    # Load dữ liệu
    en_data, vi_data = load_data(train_src_path, train_trg_path)
    en_data_val, vi_data_val = load_data(val_src_path, val_trg_path)

    en_tokenizer, vi_tokenizer = preprocess_tokenizer(en_data, vi_data)

    en_sequences, vi_sequences = get_tokenize_seq(en_data, vi_data, en_tokenizer, vi_tokenizer,
                                                  max_sequence_length=MAX_SEQ_LEN)
    en_val_sequences, vi_val_sequences = get_tokenize_seq(en_data_val, vi_data_val, en_tokenizer, vi_tokenizer,
                                                          max_sequence_length=MAX_SEQ_LEN)

    all_train_sequences = list(zip(vi_sequences, en_sequences))
    all_val_sequences = list(zip(vi_val_sequences, en_val_sequences))

    return en_tokenizer, vi_tokenizer, all_train_sequences, all_val_sequences

def merge_sentences(text, max_seq_length):
    sentences = [s.strip() for s in text.split(",")]  # Tách câu và xóa khoảng trắng dư thừa

    merged = []
    temp = ""
    word_count = 0

    for sentence in sentences:
        words = sentence.split()  # Đếm số từ trong câu hiện tại
        if word_count + len(words) <= max_seq_length:
            temp = temp + ", " + sentence if temp else sentence  # Nối câu
            word_count += len(words)  # Cập nhật số từ
        else:
            merged.append(temp)  # Lưu câu hiện tại vào danh sách
            temp = sentence  # Bắt đầu câu mới
            word_count = len(words)  # Reset số từ

    if temp:  # Đừng quên thêm câu cuối cùng
        merged.append(temp)

    return merged

2025-12-14 03:49:18.136591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765684158.322235      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765684158.395157      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

# Transformers

In [5]:
from torch import nn


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        # 1. dot product with weight matrices
        query, key, value = self.w_q(query), self.w_k(key), self.w_v(value)

        # 2. split tensor by number of heads
        query, key, value = self.split(query), self.split(key), self.split(value)

        # 3. do scale dot product to compute similarity
        out, attention = self.attention(query, key, value, mask=mask)

        # 4. concat and pass to linear layer
        out = self.concat(out)
        out = self.w_concat(out)

        # 5. visualize attention map
        # TODO : we should implement visualization
        return out

    def split(self, tensor):
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.num_heads
        tensor = tensor.view(batch_size, length, self.num_heads, d_tensor).transpose(1, 2)
        # it is similar with group convolution (split by number of heads)

        return tensor

    def concat(self, tensor):
        batch_size, num_heads, length, d_tensor = tensor.size()
        d_model = d_tensor * self.num_heads

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor

In [6]:
from torch.optim.lr_scheduler import _LRScheduler

class CustomLearningRateSchedule(_LRScheduler):
    def __init__(self, optimizer, initial_lr, decay_rates, decay_steps, lr_decay_interval, last_epoch=-1):
        """
        initial_lr: Learning rate ban đầu
        decay_rates: Danh sách hệ số decay (n phần tử)
        decay_steps: Danh sách step ứng với decay (n-1 phần tử)
        lr_decay_interval: Khoảng cách giữa các lần decay
        """
        assert len(decay_rates) - 1 == len(decay_steps), "Số lượng decay_steps phải ít hơn decay_rates một phần tử"

        self.initial_lr = initial_lr
        self.decay_rates = decay_rates
        self.decay_steps = decay_steps
        self.lr_decay_interval = lr_decay_interval
        self.prev_decay_step = 0

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self.last_epoch
        lr = self.initial_lr
        prev_decay_step = 0

        # Áp dụng các decay ban đầu
        for i in range(len(self.decay_steps)):
            decay_factor = self.decay_rates[i]
            num_intervals = max((min(step, self.decay_steps[i]) - prev_decay_step) // self.lr_decay_interval, 0)
            lr *= decay_factor ** num_intervals
            prev_decay_step = self.decay_steps[i]

        # Áp dụng decay cuối cùng mãi mãi
        decay_factor = self.decay_rates[-1]
        num_intervals = max((step - prev_decay_step) // self.lr_decay_interval, 0)
        lr *= decay_factor ** num_intervals

        return [lr for _ in self.base_lrs]  # Trả về danh sách cho từng group của optimizer

    def state_dict(self):
        return {
            "initial_lr": self.initial_lr,
            "decay_rates": self.decay_rates,
            "decay_steps": self.decay_steps,
            "lr_decay_interval": self.lr_decay_interval,
            "prev_decay_step": self.prev_decay_step
        }

    def load_state_dict(self, state_dict):
        self.initial_lr = state_dict["initial_lr"]
        self.decay_rates = state_dict["decay_rates"]
        self.decay_steps = state_dict["decay_steps"]
        self.lr_decay_interval = state_dict["lr_decay_interval"]
        self.prev_decay_step = state_dict["prev_decay_step"]

In [7]:
import torch
from torch import nn
import math

class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, mask=None):
        # input is 4 dimension tensor
        # [batch_size, num_heads, length, d_tensor]
        batch_size, num_heads, length, d_tensor = key.size()

        # 1. dot product Query with Key^T to compute similarity
        key_t = key.transpose(2, 3)
        score = (query @ key_t) / math.sqrt(d_tensor)

        # 2. apply masking (opt)
        if mask is not None:
            score = score.masked_fill(mask == 0, -100000000)

        # 3. pass them softmax to make [0, 1] range
        score = self.softmax(score)

        # 4. multiply with Value
        value = score @ value

        return value, score

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, device):
        """
           constructor of sinusoid encoding class

           :param d_model: dimension of model
           :param max_len: max sequence length
           :param device: hardware device setting
        """
        super(PositionalEncoding, self).__init__()

        # same size with input matrix (for adding with input matrix)
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False # we don't need to compute gradient

        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)

        _2i = torch.arange(0, d_model, 2, device=device).float()

        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # compute positional encoding to consider positional information of words

    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, dropout, device):
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD_TOKEN_POS)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.dropout(tok_emb + pos_emb)


In [8]:
from torch import nn

class EncoderLayer(nn.Module):
    def __init__(self, d_model, d_ff, num_heads, dropout):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model, eps=EPS)
        self.dropout1 = nn.Dropout(dropout)

        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model, eps=EPS)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        # 1. compute self attention
        _x = x
        x = self.attention(x, x, x, src_mask)

        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(_x + x)

        # 3. positionwise feed forward network
        _x = x
        x = self.ffn(x)

        # 4. add and norm
        x = self.dropout2(x)
        x = self.norm2(_x + x)

        return x

class Encoder(nn.Module):
    def __init__(self, inp_vocab_size, max_len, d_model, d_ff, num_heads, num_layers, dropout, device):
        super(Encoder, self).__init__()
        self.emb = TransformerEmbedding(inp_vocab_size, d_model, max_len, dropout, device=device)
        self.layers = nn.ModuleList([EncoderLayer(d_model, d_ff, num_heads, dropout) for _ in range(num_layers)])

    def forward(self, src, src_mask):
        x = self.emb(src)
        for layer in self.layers:
            x = layer(x, src_mask)

        return x

In [9]:
from torch import nn

class Decoder_Layer(nn.Module):
    def __init__(self, d_model, d_ff, num_heads, dropout):
        super(Decoder_Layer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model, eps=EPS)
        self.dropout1 = nn.Dropout(dropout)

        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.norm2 = nn.LayerNorm(d_model, eps=EPS)
        self.dropout2 = nn.Dropout(dropout)

        self.ffn = PositionwiseFeedForward(d_model, d_ff, DROPOUT)
        self.norm3 = nn.LayerNorm(d_model, eps=EPS)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_out, trg_mask, src_mask):
        # 1. compute self attention
        _x = x
        x = self.self_attn(x, x, x, mask=trg_mask)

        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(_x + x)

        if enc_out is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attn(x, enc_out, enc_out, mask=src_mask)

            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(_x + x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)

        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(_x + x)

        return x

class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, max_len, d_model, d_ff, num_heads, num_layers, dropout, device):
        super(Decoder, self).__init__()
        self.embedding = TransformerEmbedding(trg_vocab_size, d_model, max_len, dropout, device)
        self.layers = nn.ModuleList([Decoder_Layer(d_model, d_ff, num_heads, dropout) for i in range(num_layers)])
        self.linear = nn.Linear(d_model, trg_vocab_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.embedding(trg)

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        # pass to LM head
        output = self.linear(trg)

        return output


In [10]:
import torch
from torch import nn

class Transformer(nn.Module):
    def __init__(self, src_pad_idx, trg_pad_idx, inp_vocab_size, trg_vocab_size, d_model, num_heads, max_len, d_ff, num_layers, dropout, device):
        super(Transformer, self).__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

        self.encoder = Encoder(inp_vocab_size, max_len, d_model, d_ff, num_heads, num_layers, dropout, device)
        self.decoder = Decoder(trg_vocab_size, max_len, d_model, d_ff, num_heads, num_layers, dropout, device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_out, trg_mask, src_mask)
        return output

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(dim=1).unsqueeze(dim=2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(dim=1).unsqueeze(dim=3)
        trg_len = trg.shape[1]
        trg_look_ahead_mask = torch.tril(torch.ones(trg_len, trg_len)).bool().to(self.device)
        trg_mask = trg_pad_mask & trg_look_ahead_mask

        return trg_mask

# Training & Evaluation

In [11]:
import math
import time

from torch import nn, optim
from torch.utils.data import DataLoader

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

en_tokenizer, vi_tokenizer, all_train_sequences, all_val_sequences = preprocess_data(
                                                                            train_data_path + "train.en.txt", train_data_path + "train.vi.txt",
                                                                            data_path + "tst2013.en.txt", data_path + "tst2013.vi.txt")

# Create training and validation set batches.
train_batches = DataLoader(all_train_sequences, batch_size=BATCH_SIZE, shuffle=True)
val_batches = DataLoader(all_val_sequences, batch_size=BATCH_SIZE, shuffle=False)

# Kích thước từ vựng
en_vocab_size = len(en_tokenizer.word_index) + 1
vi_vocab_size = len(vi_tokenizer.word_index) + 1

# Initializing model
model = Transformer(
    src_pad_idx=PAD_TOKEN_POS,
    trg_pad_idx=PAD_TOKEN_POS,
    d_model=D_MODEL,
    inp_vocab_size=vi_vocab_size,
    trg_vocab_size=en_vocab_size,
    max_len=MAX_SEQ_LEN,
    d_ff=D_FF,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    device=DEVICE
).to(DEVICE)

print(f'The model has {count_parameters(model):,} trainable parameters')

# Tạo optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

scheduler = CustomLearningRateSchedule(
    optimizer=optimizer,
    initial_lr=LEARNING_RATE,
    decay_rates=DECAY_RATE,
    decay_steps=DECAY_STEP,
    lr_decay_interval=DECAY_INTERVAL
)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_POS)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_tokens = 0
    for (i, (src, trg)) in enumerate(iterator):
        src = src.to(model.device)  # Đưa src về cùng thiết bị với model
        trg = trg.to(model.device)  # Đưa trg về cùng thiết bị với model
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        # Tính norm của gradient trước khi clip
        grad_norm_before = torch.sqrt(sum(p.grad.norm()**2 for p in model.parameters() if p.grad is not None))
        # Clip gradient để tránh exploding gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Tính norm của gradient sau khi clip
        grad_norm_after = torch.sqrt(sum(p.grad.norm()**2 for p in model.parameters() if p.grad is not None))
        optimizer.step()
        scheduler.step()

        # Tính số lượng token đúng
        pred = output.argmax(dim=-1).view(-1)  # Lấy token có xác suất cao nhất
        mask = (trg != PAD_TOKEN_POS)  # Bỏ qua token padding
        correct = (pred == trg) & mask  # Đúng và không phải padding
        total_correct += correct.sum().item()
        total_tokens += mask.sum().item()
        
        epoch_loss += loss.item()
        if (i + 1) % BATCH_PRINT == 0:
            lr = optimizer.param_groups[0]['lr']
            print(f'Batch: {i+1}/{len(iterator)}, Loss: {loss.item():.4f}, Accuracy: {total_correct / total_tokens:.4f}, LR: {lr:.6f}, '
                  f'Grad Norm Before Clip: {grad_norm_before:.6f}, Grad Norm After Clip: {grad_norm_after:.6f}')
            
    return epoch_loss / len(iterator), total_correct / total_tokens

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    total_correct = 0
    total_tokens = 0
    with torch.no_grad():
        for (i, (src, trg)) in enumerate(iterator):
            src = src.to(model.device)  # Đưa src về cùng thiết bị với model
            trg = trg.to(model.device)  # Đưa trg về cùng thiết bị với model
            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:, 1:].contiguous().view(-1)

            # Tính số lượng token đúng
            pred = output.argmax(dim=-1).view(-1)  # Lấy token có xác suất cao nhất
            mask = (trg != PAD_TOKEN_POS)  # Bỏ qua token padding
            correct = (pred == trg) & mask  # Đúng và không phải padding
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()

            loss = criterion(output_reshape, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator), total_correct / total_tokens

def run(total_epoch, best_loss):
    train_losses, test_losses = [], []
    for step in range(total_epoch):
        print(f'Epoch: {step + 1}')
        start_time = time.time()
        train_loss, train_accuracy = train(model, train_batches, optimizer, criterion, CLIP)
        val_loss, val_accuracy = evaluate(model, val_batches, criterion)
        end_time = time.time()

        train_losses.append(train_loss)
        test_losses.append(val_loss)

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), f'{saved_model_path}/model-{val_loss:.3f}-{val_accuracy:.3f}.pt')

        # Log record
        log_record = {
            "epoch": step + 1,
            "train_loss": round(train_loss, 6),
            "train_accuracy": round(train_accuracy, 6),
            "train_ppl": round(math.exp(train_loss), 6),
            "val_loss": round(val_loss, 6),
            "val_accuracy": round(val_accuracy, 6),
            "val_ppl": round(math.exp(val_loss), 6),
            "epoch_time_sec": round(end_time - start_time, 2)
        }

        log_epoch(log_record, JSON_LOG_PATH, CSV_LOG_PATH)

        #Console
        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Accuracy: {train_accuracy:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {val_loss:.3f} | Val Accuracy: {val_accuracy:.3f} |  Val PPL: {math.exp(val_loss):7.3f}')

# run(total_epoch=EPOCHS, best_loss=float('inf'))

The model has 111,941,877 trainable parameters


In [16]:
from deap import base, creator, tools, algorithms
import random

def train_model(learning_rate, dropout, weight_decay, batch_size, quick_epoch=True):
    """
    Optimize performance with GA
    """
    # Tao dataloader voi batch_size
    train_loader = DataLoader(all_train_sequences, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(all_val_sequences, batch_size=batch_size,shuffle=False)

    #Khoi tao model Transformer voi dropout
    model = Transformer(
        src_pad_idx=PAD_TOKEN_POS,
        trg_pad_idx=PAD_TOKEN_POS,
        inp_vocab_size=vi_vocab_size,
        trg_vocab_size=en_vocab_size,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        max_len=MAX_SEQ_LEN,
        d_ff=D_FF,
        num_layers=NUM_LAYERS,
        dropout=dropout,
        device=DEVICE
    ).to(DEVICE)

    #khoi tao optim, scheduler voi learning_rate, weight_decay
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = CustomLearningRateSchedule(
        optimizer=optimizer,
        initial_lr=learning_rate,
        decay_rates=DECAY_RATE,
        decay_steps=DECAY_STEP,
        lr_decay_interval=DECAY_INTERVAL
    )

    criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_POS)

    epochs = 10
    for epoch in range(epochs):
        train(model, train_loader, optimizer, criterion, CLIP)
    # Đánh giá accuracy trên tập validation
    _, val_acc = evaluate(model, val_loader, criterion)
    return val_acc

In [17]:

# -------------------------------------------------
# 1. Define the fitness of an individual
# -------------------------------------------------
# Fitness: maximize validation accuracy

random.seed(seed)
torch.manual_seed(seed)

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


# -------------------------------------------------
# 2. Hyperparameter space
# -------------------------------------------------
def create_individual():
    return creator.Individual([
        random.uniform(1e-5, 5e-4),   # learning_rate
        random.uniform(0.05, 0.3),    # dropout
        random.choice([1e-5,2e-5,5e-5,1e-4,2e-4,5e-4]),     # weight_decay
        random.choice([128, 144, 160, 176, 164,192, 208, 224, 256])   # batch_size
    ])

def cx_individual(ind1, ind2):
    # lr (float)
    ind1[0], ind2[0] = tools.cxBlend(ind1[0:1], ind2[0:1], alpha=0.5)
    
    # dropout (float)
    ind1[1], ind2[1] = tools.cxBlend(ind1[1:2], ind2[1:2], alpha=0.5)

    # weight_decay (discrete)
    if random.random() < 0.5:
        ind1[2], ind2[2] = ind2[2], ind1[2]

    # batch_size (int)
    if random.random() < 0.5:
        ind1[3], ind2[3] = ind2[3], ind1[3]

    return ind1, ind2

def mut_individual(ind):
    # learning_rate
    if random.random() < 0.3:
        ind[0] *= random.uniform(0.5, 1.5)
        ind[0] = max(1e-5, min(ind[0], 5e-4))

    # dropout
    if random.random() < 0.3:
        ind[1] += random.uniform(-0.05, 0.05)
        ind[1] = max(0.05, min(ind[1], 0.3))

    # weight_decay
    if random.random() < 0.2:
        ind[2] = random.choice([1e-5,2e-5,5e-5,1e-4,2e-4,5e-4])

    # batch_size
    if random.random() < 0.2:
        ind[3] = random.choice([128, 144, 160, 176, 192, 208, 224, 256])

    return (ind,)

    
# -------------------------------------------------
# 3. Evaluation function (train few epochs)
# -------------------------------------------------
def evaluate(individual):
    lr, dropout, wd, bs = individual

    lr = float(lr)
    dropout = float(dropout)
    wd = float(wd)
    bs = int(bs)

    # Call your train() for 1–3 quick epochs
    val_acc = train_model(
        learning_rate=lr,
        dropout=dropout,
        weight_decay=wd,
        batch_size=bs,
        quick_epoch=True
    )

    return (val_acc,)


# -------------------------------------------------
# 4. GA setup
# -------------------------------------------------
toolbox = base.Toolbox()
toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", cx_individual)
toolbox.register("mutate",mut_individual)
toolbox.register("select", tools.selTournament, tournsize=3)

# -------------------------------------------------
# 5. Run GA
# -------------------------------------------------
def run_ga():
    pop = toolbox.population(n=20)   # 20 candidates
    ngen = 10                        # 10 generations

    algorithms.eaSimple(
        pop,
        toolbox,
        cxpb=0.5,      # crossover probability
        mutpb=0.3,     # mutation probability
        ngen=ngen,
        verbose=True
    )

    best = tools.selBest(pop, k=1)[0]
    print("Best hyperparameters:", best)
    return best

best_params = run_ga()



Batch: 100/795, Loss: 5.3139, Accuracy: 0.1699, LR: 0.000339, Grad Norm Before Clip: 0.865263, Grad Norm After Clip: 0.865263
Batch: 200/795, Loss: 4.7659, Accuracy: 0.2169, LR: 0.000339, Grad Norm Before Clip: 0.991328, Grad Norm After Clip: 0.991328
Batch: 300/795, Loss: 4.4438, Accuracy: 0.2527, LR: 0.000339, Grad Norm Before Clip: 1.213187, Grad Norm After Clip: 0.999999
Batch: 400/795, Loss: 4.2633, Accuracy: 0.2788, LR: 0.000339, Grad Norm Before Clip: 0.868490, Grad Norm After Clip: 0.868490
Batch: 500/795, Loss: 3.9699, Accuracy: 0.2991, LR: 0.000339, Grad Norm Before Clip: 1.059566, Grad Norm After Clip: 0.999999
Batch: 600/795, Loss: 3.8776, Accuracy: 0.3156, LR: 0.000339, Grad Norm Before Clip: 0.965310, Grad Norm After Clip: 0.965310
Batch: 700/795, Loss: 3.5537, Accuracy: 0.3298, LR: 0.000339, Grad Norm Before Clip: 0.921641, Grad Norm After Clip: 0.921641
Batch: 100/795, Loss: 3.3295, Accuracy: 0.4511, LR: 0.000339, Grad Norm Before Clip: 0.908435, Grad Norm After Clip: 0

KeyboardInterrupt: 

In [None]:
# # Huấn luyện mô hình cuối cùng với các siêu tham số tốt nhất
# BATCH_SIZE = best_bs
# LEARNING_RATE = best_lr
# WEIGHT_DECAY = best_wd
# DROPOUT = best_dropout

# # Tạo lại DataLoader với batch size tốt nhất
# train_batches = DataLoader(all_train_sequences, batch_size=BATCH_SIZE, shuffle=True)
# val_batches = DataLoader(all_val_sequences, batch_size=BATCH_SIZE, shuffle=False)

# # Khởi tạo lại mô hình với dropout tốt nhất
# model = Transformer(
#     src_pad_idx=PAD_TOKEN_POS,
#     trg_pad_idx=PAD_TOKEN_POS,
#     inp_vocab_size=vi_vocab_size,
#     trg_vocab_size=en_vocab_size,
#     d_model=D_MODEL,
#     num_heads=NUM_HEADS,
#     max_len=MAX_SEQ_LEN,
#     d_ff=D_FF,
#     num_layers=NUM_LAYERS,
#     dropout=DROPOUT,
#     device=DEVICE
# ).to(DEVICE)
# optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# scheduler = CustomLearningRateSchedule(
#     optimizer=optimizer,
#     initial_lr=LEARNING_RATE,
#     decay_rates=DECAY_RATE,
#     decay_steps=DECAY_STEP,
#     lr_decay_interval=DECAY_INTERVAL
# )
# criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_POS)

# # Chạy huấn luyện đầy đủ với EPOCHS
# run(total_epoch=EPOCHS, best_loss=float('inf'))

# # Tính và báo cáo độ chính xác trên tập validation cuối cùng
# _, final_val_acc = evaluate(model, val_batches, criterion)
# print(f"Final validation accuracy with best hyperparameters: {final_val_acc:.4f}")