In [1]:

import torch

# Report whether CUDA is usable and, only when it is, the name of device 0.
# The device-name query is guarded because it raises when no CUDA device is
# present, which would hide the very answer this sanity check is meant to give.
cuda_available = torch.cuda.is_available()
print(cuda_available, torch.cuda.get_device_name(0) if cuda_available else "no CUDA device")

True NVIDIA GeForce RTX 3080


In [12]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

def apply_rope(q, k):
    """Apply rotary positional embedding (RoPE) to query and key heads.

    Positions are 0..T-1 along dimension 2 of each tensor, computed
    independently for `q` and `k`.

    Args:
        q: tensor unpacked as (B, H, T_q, D).
        k: tensor whose dimension 2 is the key length T_kv.

    Returns:
        Tuple (q_rot, k_rot). Along the last axis the rotated even-indexed
        channels come first and the rotated odd-indexed channels second
        (the halves are concatenated, not re-interleaved).

    Raises:
        AssertionError: if the head dimension D is odd.
    """
    _, _, len_q, head_dim = q.size()
    len_k = k.size(2)
    assert head_dim % 2 == 0, "head_dim phải chẵn để áp dụng RoPE"

    n_pairs = head_dim // 2
    # One inverse frequency per channel pair: 10000^(-i / n_pairs), shape (1, D/2).
    pair_index = torch.arange(n_pairs, device=q.device).unsqueeze(0)
    inv_freq = 1.0 / (10000 ** (pair_index / n_pairs))

    def sin_cos(length):
        # Angle table of shape (length, D/2), broadcast-ready as (1, 1, length, D/2).
        positions = torch.arange(length, device=q.device).unsqueeze(1)
        theta = positions * inv_freq
        return theta.sin()[None, None, :, :], theta.cos()[None, None, :, :]

    def rotate_pairs(x, sin, cos):
        # Treat (even, odd) channels as 2-D points and rotate each by its angle.
        even, odd = x[..., ::2], x[..., 1::2]
        first_half = even * cos - odd * sin
        second_half = even * sin + odd * cos
        return torch.cat([first_half, second_half], dim=-1)

    sin_q, cos_q = sin_cos(len_q)
    sin_k, cos_k = sin_cos(len_k)
    return rotate_pairs(q, sin_q, cos_q), rotate_pairs(k, sin_k, cos_k)


class PositionalEncoding(nn.Module):
    """Additive sinusoidal positional encoding.

    Precomputes a (1, max_len, d_model) table with sine values in the even
    channels and cosine values in the odd channels, stored as the buffer `pe`.

    Args:
        d_model: embedding width. Odd widths are supported; the last channel
            then holds a sine term with no cosine partner.
        max_len: number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd channels; slicing keeps the shapes
        # compatible when d_model is odd (for even d_model this is a no-op).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return `x` plus the encodings for its first `x.size(1)` positions."""
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with rotary embeddings on Q and K.

    Args:
        d_model: model width; must be divisible by `num_heads`.
        num_heads: number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads
        # Query / key / value projections followed by the output projection.
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x_q, x_kv, mask=None):
        """Attend from `x_q` over `x_kv`.

        Args:
            x_q: (B, T_q, d_model) tensor supplying the queries.
            x_kv: tensor supplying keys and values; reshaped as (B, T_kv, d_model).
            mask: optional tensor broadcastable to the (B, H, T_q, T_kv) scores;
                positions where `mask == 0` are excluded from attention.

        Returns:
            The output projection of the merged heads, reshaped to (B, T_q, -1).
        """
        batch, len_q, _ = x_q.size()

        def split_heads(projected):
            # (B, T, d_model) -> (B, H, T, D)
            return projected.view(
                batch, projected.size(1), self.num_heads, self.head_dim
            ).transpose(1, 2)

        queries = split_heads(self.q_proj(x_q))
        keys = split_heads(self.k_proj(x_kv))
        values = split_heads(self.v_proj(x_kv))

        # Rotary positional embedding is applied to queries and keys only.
        queries, keys = apply_rope(queries, keys)

        scores = (queries @ keys.transpose(-2, -1)) / self.head_dim**0.5
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = scores.softmax(dim=-1)

        # (B, H, T_q, D) -> (B, T_q, H * D)
        context = (weights @ values).transpose(1, 2).reshape(batch, len_q, -1)
        return self.out_proj(context)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        d_ff: hidden width.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # The position of each layer inside the Sequential determines its
        # state-dict key (net.0.*, net.3.*).
        layers = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stacked layers."""
        return self.net(x)

class EncoderLayer(nn.Module):
    """Encoder block: self-attention then feed-forward, each wrapped in
    dropout, a residual connection and a post-LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Transform `x`; `mask` is forwarded unchanged to the attention module."""
        # Self-attention: queries and keys/values both come from x.
        attended = self.self_attn(x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        # Feed-forward sub-layer with its own residual + norm.
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

class DecoderLayer(nn.Module):
    """Decoder block: masked self-attention, encoder-decoder attention and a
    feed-forward network, each followed by dropout, residual add and LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        """Transform decoder states `x` given the encoder output `enc_out`.

        `tgt_mask` is passed to the self-attention call and `src_mask` to the
        attention over `enc_out`.
        """
        # Self-attention over the target sequence.
        self_attended = self.self_attn(x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        cross_attended = self.enc_attn(x, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))
        # Position-wise feed-forward.
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x

class Transformer(nn.Module):
    """Encoder-decoder Transformer over token ids.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size (also the width of the output logits).
        d_model: embedding / model width.
        num_heads: attention heads per layer.
        d_ff: hidden width of each feed-forward sub-layer.
        num_encoder: number of encoder layers.
        num_decoder: number of decoder layers.
        dropout: dropout probability passed to every layer.
        max_len: accepted for interface compatibility; not used by this class.
        pad_idx: token id treated as padding when building attention masks.
            Defaults to 0, the value that was previously hard-coded.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model=512, num_heads=8, d_ff=2048,
                 num_encoder=6, num_decoder=6, dropout=0.1, max_len=512, pad_idx=0):
        super().__init__()
        # Plain attribute (not a parameter or buffer), so checkpoints are unaffected.
        self.pad_idx = pad_idx
        self.src_tok_emb = nn.Embedding(src_vocab, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab, d_model)
        self.enc_layers = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder)]
        )
        self.dec_layers = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder)]
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def make_src_mask(self, src):
        """Boolean mask of shape (B, 1, 1, src_len): True where `src` is not padding."""
        return (src != self.pad_idx).unsqueeze(1).unsqueeze(2)  # (B,1,1,src_len)

    def make_tgt_mask(self, tgt):
        """Boolean mask of shape (B, 1, tgt_len, tgt_len).

        Combines the padding mask over key positions with a lower-triangular
        mask, so position i may attend only to non-padding positions j <= i.
        """
        _, tgt_len = tgt.size()
        pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)  # (B,1,1,tgt_len)
        subseq_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        return pad_mask & subseq_mask  # (B,1,tgt_len,tgt_len)

    def forward(self, src, tgt):
        """Return logits of shape (B, tgt_len, tgt_vocab) for `tgt` given `src`.

        Args:
            src: (B, src_len) source token ids.
            tgt: (B, tgt_len) target token ids.
        """
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        # Token embeddings only: no additive positional encoding is applied
        # here. Position information comes from the rotary embedding that
        # MultiHeadAttention applies to its queries and keys.
        enc = self.src_tok_emb(src)
        for layer in self.enc_layers:
            enc = layer(enc, src_mask)
        dec = self.tgt_tok_emb(tgt)
        for layer in self.dec_layers:
            dec = layer(dec, enc, src_mask, tgt_mask)
        out = self.fc_out(dec)  # (B, tgt_len, tgt_vocab)
        return out


In [4]:
from torch.utils.data import Dataset

class VLDataset(Dataset):
    """Parallel-sentence dataset that tokenizes on access with SentencePiece.

    Args:
        src_lines: sequence of source sentences.
        tgt_lines: sequence of target sentences, indexed in parallel with `src_lines`.
        spm_model: path of the SentencePiece model file to load.
        max_len: maximum number of encoded pieces kept per sentence, not
            counting the two boundary tokens added around them.
    """

    def __init__(self, src_lines, tgt_lines, spm_model, max_len=128):
        import sentencepiece as spm
        self.sp = spm.SentencePieceProcessor(model_file=spm_model)
        self.src, self.tgt = src_lines, tgt_lines
        self.max_len = max_len

    def __len__(self):
        # The dataset length follows the source side.
        return len(self.src)

    def _encode(self, text):
        # Truncate first, then wrap with the boundary ids: <s>=1, </s>=2.
        pieces = self.sp.encode(text)[:self.max_len]
        return torch.tensor([1] + pieces + [2])

    def __getitem__(self, i):
        """Return {"src": ids, "tgt": ids} as 1-D tensors for sentence pair `i`."""
        return {
            "src": self._encode(self.src[i]),
            "tgt": self._encode(self.tgt[i]),
        }


In [None]:
# Careful -- presumably because re-running this cell retrains the tokenizer and
# regenerates the vietlao_spm.* files that the other cells load; confirm
# before executing.
import sentencepiece as spm
# Prepare the input file(s) containing both Lao and Vietnamese text
# (either a single combined file or two separate files).
# One joint model is trained over both corpus files.
# NOTE(review): no pad id is configured here, while the rest of the notebook
# pads with 0 and treats id 0 as padding (masks, loss ignore_index) -- confirm
# that id 0 is not a real piece (e.g. the unknown token) in the trained model.
spm.SentencePieceTrainer.train(
    input='data/Train/train2023.vi,data/Train/train2023.lo',
    model_prefix='vietlao_spm', vocab_size=32000, character_coverage=1.0)

In [5]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Collate dataset samples into padded, batch-first tensors.

    Args:
        batch: list of dicts, each holding 1-D tensors under "src" and "tgt".

    Returns:
        Tuple (src_pad, tgt_pad); each is (batch, longest_length) with shorter
        sequences right-padded with 0.
    """
    def pad_field(key):
        sequences = [sample[key] for sample in batch]
        return nn.utils.rnn.pad_sequence(sequences, padding_value=0, batch_first=True)

    return pad_field("src"), pad_field("tgt")

# Example data, replace with your actual data loading
with open('./data/Train/train2023.vi', encoding='utf-8') as f:
    train_vi = [line.strip() for line in f if line.strip()]
with open('./data/Train/train2023.lo', encoding='utf-8') as f:
    train_lo = [line.strip() for line in f if line.strip()]

# Blank lines are dropped from each file independently, so a blank line on only
# one side shifts every later sentence pair. The datasets index both lists in
# parallel, so fail fast instead of training on misaligned pairs.
if len(train_vi) != len(train_lo):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(train_vi)} Vietnamese lines vs "
        f"{len(train_lo)} Lao lines after removing blank lines"
    )

# Tokenization happens lazily inside VLDataset; collate_fn pads each batch with 0.
train_ds = VLDataset(train_vi, train_lo, "./vietlao_spm.model")
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True,
                          num_workers=0, pin_memory=True, collate_fn=collate_fn)


In [3]:
# Vocabulary size: source and target share the same SentencePiece model,
# so both sides use the same number of pieces.
import sentencepiece as spm
sp = spm.SentencePieceProcessor(model_file='./vietlao_spm.model')
vocab_size = sp.get_piece_size()
src_vocab = tgt_vocab = vocab_size

In [6]:
# Reload the tokenizer and print its vocabulary size.
# NOTE(review): this repeats the load done in the cell that defines
# src_vocab / tgt_vocab; it rebinds `sp` and `vocab_size` to the same model file.
import sentencepiece as spm
sp = spm.SentencePieceProcessor(model_file='./vietlao_spm.model')
vocab_size = sp.get_piece_size()
print("Vocab size:", vocab_size)

Vocab size: 32000


In [13]:
model = Transformer(src_vocab, tgt_vocab).cuda()
model = nn.DataParallel(model)  # splits each batch across all GPUs; also works with a single GPU
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9,0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 is the padding value produced by collate_fn

# Create the gradient scaler once, outside the training loop.
# The recorded output of this cell flags the torch.cuda.amp.GradScaler() and
# torch.cuda.amp.autocast() calls with a warning (presumably the deprecation
# notice), so the device-generic torch.amp API is used instead.
scaler = torch.amp.GradScaler("cuda")

for epoch in range(1, 11):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.cuda(), tgt.cuda()
        # Teacher forcing: the decoder sees tokens [0, T-1) and predicts tokens [1, T).
        tgt_in = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        optimizer.zero_grad()

        # Run the forward pass and the loss under autocast for mixed precision.
        with torch.amp.autocast(device_type="cuda"):
            output = model(src, tgt_in)  # forward
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))

        # Backward and optimizer step go through the GradScaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch}, Loss: {total_loss/len(train_loader):.4f}")


  scaler = torch.cuda.amp.GradScaler()  # Khởi tạo scaler (chỉ cần 1 lần, ngoài vòng lặp)
  with torch.cuda.amp.autocast():


KeyboardInterrupt: 

In [None]:
# Save a resumable checkpoint (model, optimizer and AMP scaler state).
import os

# torch.save does not create missing parent directories.
os.makedirs("checkpoints", exist_ok=True)

# The file name follows the epoch actually reached: the stored 'epoch' value
# and the name would otherwise disagree whenever training stops before epoch 10.
torch.save({
    'epoch': epoch,
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
    'scaler_state': scaler.state_dict(),
}, f"checkpoints/checkpoint_epoch{epoch}.pt")


In [None]:
# Save the model weights only.
import os

# torch.save does not create missing parent directories.
os.makedirs("checkpoints", exist_ok=True)

# `model` is the nn.DataParallel wrapper built in the training cell, so this
# state dict is the wrapper's and must be loaded back into a wrapped model.
torch.save(model.state_dict(), f"checkpoints/transformer_epoch{epoch}.pt")


In [18]:
# Load the trained model.
# NOTE(review): the repr recorded in this cell's output lists `pos_enc` and
# `w_q` / `w_k` / `w_v` / `fc_out` sub-modules, which do not exist in the class
# definitions shown in this notebook -- confirm that the checkpoint matches the
# current architecture before relying on it.
model = Transformer(src_vocab, tgt_vocab)  # rebuild the same architecture that was trained
model = nn.DataParallel(model)  # the weights were saved from a DataParallel wrapper
# map_location="cpu": the freshly built model is still on the CPU at this point.
# weights_only=True: restrict unpickling to tensors and plain containers, which
# is all a state dict needs.
model.load_state_dict(
    torch.load("checkpoints/transformer_epoch10.pt", map_location="cpu", weights_only=True)
)
model.eval()
model.cuda()


DataParallel(
  (module): Transformer(
    (src_tok_emb): Embedding(32000, 512)
    (tgt_tok_emb): Embedding(32000, 512)
    (pos_enc): PositionalEncoding()
    (enc_layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiHeadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ff): FeedForward(
          (net): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropou

In [13]:
# Load the tokenizer
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="vietlao_spm.model")

# Special-token ids as reported by the tokenizer model.
BOS_ID, EOS_ID, PAD_ID = sp.bos_id(), sp.eos_id(), sp.pad_id()


In [14]:
def translate_vi_to_lo(sentence, model, sp, max_len=50):
    """Greedy-decode a translation of `sentence`.

    Args:
        sentence: source text, encoded with `sp.encode`.
        model: module called as `model(src, tgt)`; its output is indexed as
            `output[0, -1]` to obtain the scores for the next token.
        sp: tokenizer providing `encode`, `decode`, `bos_id` and `eos_id`.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded output text. The leading BOS token is dropped and the
        terminating EOS token is never appended.

    Notes:
        - BOS/EOS ids are read from `sp` rather than from notebook-level
          globals, so the function only depends on its arguments.
        - Tensors are created on the model's own device instead of always on CUDA.
        - Each step calls `model` on the full source and the current target prefix.
    """
    model.eval()
    bos_id, eos_id = sp.bos_id(), sp.eos_id()

    # Follow the model's device; fall back to CUDA-if-available for a model
    # that exposes no parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"

    with torch.no_grad():
        # 1. Tokenize the source sentence and wrap it in BOS/EOS.
        src_ids = [bos_id] + sp.encode(sentence) + [eos_id]
        src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)  # (1, src_len)

        # 2. Start from BOS and extend the target one token per step.
        tgt_ids = [bos_id]
        for _ in range(max_len):
            tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long, device=device)  # (1, tgt_len)
            if on_cuda:
                # Mixed precision on CUDA only.
                with torch.autocast(device_type="cuda"):
                    output = model(src_tensor, tgt_tensor)
            else:
                output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == eos_id:
                break
            tgt_ids.append(next_token)

        # 3. Decode the generated ids, skipping the leading BOS.
        return sp.decode(tgt_ids[1:])


In [17]:
# Smoke test: translate a single Vietnamese sentence with the loaded model and
# tokenizer (`model` and `sp` are notebook-level variables from other cells).
sentence = "con mèo khi trông ngon quá"
translated = translate_vi_to_lo(sentence, model, sp)
print("Dịch:", translated)  # "Dịch" means "Translation"


Dịch: ແມວເມື່ອແຊບ


  with torch.cuda.amp.autocast():


In [29]:
import sacrebleu

# Load the evaluation data (the files read here are the Dev split).
with open("data/Dev/dev2023.vi", encoding="utf-8") as f:
    src_sentences = [line.strip() for line in f if line.strip()]
with open("data/Dev/dev2023.lo", encoding="utf-8") as f:
    ref_sentences = [line.strip() for line in f if line.strip()]

# Blank lines are dropped from each file independently; if the counts then
# differ, hypotheses would be scored against the wrong references.
if len(src_sentences) != len(ref_sentences):
    raise ValueError(
        f"Dev set is misaligned: {len(src_sentences)} source lines vs "
        f"{len(ref_sentences)} reference lines after removing blank lines"
    )

# Translate every source sentence (one greedy decode per sentence).
hyp_sentences = [translate_vi_to_lo(sent, model, sp) for sent in src_sentences]

# Compute corpus-level BLEU against the single reference set.
# NOTE(review): the default sacrebleu tokenization is used -- confirm it is
# appropriate for Lao text before comparing this score with other systems.
bleu = sacrebleu.corpus_bleu(hyp_sentences, [ref_sentences])
print(f"BLEU: {bleu.score:.2f}")


  with torch.cuda.amp.autocast():


BLEU: 16.80
