# BÀI THỰC HÀNH 5: KIẾN TRÚC TRANSFORMER ENCODER

In [6]:
!pip install pyvi --quiet

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
from pyvi import ViTokenizer
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# Thiết lập thiết bị (GPU nếu có)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Đang chạy trên: {DEVICE}")

Đang chạy trên: cuda


### Bài 1: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán phân loại domain câu bình luận trên bộ dữ liệu [UIT-ViOCD](https://drive.google.com/drive/folders/1Lu9axyLkw7dMx80uLRgvCnZsmNzhJWAa?usp=sharing).

In [8]:
MAX_LEN = 128
BATCH_SIZE = 32
VOCAB_SIZE = 20000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_df = pd.read_json('train.json', orient='index')
dev_df = pd.read_json('dev.json', orient='index')
test_df = pd.read_json('test.json', orient='index')

TEXT_COL = 'review'
LABEL_COL = 'domain'

print(f"Số dòng Train: {len(train_df)} | Mẫu: {train_df[TEXT_COL].iloc[0][:50]}...")

def tokenizer(text):
    return ViTokenizer.tokenize(str(text).lower()).split()

all_train_words = []
for text in train_df[TEXT_COL]:
    all_train_words.extend(tokenizer(text))

word_counts = Counter(all_train_words)
vocab_list = ['<PAD>', '<UNK>'] + [w for w, c in word_counts.most_common(VOCAB_SIZE)]
vocab_to_int = {w: i for i, w in enumerate(vocab_list)}
PAD_IDX, UNK_IDX = 0, 1
REAL_VOCAB_SIZE = len(vocab_to_int)

le = LabelEncoder()
le.fit(pd.concat([train_df[LABEL_COL], dev_df[LABEL_COL], test_df[LABEL_COL]]))
NUM_CLASSES = len(le.classes_)

print(f"Vocab={REAL_VOCAB_SIZE} | Số nhãn={NUM_CLASSES}")

class ViOCDDataset(Dataset):
    def __init__(self, df, vocab, max_len):
        self.texts = df[TEXT_COL].values
        self.labels = le.transform(df[LABEL_COL].values)
        self.vocab = vocab
        self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        tokens = tokenizer(self.texts[idx])
        encoded = [self.vocab.get(w, UNK_IDX) for w in tokens]
        if len(encoded) < self.max_len: encoded += [PAD_IDX] * (self.max_len - len(encoded))
        else: encoded = encoded[:self.max_len]
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_loader = DataLoader(ViOCDDataset(train_df, vocab_to_int, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(ViOCDDataset(dev_df, vocab_to_int, MAX_LEN), batch_size=BATCH_SIZE)
test_loader = DataLoader(ViOCDDataset(test_df, vocab_to_int, MAX_LEN), batch_size=BATCH_SIZE)

Số dòng Train: 4387 | Mẫu: gói hàng cẩn thận . chơi pubg với liên q...
Vocab=6358 | Số nhãn=4


In [9]:
print(f"Số dòng Dev: {len(dev_df)} | Mẫu: {dev_df[TEXT_COL].iloc[0][:50]}...")
print(f"Số dòng Test: {len(test_df)} | Mẫu: {test_df[TEXT_COL].iloc[0][:50]}...")

Số dòng Dev: 548 | Mẫu: quần đẹp nhưng cỡ lồn hơi ngắn...
Số dòng Test: 549 | Mẫu: dẹp xinh giá rẻ , đáng mua...


In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x): return x + self.pe[:, :x.size(1), :]

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, dropout):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(ff_dim, d_model)
        )
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, key_padding_mask=mask)
        x = self.norm1(x + self.dropout(attn_out))
        x = self.norm2(x + self.dropout(self.ffn(x)))
        return x

class ViOCDTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ff_dim, num_layers, num_classes, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_encoding = PositionalEncoding(d_model, MAX_LEN)
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, num_heads, ff_dim, dropout) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        mask = (x == PAD_IDX)
        x = self.dropout(self.embedding(x) * math.sqrt(x.size(-1)))
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, mask=mask)
        return self.fc(x.mean(dim=1))

# Khởi tạo mô hình 3 lớp Encoder
model = ViOCDTransformer(
    vocab_size=REAL_VOCAB_SIZE,
    d_model=128, num_heads=8, ff_dim=512,
    num_layers=3, num_classes=NUM_CLASSES
).to(DEVICE)

optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

In [11]:
print(">>> Bắt đầu huấn luyện...")
for epoch in range(15):
    model.train()
    t_loss, t_correct = 0, 0
    for texts, labels in train_loader:
        texts, labels = texts.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
        t_correct += (outputs.argmax(1) == labels).sum().item()

    model.eval()
    v_correct = 0
    with torch.no_grad():
        for texts, labels in dev_loader:
            texts, labels = texts.to(DEVICE), labels.to(DEVICE)
            v_correct += (model(texts).argmax(1) == labels).sum().item()

    print(f"Epoch {epoch+1:02} | Loss: {t_loss/len(train_loader):.4f} | Train Acc: {t_correct/len(train_df)*100:6.2f}% | Val Acc: {v_correct/len(dev_df)*100:6.2f}%")

model.eval()
test_correct = 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(DEVICE), labels.to(DEVICE)
        test_correct += (model(texts).argmax(1) == labels).sum().item()

print(f"\nĐỘ CHÍNH XÁC TRÊN TẬP TEST: {test_correct/len(test_df)*100:.2f}%")

>>> Bắt đầu huấn luyện...
Epoch 01 | Loss: 1.0571 | Train Acc:  58.49% | Val Acc:  72.26%
Epoch 02 | Loss: 0.7341 | Train Acc:  72.65% | Val Acc:  77.92%
Epoch 03 | Loss: 0.6144 | Train Acc:  77.05% | Val Acc:  81.20%
Epoch 04 | Loss: 0.5449 | Train Acc:  79.62% | Val Acc:  83.58%
Epoch 05 | Loss: 0.4828 | Train Acc:  82.43% | Val Acc:  84.12%
Epoch 06 | Loss: 0.4467 | Train Acc:  84.02% | Val Acc:  85.40%
Epoch 07 | Loss: 0.4595 | Train Acc:  83.38% | Val Acc:  85.77%
Epoch 08 | Loss: 0.4007 | Train Acc:  85.46% | Val Acc:  87.41%
Epoch 09 | Loss: 0.3800 | Train Acc:  85.96% | Val Acc:  86.50%
Epoch 10 | Loss: 0.3613 | Train Acc:  86.98% | Val Acc:  87.59%
Epoch 11 | Loss: 0.3556 | Train Acc:  87.14% | Val Acc:  87.23%
Epoch 12 | Loss: 0.3464 | Train Acc:  87.39% | Val Acc:  87.96%
Epoch 13 | Loss: 0.3168 | Train Acc:  88.65% | Val Acc:  87.96%
Epoch 14 | Loss: 0.3142 | Train Acc:  88.74% | Val Acc:  86.86%
Epoch 15 | Loss: 0.3079 | Train Acc:  89.38% | Val Acc:  86.68%

ĐỘ CHÍNH XÁC 

### Bài 2: Xây dựng mô hình Transformer Encoder gồm 3 lớp theo mô tả trong nghiên cứu [Attention is all you need](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). Huấn luyện mô hình này cho bài toán gán nhãn chuỗi trên bộ dữ liệu [PhoNERT](https://github.com/VinAIResearch/PhoNER_COVID19).

In [14]:
!pip install pyvi --quiet
import os
import json

if not os.path.exists('PhoNER_COVID19'):
    !git clone https://github.com/VinAIResearch/PhoNER_COVID19.git

def load_phonert_json_correct(split):
    path = f'PhoNER_COVID19/data/syllable/{split}_syllable.json'
    sentences = []
    tags = []

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                item = json.loads(line)
                sentences.append(item['words'])
                tags.append(item['tags'])
    return sentences, tags

train_sents, train_tags = load_phonert_json_correct('train')
dev_sents, dev_tags = load_phonert_json_correct('dev')
test_sents, test_tags = load_phonert_json_correct('test')

print(f"Train: {len(train_sents)} câu.")
print(f"Ví dụ: {train_sents[0][:5]}... -> {train_tags[0][:5]}...")

Train: 5027 câu.
Ví dụ: ['Đồng', 'thời', ',', 'bệnh', 'viện']... -> ['O', 'O', 'O', 'O', 'O']...


In [15]:
# Xây dựng Vocab
all_words = [w for s in train_sents for w in s]
word_counts = Counter(all_words)
vocab = ['<PAD>', '<UNK>'] + [w for w, c in word_counts.most_common(20000)]
word2idx = {w: i for i, w in enumerate(vocab)}

# Lấy tất cả các nhãn duy nhất từ tập Train
all_tags_list = sorted(list(set([t for s in train_tags for t in s])))
tag2idx = {t: i for i, t in enumerate(all_tags_list)}
idx2tag = {i: t for t, i in tag2idx.items()}
NUM_TAGS = len(tag2idx)

MAX_LEN = 128
BATCH_SIZE = 32
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PhoNERDataset(Dataset):
    def __init__(self, sents, tags, word2idx, tag2idx, max_len):
        self.sents = sents
        self.tags = tags
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self): return len(self.sents)

    def __getitem__(self, idx):
        w_ids = [self.word2idx.get(w, 1) for w in self.sents[idx]] # 1 là <UNK>
        t_ids = [self.tag2idx[t] for t in self.tags[idx]]

        # Padding
        if len(w_ids) < self.max_len:
            w_ids += [0] * (self.max_len - len(w_ids))
            t_ids += [-100] * (self.max_len - len(t_ids))
        else:
            w_ids = w_ids[:self.max_len]
            t_ids = t_ids[:self.max_len]

        return torch.tensor(w_ids), torch.tensor(t_ids)

train_loader = DataLoader(PhoNERDataset(train_sents, train_tags, word2idx, tag2idx, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(PhoNERDataset(dev_sents, dev_tags, word2idx, tag2idx, MAX_LEN), batch_size=BATCH_SIZE)
test_loader = DataLoader(PhoNERDataset(test_sents, test_tags, word2idx, tag2idx, MAX_LEN), batch_size=BATCH_SIZE)

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x): return x + self.pe[:, :x.size(1), :]

class TransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, dim_ff, dropout):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_ff), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(dim_ff, d_model)
        )
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, mask=None):
        attn_out, _ = self.attn(x, x, x, key_padding_mask=mask)
        x = self.norm1(x + self.dropout(attn_out))
        x = self.norm2(x + self.dropout(self.ffn(x)))
        return x

class TransformerNER(nn.Module):
    def __init__(self, vocab_size, num_tags, d_model=128, nhead=8, num_layers=3, dim_ff=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model, MAX_LEN)

        # Khối 3 lớp Encoder
        self.encoder_layers = nn.ModuleList([
            TransformerBlock(d_model, nhead, dim_ff, dropout) for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, num_tags)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        mask = (x == 0)
        x = self.dropout(self.embedding(x) * math.sqrt(x.size(-1)))
        x = self.pos_encoding(x)

        for layer in self.encoder_layers:
            x = layer(x, mask=mask)

        return self.fc(x) # Output: [Batch, SeqLen, NumTags]

model = TransformerNER(len(vocab), NUM_TAGS).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [17]:
def train_ner(model, loader):
    model.train()
    total_loss = 0
    for w, t in loader:
        w, t = w.to(DEVICE), t.to(DEVICE)
        optimizer.zero_grad()
        out = model(w)
        # Reshape cho CrossEntropy: [Batch * SeqLen, NumTags]
        loss = criterion(out.view(-1, NUM_TAGS), t.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def eval_ner(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for w, t in loader:
            w, t = w.to(DEVICE), t.to(DEVICE)
            out = model(w).argmax(dim=-1)
            mask = (t != -100) # Chỉ tính các vị trí nhãn thực
            correct += (out[mask] == t[mask]).sum().item()
            total += mask.sum().item()
    return correct / total

print(">>> Bắt đầu huấn luyện PhoNER...")
for epoch in range(10):
    loss = train_ner(model, train_loader)
    acc = eval_ner(model, dev_loader)
    print(f"Epoch {epoch+1:02} | Loss: {loss:.4f} | Dev Acc: {acc*100:.2f}%")

test_acc = eval_ner(model, test_loader)
print(f"\nĐỘ CHÍNH XÁC TRÊN TẬP TEST: {test_acc*100:.2f}%")

>>> Bắt đầu huấn luyện PhoNER...
Epoch 01 | Loss: 1.0790 | Dev Acc: 73.71%
Epoch 02 | Loss: 0.7733 | Dev Acc: 78.13%
Epoch 03 | Loss: 0.6264 | Dev Acc: 81.00%
Epoch 04 | Loss: 0.5287 | Dev Acc: 82.89%
Epoch 05 | Loss: 0.4727 | Dev Acc: 83.71%
Epoch 06 | Loss: 0.4355 | Dev Acc: 84.67%
Epoch 07 | Loss: 0.4069 | Dev Acc: 85.23%
Epoch 08 | Loss: 0.3823 | Dev Acc: 85.94%
Epoch 09 | Loss: 0.3654 | Dev Acc: 86.26%
Epoch 10 | Loss: 0.3450 | Dev Acc: 86.72%

ĐỘ CHÍNH XÁC TRÊN TẬP TEST: 85.98%
