# **I &nbsp;&nbsp;&nbsp; Khai báo thư viện**

In [1]:
import math
import torch
import torch.optim as optim
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

from data.dataset import PhoMTDataset
from services.models.tokenizer.tokenizer import Tokenizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from services.machine_translator import MachineTranslator, MachineTranslatorConfig

# **II &nbsp;&nbsp;&nbsp; Bộ dữ liệu**

In [2]:
# Instantiate the project's PhoMT dataset wrapper. "en-vi" presumably selects the
# English -> Vietnamese direction — TODO confirm against data/dataset.py.
dataset = PhoMTDataset("en-vi")

In [3]:
# Total sample budget; it is divided 80/10/10 into train/test/dev sizes below.
N = 30000
train_size = int(0.8 * N)
test_size = int(0.1 * N)
dev_size = int(0.1 * N)


def _take_head(split, size):
    """Return a dict with the first `size` entries of a split's "source" and "target".

    Args:
        split: mapping with "source" and "target" sliceable sequences.
        size: number of leading examples to keep from each sequence.

    Returns:
        A new dict with keys "source" and "target", each sliced to `[:size]`.
    """
    return {
        "source": split["source"][:size],
        "target": split["target"][:size],
    }


# Each accessor is called exactly once per split (previously twice), so a split
# is not fetched a second time just to read its other column.
train_set = _take_head(dataset.train(), train_size)
test_set = _take_head(dataset.test(), test_size)
dev_set = _take_head(dataset.dev(), dev_size)

# **III &nbsp;&nbsp;&nbsp; Mã hóa dữ liệu**

In [4]:
# Instantiate the project tokenizer with the same "en-vi" identifier used for the
# dataset (presumably the language pair — TODO confirm against the Tokenizer class).
tokenizer = Tokenizer("en-vi")

In [5]:
# Sequence length passed to the tokenizer for every split (and later to translate()).
max_seq_length = 192


def _encode(split):
    """Tokenize one {"source", "target"} split with the shared max_seq_length."""
    return tokenizer.tokenize(split, max_seq_length)


# Same order as before: train, then test, then dev.
train_data = _encode(train_set)
test_data = _encode(test_set)
dev_data = _encode(dev_set)

# **IV &nbsp;&nbsp;&nbsp; Mô hình**

## **1 &nbsp;&nbsp;&nbsp; Hyperparameters**

In [None]:
# Hyperparameters noted by the authors (presumably the defaults declared in
# MachineTranslatorConfig — TODO confirm against services/machine_translator):
# block_size: int = 192
# vocab_size: int = 32000
# n_layer: int = 6
# n_head: int = 6
# n_embd: int = 384
# dropout: float = 0.2
# bias: bool = True
# The team ran its experiments by tuning the parameters above manually.

# Build the model from a config constructed with no overrides.
config = MachineTranslatorConfig()
translator = MachineTranslator(config)

Số lượng tham số: 37.14M


In [7]:
# AdamW over all translator parameters: learning rate 3e-4, weight decay 0.01.
optimizer = optim.AdamW(translator.parameters(), lr=3e-4, weight_decay=0.01)

In [None]:
# Number of examples drawn per batch (also reused as the test-set chunk size).
batch_size = 64


def get_batch(split):
    """Sample a random batch (with replacement) from the train or dev data.

    Args:
        split: "train" selects `train_data`; any other value selects `dev_data`.

    Returns:
        A tuple `(x, y)` of stacked tensors holding `batch_size` source rows and
        the matching target rows, drawn at the same random indices.
    """
    pool = dev_data
    if split == "train":
        pool = train_data

    n_examples = len(pool["source"])
    picks = torch.randint(0, n_examples, (batch_size, ))

    # The same indices are used for both sides so source/target stay aligned.
    sources = [pool["source"][j] for j in picks]
    targets = [pool["target"][j] for j in picks]
    return torch.stack(sources), torch.stack(targets)

In [None]:
# Number of random batches averaged per split in each evaluation.
eval_epochs = 20


@torch.no_grad()
def estimate_loss():
    """Estimate the current train and dev loss from random batches.

    Puts `translator` in eval mode, averages the loss of `eval_epochs` batches
    from `get_batch` for each of "train" and "dev", then puts the model back in
    train mode.

    Returns:
        A dict mapping "train" and "dev" to a 0-d tensor with the mean loss.
    """
    translator.eval()

    averages = {}
    for split_name in ("train", "dev"):
        batch_losses = []
        for _ in range(eval_epochs):
            src, tgt = get_batch(split_name)
            # Target minus its last position and target minus its first position
            # are passed as the 2nd/3rd arguments (presumably decoder input and
            # labels — the model's forward signature is not visible here).
            _logits, batch_loss = translator(src, tgt[:, :-1], tgt[:, 1:])
            batch_losses.append(batch_loss.item())
        averages[split_name] = torch.tensor(batch_losses).mean()

    translator.train()
    return averages

## **2 &nbsp;&nbsp;&nbsp; Huấn luyện mô hình**

In [None]:
epochs = 10000  # number of iterations of the training loop (one sampled batch each)
eval_interval = 500  # the training loop evaluates whenever the step is a multiple of this
train_losses = []  # estimated train loss appended at each evaluation
dev_losses = []  # estimated dev loss appended at each evaluation

In [None]:
for epoch in range(epochs):
    # Evaluate on every multiple of eval_interval and once more on the final step.
    is_last_step = epoch == epochs - 1
    if epoch % eval_interval == 0 or is_last_step:
        losses = estimate_loss()
        train_losses.append(losses["train"].item())
        dev_losses.append(losses["dev"].item())
        print(f"step {epoch}: train loss {losses['train']:.4f}, dev loss {losses['dev']:.4f}")

    # One optimization step on a freshly sampled training batch.
    X_b, y_b = get_batch("train")
    optimizer.zero_grad()  # clear stale gradients before the new forward/backward pass
    logits, loss = translator(X_b, y_b[:, :-1], y_b[:, 1:])
    loss.backward()
    optimizer.step()

## **3 &nbsp;&nbsp;&nbsp; Đánh giá mô hình**

In [None]:
# Loss curves of the model.
# Rebuild the x positions with the same condition the training loop uses to
# trigger an evaluation (every eval_interval steps plus the final step), so
# there is exactly one x value per recorded loss.
steps = [
    step for step in range(epochs)
    if step % eval_interval == 0 or step == epochs - 1
]

plt.figure(figsize=(8, 4))

# Plot the recorded histories; `losses` only holds the last evaluation's two
# scalars and cannot be drawn against `steps`.
plt.plot(steps, train_losses, linewidth=1.5, label="Train Loss")
plt.plot(steps, dev_losses, linewidth=1.5, label="Validation Loss")

plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Perplexity on the test set.
translator.eval()


@torch.no_grad()  # called form, matching estimate_loss; the bare form is not accepted by every PyTorch version
def test_loss():
    """Compute the mean loss over the test set in consecutive batches.

    Walks `test_data` in chunks of `batch_size` and evaluates the model loss on
    each chunk.

    Returns:
        A 0-d float32 tensor: the unweighted mean of the per-batch losses.
        NOTE: the last chunk may be smaller than `batch_size` but still counts
        as one full entry in the mean.
    """
    losses = []
    for i in range(0, len(test_data["source"]), batch_size):
        x = test_data["source"][i:i+batch_size]
        y = test_data["target"][i:i+batch_size]
        _, loss = translator(x, y[:, :-1], y[:, 1:])
        losses.append(loss.item())

    return torch.tensor(losses, dtype=torch.float32).mean()


# Perplexity is the exponential of the mean loss.
ppl = math.exp(test_loss())
# End the cell on the value so the notebook actually displays the metric.
ppl

In [None]:
# BLEU score on the test set.
def bleu_score_corpus(references, hypotheses):
    """Compute a smoothed corpus-level BLEU-4 score on lowercased, whitespace-split text.

    No autograd guard is applied here: the function only manipulates strings
    and calls NLTK, so it performs no tensor operations.

    Args:
        references: iterable of reference sentences (strings), one per hypothesis.
        hypotheses: iterable of candidate translations (strings), in the same order.

    Returns:
        The value returned by NLTK's `corpus_bleu` with uniform 1- to 4-gram
        weights and `SmoothingFunction().method4` smoothing.
    """
    smoothie = SmoothingFunction().method4
    # corpus_bleu expects a list of reference lists per hypothesis; each
    # hypothesis here has exactly one reference.
    refs = [[r.lower().split()] for r in references]
    hypos = [h.lower().split() for h in hypotheses]
    return corpus_bleu(
        refs,
        hypos,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie
    )
    
# Decode every raw test source sentence; gradients are not needed for inference.
with torch.no_grad():
    hypotheses = [translator.translate(t, max_seq_length) for t in test_set["source"]]

# The scorer lowercases and splits each reference, so it needs the raw target
# sentences from test_set, not the tokenized tensors in test_data.
print(bleu_score_corpus(test_set["target"], hypotheses))

## **4 &nbsp;&nbsp;&nbsp; Lưu trọng số mô hình**

In [None]:
#torch.save(translator.state_dict(), "services/translator_weights/weights.pth")