In [None]:
import os
# Step up to the parent directory so the `src` package imported below can be
# found (presumably the project root -- confirm against the repo layout).
# Guarded so that re-running this cell does not climb one more directory each
# time, which would break every relative path used later in the notebook.
if not os.path.isdir("src"):
    os.chdir("..")
from src import (
    VietnameseTrainer,
    VietnamesePreprocessor,
    VietnameseTextDataset,
    VietnameseTokenizer,
    VietnameseTransformer,
    prepare_vietnamese_dataset,
    test_generation,
    load_texts_from_folder,
)

In [None]:
def setup_training_config():
    """Return a fresh, flat dictionary of every setting used in this notebook.

    The settings are grouped by purpose below and merged in that order, so the
    resulting key order is: data, model, training, generation, save.

    Returns:
        dict: A newly built mapping of setting name to value on each call.
    """
    data_settings = dict(
        data_folder="train_data_1",
        tokenizer_file="vietnamese_tokenizer.json",
        vocab_size=25000,
        max_seq_len=512,
        train_split=0.8,
    )
    model_settings = dict(
        d_model=768,
        n_heads=12,
        n_layers=12,
        d_ff=3072,
        dropout=0.1,
    )
    training_settings = dict(
        batch_size=16,
        learning_rate=3e-5,
        weight_decay=0.01,
        num_epochs=10,
        warmup_steps=5000,
        device="auto",  # 'cuda', 'cpu', or 'auto'
    )
    generation_settings = dict(
        temperature=0.8,
        top_k=10,
        top_p=0.9,
        max_new_tokens=512,
    )
    save_settings = dict(
        model_save_path="vietnamese_transformer_best.pt",
        config_save_path="training_config.json",
    )
    return {
        **data_settings,
        **model_settings,
        **training_settings,
        **generation_settings,
        **save_settings,
    }

In [None]:
def load_tokenizer(tokenizer_path: str) -> VietnameseTokenizer:
    """Create a ``VietnameseTokenizer`` and populate it from ``tokenizer_path``.

    Args:
        tokenizer_path: Location handed to ``VietnameseTokenizer.load``.

    Returns:
        The tokenizer instance after ``load`` has been called on it.
    """
    loaded = VietnameseTokenizer()
    loaded.load(tokenizer_path)
    return loaded

In [None]:
# Build the settings dictionary that drives the rest of the notebook.
config = setup_training_config()

# Load the saved tokenizer wrapper and keep a handle on the object it exposes
# as `.tokenizer`, which supplies the vocabulary size and the [PAD] id below.
vietnam_tokenizer = load_tokenizer(config["tokenizer_file"])
tokenizer = vietnam_tokenizer.tokenizer

# Vocabulary size and padding id come from the tokenizer; the architecture
# hyperparameters are forwarded from the config under the same names.
model = VietnameseTransformer(
    vocab_size=tokenizer.get_vocab_size(),
    pad_token_id=tokenizer.token_to_id("[PAD]"),
    **{
        name: config[name]
        for name in ("d_model", "n_heads", "n_layers", "d_ff", "max_seq_len", "dropout")
    },
)

In [None]:
import torch
# Restore the saved checkpoint when one exists at the configured path. The same
# config entry drives both the existence check and the load, so changing
# "model_save_path" can never check one file and read a different one.
if os.path.exists(config["model_save_path"]):
    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
    # Python objects, which can execute code -- only load trusted checkpoints.
    checkpoint = torch.load(
        config["model_save_path"],
        map_location="cpu",  # place the loaded tensors on the CPU
        weights_only=False,
    )
    # The checkpoint is a mapping; the weights live under "model_state_dict".
    model.load_state_dict(checkpoint["model_state_dict"])
    print("✅ Loaded best model for testing")
else:
    # Without a checkpoint the model keeps the weights it was constructed with,
    # so say so instead of silently generating from them.
    print(f"⚠️ No checkpoint found at {config['model_save_path']!r}; model weights were not loaded")
    
def test(test_cases: list[str], max_new_tokens: int):
    test_generation(model, tokenizer, device="cpu", test_cases=test_cases, max_new_tokens=max_new_tokens)

In [None]:
# Run generation on a single prompt, capped at 150 new tokens. The trailing
# space inside the prompt string is part of the input and is kept as-is.
test(
    test_cases=["thơ lục bát: mùa đông để mộng nằm im "],
    max_new_tokens=150,
)