In [None]:
import os

# Make the project root the working directory so that the `src` package and
# the relative "data" paths used by later cells resolve.
# Assumes the kernel starts one level below the project root -- TODO confirm.
# The guard keeps the cell idempotent: once `src` is visible from the current
# directory, re-running the cell no longer climbs another level up.
if not os.path.isdir("src"):
    os.chdir("..")

from src.dataset import VietnameseTextDataset, prepare_vietnamese_dataset, load_texts_from_folder
from tokenizers import Tokenizer
import torch
from glob import glob
import torch.nn as nn

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Train a BPE tokenizer on every .txt file under ./data and save it to disk so
# that later cells can reload it instead of retraining.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()   # the library's Whitespace pre-tokenizer
trainer = BpeTrainer(vocab_size=1000,
                     special_tokens=["[PAD]", "[UNK]", "[EOS]"])

# Sorted so the list of training files does not depend on directory listing order.
files = sorted(glob(os.path.join("./data", "*.txt")))
if not files:
    # Stop before training and saving: with no input files there is nothing to
    # learn from, and the save below would replace any existing tokenizer file.
    raise FileNotFoundError(
        f"No .txt files found under {os.path.abspath('./data')}; nothing to train the tokenizer on."
    )

tokenizer.train(files, trainer)
tokenizer.save("vietnamese_bpe_tokenizer.json")

In [None]:
# Reload the tokenizer from the JSON file written by the training cell, so the
# rest of the notebook can run without retraining it.
tokenizer = Tokenizer.from_file("vietnamese_bpe_tokenizer.json")

In [None]:
# Paths of all .txt corpus files under ./data. Sorted because glob returns
# matches in an unspecified order, and later cells index into data built from
# this list.
files = sorted(glob(os.path.join("./data", "*.txt")))
files

In [None]:
# Read every corpus file in full, decoded as UTF-8. Each element of
# `sentences` is the complete text of one file.
sentences = []
for path in files:
    with open(path, mode='r', encoding='utf-8') as handle:
        sentences.append(handle.read())

In [None]:
from src.tokenizer import VietnamesePreprocessor
import random
# Shared preprocessor instance; its clean_text() and segment_sentences()
# methods are used when building the sentence list for the train/val split.
preprocessor = VietnamesePreprocessor()

In [None]:
# Build one flat list of sentences from every text in the "data" folder:
# each text is cleaned, segmented, and its sentences appended to the list.
raw_texts = load_texts_from_folder("data")
all_sentences = []
for text in raw_texts:
    cleaned_text = preprocessor.clean_text(text)
    sentences_from_file = preprocessor.segment_sentences(cleaned_text)
    all_sentences.extend(sentences_from_file)

# 80/20 train/validation split. The shuffle uses its own seeded generator so
# the split is identical on every run (given the same sentence order coming
# in) and the global `random` state is left untouched.
train_split = 0.8
random.Random(42).shuffle(all_sentences)
split_idx = int(len(all_sentences) * train_split)
train_sentences = all_sentences[:split_idx]
val_sentences = all_sentences[split_idx:]

In [None]:
# Show the first three training sentences as a quick sanity check of the split.
train_sentences[:3]

In [None]:
# Wrap the training sentences in the project's dataset class and display one
# item to inspect what it yields.
train_data = VietnameseTextDataset(
    texts=train_sentences,
    tokenizer=tokenizer,
    max_length=128,
    stride=64,
)
# Use subscript syntax rather than calling the dunder method directly.
train_data[1]

In [None]:
# Inspect another training item; subscript syntax instead of a direct dunder call.
train_data[5]

In [None]:
# Second dataset, built from `sentences` (the whole-file texts read earlier)
# with a stride of 94 instead of the 64 used for the training dataset.
data = VietnameseTextDataset(
    texts=sentences,
    tokenizer=tokenizer,
    max_length=128,
    stride=94,
)

In [None]:
# Inspect the first item of the whole-file dataset; subscript syntax instead
# of a direct dunder call.
data[0]

In [None]:
# Load the texts from the "data" folder again under a separate name and echo them.
# NOTE(review): the bare expression below renders the complete return value as
# the cell output; if this is the whole corpus, consider showing only a short
# preview instead.
raw_text = load_texts_from_folder("data")
raw_text

In [None]:
# Build the training and validation loaders from the "data" folder using the
# loaded tokenizer; the third returned value is not used in this notebook.
train_loader, val_loader, _ = prepare_vietnamese_dataset(
    data_folder="data",
    tokenizer=tokenizer,
)

In [None]:
# Sanity-check the training loader: print tensor shapes for the first two
# batches and decode one sample sequence from each.
for i, batch in enumerate(train_loader):
    if i >= 2:  # Only show first 2 batches
        break

    print(f"\nBatch {i + 1}:")
    print(f"  Input IDs shape: {batch['input_ids'].shape}")
    print(f"  Target IDs shape: {batch['target_ids'].shape}")
    print(f"  Attention mask shape: {batch['attention_mask'].shape}")

    # Show the first sequence in the batch. Row 0 exists in every non-empty
    # batch, whereas a fixed higher row index raises IndexError whenever a
    # batch holds fewer rows than that.
    input_seq = batch['input_ids'][0]
    target_seq = batch['target_ids'][0]

    print(f"  Sample input:  {input_seq[:50].tolist()}...")
    print(f"  Sample target: {target_seq[:50].tolist()}...")

    # Decode the sample back to text (only the first 50 characters are shown)
    decoded_input = tokenizer.decode(input_seq.tolist())
    decoded_target = tokenizer.decode(target_seq.tolist())

    print(f"  Decoded input:  '{decoded_input[:50]}...'")
    print(f"  Decoded target: '{decoded_target[:50]}...'")

In [None]:
# Central run configuration, assembled section by section. Insertion order of
# the keys is the same as listing them in a single literal.
config = {}

# Data.
# NOTE(review): 'data1' differs from the "data" folder that other cells in
# this notebook read, and 'vocab_size' differs from the vocab_size=1000 passed
# to BpeTrainer -- confirm which values are intended.
config.update({
    'data_folder': 'data1',
    'tokenizer_file': 'vietnamese_bpe_tokenizer.json',
    'vocab_size': 5000,
    'max_seq_len': 128,
    'train_split': 0.8,
})

# Model architecture.
config.update({
    'd_model': 512,
    'n_heads': 8,
    'n_layers': 6,
    'd_ff': 2048,
    'dropout': 0.1,
})

# Training loop. 'device' accepts 'cuda', 'cpu', or 'auto'.
config.update({
    'batch_size': 16,
    'learning_rate': 1e-4,
    'weight_decay': 0.01,
    'num_epochs': 50,
    'warmup_steps': 1000,
    'device': 'auto',
})

# Text generation defaults.
config.update({
    'temperature': 0.8,
    'top_k': 50,
    'top_p': 0.9,
    'max_new_tokens': 50,
})

# Output files.
config.update({
    'model_save_path': 'vietnamese_transformer_best.pt',
    'config_save_path': 'training_config.json',
})

In [None]:
from train import VietnameseTransformer

# Build the model from the shared config. The vocabulary size and the padding
# token id are taken from the loaded tokenizer.
model = VietnameseTransformer(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=config['d_model'],
    n_heads=config['n_heads'],
    n_layers=config['n_layers'],
    d_ff=config['d_ff'],
    max_seq_len=config['max_seq_len'],
    dropout=config['dropout'],
    pad_token_id=tokenizer.token_to_id("[PAD]")
)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Resolve the device from the config instead of hard-coding "cuda", so the
# cell also runs on machines without a GPU. With 'auto', a CUDA device is
# still used whenever one is available.
model_device = config['device']
if model_device == 'auto':
    model_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(model_device)

print("✅ Model created successfully!")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
# 4 bytes per parameter, converted to mebibytes.
print(f"   Model size: {total_params * 4 / 1024 / 1024:.2f} MB (float32)")

In [None]:
from train import VietnameseTrainer
from trainer import test_generation
# Step 3: wire the model, loaders and tokenizer into the trainer.
print(f"\n{'='*20} STEP 3: TRAINING SETUP {'='*20}")

trainer = VietnameseTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    tokenizer=tokenizer,
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'],
    warmup_steps=config['warmup_steps'],
    device=config['device'],
)

# One print call, one line of output per argument.
print(
    "✅ Trainer initialized!",
    f"   Device: {trainer.device}",
    f"   Learning rate: {config['learning_rate']}",
    f"   Batch size: {config['batch_size']}",
    sep="\n",
)

# Baseline: sample from the model before any training has happened.
print(
    f"\n{'='*20} INITIAL GENERATION TEST {'='*20}",
    "Testing generation before training (should be random):",
    sep="\n",
)
test_generation(model, tokenizer, trainer.device, ["Truyện Kiều được viết"])

In [None]:
# Step 4: Train the model
print(f"\n{'='*20} STEP 4: TRAINING {'='*20}")
print(f"Starting training for {config['num_epochs']} epochs...")
print("Press Ctrl+C to stop training early\n")

try:
    trainer.train(
        num_epochs=config['num_epochs'],
        save_path=config['model_save_path']
    )

    print(f"\n🎉 Training completed successfully!")

except KeyboardInterrupt:
    # An interrupt lands here; persist the current state so the progress made
    # so far is not lost.
    print(f"\n⏹️  Training interrupted by user")
    print("Saving current model state...")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': trainer.optimizer.state_dict(),
        'train_losses': trainer.train_losses,
        'val_losses': trainer.val_losses,
        # Stored as the tokenizer's JSON string rather than a pickled Tokenizer
        # object, so this file can be read back with
        # torch.load(..., weights_only=True) -- the way the best checkpoint is
        # loaded in this notebook. Rebuild with Tokenizer.from_str(...).
        # Assumes the loss histories are plain Python numbers -- TODO confirm.
        'tokenizer_json': tokenizer.to_str()
    }, 'vietnamese_transformer_interrupted.pt')
    print("Model saved as 'vietnamese_transformer_interrupted.pt'")

In [None]:
# Step 6: Final generation test
print(f"\n{'='*20} STEP 6: FINAL GENERATION TEST {'='*20}")
print(config['model_save_path'])
# Load the best saved weights into the existing model for testing.
# weights_only=True restricts unpickling to tensors and plain containers.
if os.path.exists(config['model_save_path']):
    checkpoint = torch.load(config['model_save_path'], map_location=trainer.device, weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("✅ Loaded best model for testing")
else:
    # Say so explicitly: otherwise the following generation test silently runs
    # on whatever weights the in-memory model currently holds.
    print("⚠️  No checkpoint found at that path; testing the in-memory model instead")

In [None]:
def test_generation(model, tokenizer, device, test_cases=None):
    """Test text generation with various examples"""
    if test_cases is None:
        test_cases = [
            "Truyện Kiều là",
        ]
    
    print("\n" + "="*60)
    print("🎯 TESTING TEXT GENERATION")
    print("="*60)
    
    model.eval()
    
    for i, prompt in enumerate(test_cases, 1):
        print(f"\n--- Test {i} ---")
        print(f"Input: '{prompt}'")
        
        # Encode input
        input_ids = torch.tensor(
            [tokenizer.encode(prompt, add_special_tokens=False).ids],
            device=device
        )
        
        # Generate with different settings
        generation_configs = [
            {'temperature': 0.7, 'top_k': 50, 'top_p': 0.9, 'max_new_tokens': 15, 'name': 'Balanced'},
            {'temperature': 1.0, 'top_k': 20, 'top_p': 0.8, 'max_new_tokens': 15, 'name': 'Creative'},
            {'temperature': 0.0, 'top_k': 5, 'top_p': 1.0, 'max_new_tokens': 15, 'name': 'Conservative'}
        ]
        
        for config in generation_configs:
            with torch.no_grad():
                generated = model.generate(
                    input_ids,
                    temperature=config['temperature'],
                    top_k=config['top_k'],
                    top_p=config['top_p'],
                    max_new_tokens=config['max_new_tokens'],
                    do_sample=True
                )
            
            generated_text = tokenizer.decode(generated[0].cpu().tolist())
            print(f"  {config['name']}: '{generated_text}'")

In [None]:
# Create the prompt tensor on whatever device the model's parameters live on,
# instead of assuming a GPU is present.
test_generation(model, tokenizer, device=next(model.parameters()).device, test_cases=["Truyện Kiều"])