In [None]:
import os
# Move the working directory up one level; later cells use relative paths
# (data folders, the tokenizer JSON file, the checkpoints directory).
# NOTE(review): this is not idempotent - re-running the cell climbs one more level each time.
os.chdir("..")

In [None]:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Train a BPE tokenizer on the raw corpus. Unknown pieces map to [UNK]; the
# three special tokens are reserved up front so they receive fixed ids.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()   # pre-tokenize before BPE merges are learned
trainer = BpeTrainer(vocab_size=10000,
                     special_tokens=["[PAD]", "[UNK]", "[EOS]"])

# Raw UTF-8 text files. The list previously contained an empty path (""),
# which cannot be opened and made training fail.
files = ["data1/truyen_kieu.txt"]
missing_files = [path for path in files if not os.path.isfile(path)]
if missing_files:
    # Fail with a readable message instead of an opaque error from the trainer.
    raise FileNotFoundError(f"Training files not found: {missing_files}")
tokenizer.train(files, trainer)


In [None]:
# tokenizer.save("vietnamese_bpe_tokenizer.json")

In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding added to token embeddings.

    Even feature columns hold sin(position * frequency) and odd columns hold
    the matching cos(...). The table is registered as the buffer ``pe`` with
    shape (1, max_len, d_model), so it follows the module across devices.
    """
    def __init__(self, d_model, max_len=5000):
        """
        d_model: embedding width (odd widths are supported)
        max_len: longest sequence length the table can serve
        """
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine block; the unsliced assignment used to raise here.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # shape (1, max_len, d_model)

    def forward(self, x):
        """
        x: tensor of shape (batch_size, seq_len, d_model)
        Returns x with the first seq_len rows of the table added.
        Raises ValueError when seq_len exceeds the table length.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Without this check the addition fails with an opaque broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        return x + self.pe[:, :seq_len]

class TransformerBlock(nn.Module):
    """
    Post-norm transformer layer: self-attention and a two-layer ReLU
    feed-forward network, each wrapped in dropout + residual + LayerNorm.
    """
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        """
        d_model: embedding width
        n_heads: number of attention heads
        d_ff: hidden width of the feed-forward network
        dropout: dropout probability for attention weights and sub-layer outputs
        """
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        # No Dropout inside the Sequential: forward() already applies
        # self.dropout to the feed-forward output, and having both dropped the
        # same activations twice. Dropout has no parameters and the Linear
        # layers keep indices 0 and 2, so state_dict keys are unchanged.
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        """
        x: tensor of shape (seq_len, batch_size, d_model)
        attn_mask: optional (seq_len, seq_len) mask forwarded to the attention layer
        Returns a tensor with the same shape as x.
        """
        # The attention weights are never used, so skip computing them.
        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask, need_weights=False)
        x = self.norm1(x + self.dropout(attn_out))   # residual + dropout, then norm
        ff_out = self.ff(x)
        x = self.norm2(x + self.dropout(ff_out))     # residual + dropout, then norm
        return x

class GPTModel(nn.Module):
    """
    Decoder-only language model: token embedding + sinusoidal positions,
    a stack of causally masked TransformerBlocks, a final LayerNorm and a
    linear projection to vocabulary logits.
    """
    def __init__(self, vocab_size, d_model=256, n_heads=4, d_ff=1024, n_layers=4, max_seq_len=256):
        """
        vocab_size: number of token ids (embedding rows and output logits)
        d_model: embedding width
        n_heads: attention heads per block
        d_ff: feed-forward hidden width per block
        n_layers: number of TransformerBlocks
        max_seq_len: longest sequence the positional table supports
        """
        super().__init__()
        self.max_seq_len = max_seq_len
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=max_seq_len)
        self.layers = nn.ModuleList(
            [TransformerBlock(d_model, n_heads, d_ff) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """
        x: integer token ids of shape (batch_size, seq_len)
        Returns logits of shape (batch_size, seq_len, vocab_size).
        Raises ValueError when seq_len exceeds max_seq_len.
        """
        bsz, seq_len = x.size()
        if seq_len > self.max_seq_len:
            # Otherwise the positional addition fails with an opaque shape error.
            raise ValueError(
                f"Input length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        # embed tokens and add positional encoding
        tok_emb = self.token_emb(x)                     # (bsz, seq_len, d_model)
        x = self.pos_enc(tok_emb)
        # Prepare for MultiheadAttention: needs (seq_len, bsz, d_model)
        x = x.transpose(0, 1)
        # Boolean causal mask built directly on the input's device: True marks
        # a position that must NOT be attended to (strictly future tokens).
        # The earlier masked_fill(mask, -inf) on a bool tensor could not store
        # -inf, so it has been dropped.
        mask = torch.ones(seq_len, seq_len, device=x.device).triu(diagonal=1).bool()
        for layer in self.layers:
            x = layer(x, attn_mask=mask)
        x = self.ln_f(x)
        x = x.transpose(0, 1)  # back to (bsz, seq_len, d_model)
        logits = self.head(x)  # (bsz, seq_len, vocab_size)
        return logits

In [None]:
# train.py
import os
import math
import random
from glob import glob
from typing import List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# -------------------------
# Hyperparameters (tweak)
# -------------------------
# Module-level settings read by the dataset, dataloader and training code below.
VOCAB_SIZE = None  # set after loading tokenizer
SEQ_LEN = 512       # window length cut from the token stream (adjust: 128-512); each example is SEQ_LEN - 1 input/target tokens
BATCH_SIZE = 32     # try 16-64 depending on GPU memory
NUM_EPOCHS = 10
LR = 3e-4           # peak learning rate reached at the end of warmup
WEIGHT_DECAY = 1e-2
WARMUP_STEPS = 200  # optimizer steps of linear warmup before linear decay
MAX_GRAD_NORM = 1.0 # gradient-norm clipping threshold
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAVE_DIR = "checkpoints"
CHECKPOINT_EVERY = 500   # steps
VALIDATION_SPLIT = 0.05  # fraction of input files held out for validation
USE_FP16 = True          # mixed precision (autocast + GradScaler), only enabled when DEVICE is CUDA

# Make sure the checkpoint directory exists before training writes to it.
os.makedirs(SAVE_DIR, exist_ok=True)

# -------------------------
# Dataset utils
# -------------------------
class TextDataset(Dataset):
    """
    Next-token-prediction dataset over one concatenated stream of token ids.

    Every non-empty file is tokenized, terminated with [EOS] and appended to a
    single id list; examples are contiguous windows cut from that list.
    This is memory-efficient for small datasets; for huge corpora you'd stream.
    """
    def __init__(self, token_files: List[str], tokenizer: "Tokenizer", seq_len: int,
                 stride: int = None, debug: bool = False):
        """
        token_files: list of raw text file paths (UTF-8)
        tokenizer: object exposing encode(text, add_special_tokens=...) -> encoding
                   with an .ids list, token_to_id(token) and decode(ids)
        seq_len: window length; each example yields seq_len - 1 input/target tokens
        stride: if None -> random sampling windows; else sliding window with stride
        debug: when True, print the decoded input/target of every fetched example

        Raises ValueError for seq_len < 2, a non-positive stride, or a tokenizer
        without an [EOS] token.
        """
        if seq_len < 2:
            raise ValueError("seq_len must be at least 2 to form an (input, target) pair")
        if stride is not None and stride <= 0:
            raise ValueError("stride must be a positive integer or None")

        self.seq_len = seq_len
        self.tokenizer = tokenizer
        self.debug = debug

        # Special-token ids do not depend on the file, so resolve them once.
        eos_id = tokenizer.token_to_id("[EOS]")
        if eos_id is None:
            raise ValueError("Tokenizer must have [EOS]")
        # Only needed when a window is shorter than seq_len; checked lazily there.
        self.pad_id = tokenizer.token_to_id("[PAD]")

        # read + encode all files into one long list of token ids
        ids = []
        for fn in token_files:
            # Context manager closes the handle (it was previously leaked).
            with open(fn, "r", encoding="utf-8") as fh:
                text = fh.read().strip()
            if not text:
                continue
            enc = tokenizer.encode(text, add_special_tokens=False)
            ids.extend(enc.ids)
            ids.append(eos_id)  # [EOS] separates consecutive files
        self.ids = ids

        # create start indices for windows
        if stride is None:
            # we will sample random windows on the fly
            self.starts = None
        else:
            self.starts = list(range(0, max(1, len(ids) - seq_len + 1), stride))

    def __len__(self):
        """Number of examples per epoch (a nominal size in random-window mode)."""
        if self.starts is None:
            # define a large epoch size, sample randomly
            return max(1000, len(self.ids) // self.seq_len)
        return len(self.starts)

    def __getitem__(self, idx):
        """
        Return (x, y) LongTensors of length seq_len - 1, where y is x shifted
        one token to the left. In random-window mode idx is ignored.
        """
        if self.starts is None:
            # random crop
            if len(self.ids) <= self.seq_len:
                start = 0
            else:
                start = random.randint(0, len(self.ids) - self.seq_len)
        else:
            start = self.starts[idx]
        window = self.ids[start:start + self.seq_len]
        # pad if needed
        if len(window) < self.seq_len:
            if self.pad_id is None:
                raise ValueError("Tokenizer must have [PAD] to pad short windows")
            # Use the dataset's own tokenizer id rather than a notebook global.
            window = window + [self.pad_id] * (self.seq_len - len(window))
        x = torch.tensor(window[:-1], dtype=torch.long)  # input tokens (seq_len-1)
        y = torch.tensor(window[1:], dtype=torch.long)   # targets (shifted)
        if self.debug:
            # Inspection output; off by default so training loops are not flooded.
            print("Source  =>>>>>>>>>>>>>", self.tokenizer.decode(x.tolist()))
            print(x.tolist())
            print("Target  =>>>>>>>>>>>>>", self.tokenizer.decode(y.tolist()))
            print(y.tolist())
        return x, y

In [None]:
# glob() returns paths in arbitrary, OS-dependent order; sort them so the file
# list (and everything derived from it) is reproducible across runs.
files = sorted(glob(os.path.join("./data", "*.txt")))
files

In [None]:
# Load the tokenizer from its JSON file and build a dataset over the globbed
# text files. No stride is passed, so windows are sampled at random positions.
tokenizer = Tokenizer.from_file("vietnamese_bpe_tokenizer.json")
data = TextDataset(token_files=files, tokenizer=tokenizer, seq_len=SEQ_LEN)
data

In [None]:
# Fetch one (input, target) example to inspect what the dataset yields.
data[1]

In [None]:
def collate_batch(batch):
    """
    Combine a list of (input, target) tensor pairs into two batched tensors.

    batch: sequence of (x, y) pairs whose tensors share one shape
    Returns (inputs, targets), each stacked along a new leading batch axis.
    """
    input_seqs, target_seqs = zip(*batch)
    return torch.stack(input_seqs), torch.stack(target_seqs)

# -------------------------
# Model / Tokenizer loader
# -------------------------
# You must have tokenizer and GPTModel in scope (from previous code).
# Example:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("vietnamese_bpe_tokenizer.json")
# VOCAB_SIZE = tokenizer.get_vocab_size()
#
# from model import GPTModel
# model = GPTModel(VOCAB_SIZE, d_model=256, n_heads=4, d_ff=1024, n_layers=4, max_seq_len=SEQ_LEN)
# model.to(DEVICE)

# -------------------------
# Prepare dataset and dataloaders
# -------------------------
def prepare_dataloaders(all_files, tokenizer, seq_len, batch_size, val_split=VALIDATION_SPLIT):
    """
    Split the text files into train/validation sets and wrap them in DataLoaders.

    all_files: list of text file paths (left unmodified)
    tokenizer: tokenizer passed through to TextDataset
    seq_len: window length passed through to TextDataset
    batch_size: batch size for both loaders
    val_split: fraction of files held out for validation (at least one file)

    Returns (train_loader, val_loader).
    Raises ValueError when all_files is empty.
    """
    import warnings

    if not all_files:
        raise ValueError("prepare_dataloaders needs at least one file")

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(all_files)
    random.shuffle(shuffled)

    if len(shuffled) == 1:
        # A file-level split would hand the only file to validation and train
        # on an empty (all-padding) dataset. Reuse the file for both instead.
        warnings.warn(
            "Only one data file found; using it for both training and validation, "
            "so the validation loss is not a held-out estimate."
        )
        train_files = val_files = shuffled
    else:
        # Hold out at least one file but always leave one for training.
        n_val = min(max(1, int(len(shuffled) * val_split)), len(shuffled) - 1)
        train_files = shuffled[n_val:]
        val_files = shuffled[:n_val]

    train_ds = TextDataset(train_files, tokenizer, seq_len)
    # Non-overlapping sliding windows make validation deterministic, so losses
    # from different epochs are comparable when picking the best checkpoint.
    val_ds = TextDataset(val_files, tokenizer, seq_len, stride=seq_len)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch, num_workers=2)
    return train_loader, val_loader

# -------------------------
# Training helpers
# -------------------------
def save_checkpoint(state, path):
    """
    Serialize a checkpoint with torch.save.

    state: picklable object (here a dict of step counter and state dicts)
    path: destination file path, or a writable file-like object

    For filesystem paths the parent directory is created if missing and the
    file is written to a temporary sibling first, then renamed into place, so
    an interrupted save never leaves a truncated checkpoint under `path`.
    """
    if not isinstance(path, (str, os.PathLike)):
        # File-like target: nothing to create or rename.
        torch.save(state, path)
        return

    path = os.fspath(path)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    tmp_path = f"{path}.tmp"
    torch.save(state, tmp_path)
    os.replace(tmp_path, path)  # atomic rename within the same directory

def load_checkpoint(path, model, optimizer=None, scheduler=None):
    """
    Restore training state from a checkpoint file.

    path: checkpoint location, loaded with tensors mapped onto DEVICE
    model: module that receives the stored "model_state"
    optimizer: optional optimizer, restored when "optimizer_state" is present
    scheduler: optional scheduler, restored when "scheduler_state" is present

    Returns the stored "step" value, or 0 when the checkpoint has none.
    """
    checkpoint = torch.load(path, map_location=DEVICE)
    model.load_state_dict(checkpoint["model_state"])

    # Optional components are restored only when supplied and recorded.
    optional_parts = (
        (optimizer, "optimizer_state"),
        (scheduler, "scheduler_state"),
    )
    for component, state_key in optional_parts:
        if component and state_key in checkpoint:
            component.load_state_dict(checkpoint[state_key])

    return checkpoint.get("step", 0)

# -------------------------
# Training loop
# -------------------------
def train(model, tokenizer, data_dir, seq_len=SEQ_LEN, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS):
    """
    Train `model` as a next-token predictor on every *.txt file in `data_dir`.

    model: module mapping (batch, seq) token ids to (batch, seq, vocab) logits
    tokenizer: tokenizer providing token_to_id and get_vocab_size / get_vocab
    data_dir: directory searched (non-recursively) for *.txt files
    seq_len: window length handed to the datasets
    batch_size: batch size for both loaders
    epochs: number of passes over the training loader

    Side effects: sets the global VOCAB_SIZE, writes a checkpoint to SAVE_DIR
    every CHECKPOINT_EVERY steps, and saves "best_model.pt" whenever the
    validation loss improves.
    Raises ValueError when no .txt files are found.
    """
    global VOCAB_SIZE
    VOCAB_SIZE = tokenizer.get_vocab_size() if hasattr(tokenizer, "get_vocab_size") else len(tokenizer.get_vocab())
    model = model.to(DEVICE)

    # prepare files
    files = glob(os.path.join(data_dir, "*.txt"))
    if len(files) == 0:
        raise ValueError("No .txt files found in data_dir")

    train_loader, val_loader = prepare_dataloaders(files, tokenizer, seq_len, batch_size)

    # optimizer + scheduler
    optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    # simple linear warmup then linear decay to zero at total_steps
    total_steps = epochs * len(train_loader)
    def lr_lambda(step):
        if step < WARMUP_STEPS:
            return float(step) / float(max(1, WARMUP_STEPS))
        return max(0.0, float(total_steps - step) / float(max(1, total_steps - WARMUP_STEPS)))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("[PAD]"))

    # Mixed precision is only meaningful on CUDA; elsewhere both helpers are no-ops.
    use_amp = USE_FP16 and DEVICE.startswith("cuda")
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    global_step = 0
    best_val_loss = float("inf")

    def snapshot():
        # Single definition of the checkpoint payload used by both save sites.
        return {
            "step": global_step,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "tokenizer": None
        }

    for epoch in range(epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
        running_loss = 0.0
        # epoch_step counts batches within this epoch. running_loss is reset
        # per epoch, so dividing by the cumulative global_step under-reported
        # the average loss from the second epoch onward.
        for epoch_step, batch in enumerate(pbar, start=1):
            x, y = batch  # x: (B, seq_len-1), y: (B, seq_len-1)
            x = x.to(DEVICE)
            y = y.to(DEVICE)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(x)  # (B, L, V)
                # flatten for CE
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

            scaler.scale(loss).backward()
            # unscale first so clipping sees the true gradient magnitudes
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            running_loss += loss.item()
            global_step += 1

            pbar.set_postfix({'loss': f"{running_loss/epoch_step:.4f}", 'lr': f"{scheduler.get_last_lr()[0]:.2e}"})

            # checkpoint
            if global_step % CHECKPOINT_EVERY == 0:
                ckpt_path = os.path.join(SAVE_DIR, f"ckpt_step{global_step}.pt")
                save_checkpoint(snapshot(), ckpt_path)
                print("Saved", ckpt_path)

        # end epoch -> validate
        val_loss = evaluate(model, val_loader, criterion, tokenizer)
        print(f"Epoch {epoch+1} finished. Validation loss: {val_loss:.4f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_path = os.path.join(SAVE_DIR, "best_model.pt")
            save_checkpoint(snapshot(), best_path)
            print("Saved best model to", best_path)

    print("Training finished. Best val loss:", best_val_loss)

# -------------------------
# Evaluation
# -------------------------
def evaluate(model, val_loader, criterion, tokenizer):
    """
    Compute the token-weighted mean loss of `model` over `val_loader`.

    model: module mapping (batch, seq) token ids to (batch, seq, vocab) logits
    val_loader: iterable of (x, y) tensor batches
    criterion: mean-reducing loss; if it has an `ignore_index` attribute,
               targets equal to it are excluded from the token count
    tokenizer: unused; kept so existing callers keep working

    Returns the average loss per counted token, or NaN when no token was
    counted (empty loader or only ignored targets). Leaves the model in eval mode.
    """
    model.eval()
    # Run on whatever device the model lives on; fall back to the global
    # DEVICE only for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    ignore_index = getattr(criterion, "ignore_index", None)

    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            logits = model(x)
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            # The criterion averages over non-ignored targets only, so weight
            # each batch by that count. Weighting by x.numel() counted padding
            # and skewed the mean toward heavily padded batches.
            if ignore_index is None:
                n_tokens = y.numel()
            else:
                n_tokens = int((y != ignore_index).sum().item())
            if n_tokens == 0:
                continue  # loss is undefined (NaN) for an all-ignored batch
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens
    if total_tokens == 0:
        return float("nan")
    return total_loss / total_tokens


In [None]:
from tokenizers import Tokenizer
# Reload the tokenizer from its JSON file and record the vocabulary size,
# which the model constructor in the next cell reads.
tokenizer = Tokenizer.from_file("vietnamese_bpe_tokenizer.json")
VOCAB_SIZE = tokenizer.get_vocab_size()
print(VOCAB_SIZE)
# import your model class (previous message)
# from model import GPTModel
# model = GPTModel(VOCAB_SIZE, d_model=256, n_heads=4, d_ff=1024, n_layers=4, max_seq_len=SEQ_LEN)
# train(model, tokenizer, data_dir="data", seq_len=SEQ_LEN, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

In [None]:
from train import VietnameseTransformer
# Instantiate the imported VietnameseTransformer with the tokenizer's vocabulary size.
# NOTE(review): max_seq_len=128 is smaller than SEQ_LEN (512) used when building
# TextDataset - confirm this model is never fed windows longer than 128 tokens.
vocab_size = VOCAB_SIZE
model = VietnameseTransformer(
    vocab_size=vocab_size,
    d_model=512,
    n_heads=8,
    n_layers=6,
    d_ff=2048,
    max_seq_len=128,
    dropout=0.1
)
model

In [None]:
# Encode a short prompt (no special tokens added) and wrap the id list in an
# outer list, giving an integer tensor of shape (1, num_tokens).
sample_input = "Truyện Kiều được viết"
input_ids = torch.tensor(
    [tokenizer.encode(sample_input, add_special_tokens=False).ids],
    # device="auto"
)
input_ids

In [None]:
# Put the model in training mode.
# NOTE(review): the next cell samples a token from this model; training mode
# keeps dropout active (the model was built with dropout=0.1) - confirm that
# model.eval() is not what is wanted for generation.
model.train()

In [None]:
import torch.nn.functional as F

# One decoding step: run the model on the prompt, filter the last position's
# logits with top-k then top-p, pick one token and append it to the sequence.
generated_tokens = input_ids.clone()
# assumes model(..., return_loss=True) returns (logits, loss) with logits
# indexed as (batch, position, vocab) - TODO confirm against VietnameseTransformer
logits, loss = model(generated_tokens, return_loss=True)
print(loss)
# Get logits for last position
next_token_logits = logits[:, -1, :] / 1.0  # divisor is presumably the sampling temperature (1.0 = unchanged)
top_k = 10
top_p = 0.9
do_sample = True

# logits, loss = model(generated_tokens)
# print(loss)
# Apply top-k filtering
if top_k > 0:
    top_k = min(top_k, next_token_logits.size(-1))  # cannot keep more entries than the last axis has
    top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
    # Start from all -inf and write back only the k largest logits.
    next_token_logits = torch.full_like(next_token_logits, -float('inf'))
    next_token_logits.scatter_(-1, top_k_indices, top_k_logits)

# Apply top-p (nucleus) filtering
if top_p < 1.0:
    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    
    # Remove tokens with cumulative probability above the threshold
    sorted_indices_to_remove = cumulative_probs > top_p
    # Keep at least one token
    # Shifting the flags right by one also keeps the first token that crosses the threshold.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    
    # Map the removal flags from sorted order back to the original index order.
    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
    next_token_logits[indices_to_remove] = -float('inf')

# Sample or greedy decode
if do_sample:
    probs = F.softmax(next_token_logits, dim=-1)  # -inf logits become probability 0
    next_tokens = torch.multinomial(probs, num_samples=1)
else:
    next_tokens = torch.argmax(next_token_logits, dim=-1, keepdim=True)

# Append the chosen token to the running sequence.
generated_tokens = torch.cat([generated_tokens, next_tokens], dim=-1)

In [None]:
# Show the prompt ids with the newly chosen token appended.
generated_tokens

In [None]:
# Decode a single token id to see which vocabulary piece it maps to.
tokenizer.decode([469])