In [1]:
import os, sys, types, torch, wandb
from datasets import load_dataset, Dataset
import json
from transformers import (
    AutoModelForCausalLM,
    BloomTokenizerFast,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the installed PyTorch build and whether a CUDA device is usable.
cuda_ok = torch.cuda.is_available()

print("Torch version:", torch.__version__)
print("CUDA available:", cuda_ok)
if cuda_ok:
    # Device index 0 is the first visible GPU.
    print("GPU name:", torch.cuda.get_device_name(0))



Torch version: 2.1.0+cu118
CUDA available: True
GPU name: NVIDIA GeForce RTX 4060 Ti


In [None]:
# Register an empty placeholder module under the name "bitsandbytes", so any
# later `import bitsandbytes` resolves to this stub instead of the real package.
# NOTE(review): presumably a workaround for a bitsandbytes build that does not
# import cleanly in this environment — confirm before removing.
sys.modules["bitsandbytes"] = types.ModuleType("bitsandbytes")

In [None]:
# Experiment configuration shared by the cells below.
WANDB_PROJECT = "PhoGPT-Tiki"
WANDB_NAME = "XXXX"
# Prefer a key that is already exported in the environment so the real secret
# never has to be written into the notebook; the literal is only a fallback.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "XXXX")
MODEL_NAME = "vinai/PhoGPT-4B"
# Relative default; the data-loading cell reassigns DATA_PATH before reading.
DATA_PATH = "tiki_dataset_processed.jsonl"


In [None]:
# Expose the API key through the WANDB_API_KEY environment variable,
# authenticate, then open the run named WANDB_NAME inside WANDB_PROJECT.
os.environ.update({"WANDB_API_KEY": WANDB_API_KEY})
wandb.login()
wandb.init(name=WANDB_NAME, project=WANDB_PROJECT)


In [None]:
# Repeated from the first cell; kept so this cell still runs on its own.
from datasets import Dataset
import json

# NOTE: this replaces the relative DATA_PATH from the configuration cell with a
# machine-specific absolute path; adjust it when running on another machine.
DATA_PATH = r"D:\PHOGPT\tiki_dataset_processed.jsonl"

# Parse the JSON Lines file: one JSON document per line. Whitespace-only lines
# (for example a blank line at the end of the file) are skipped, because
# json.loads would raise JSONDecodeError on them.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

dataset = Dataset.from_list(data)

# Hold out 10% of the rows as the "test" split; the fixed seed keeps the split
# identical across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print("Loaded dataset successfully!")
print("Training samples:", len(dataset["train"]))
print("Testing samples:", len(dataset["test"]))
print("Mẫu đầu tiên:", dataset["train"][0])


In [None]:
# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Fall back to the end-of-sequence token when the tokenizer defines no
# dedicated padding token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded | pad_token:", tokenizer.pad_token)


In [10]:
# Print the name of every sub-module whose lower-cased name contains one of the
# keywords below. The printed list includes the `Wqkv` and `out_proj` layers
# that the LoRA configuration targets.
# NOTE: `model` is created by the AutoModelForCausalLM cell further down, so on
# a fresh kernel that cell has to run before this one.
keywords = ("attn", "proj", "wq", "wv", "ffn", "linear")
for module_name, _ in model.named_modules():
    lowered = module_name.lower()
    if any(keyword in lowered for keyword in keywords):
        print(module_name)



transformer.blocks.0.attn
transformer.blocks.0.attn.Wqkv
transformer.blocks.0.attn.out_proj
transformer.blocks.0.ffn
transformer.blocks.0.ffn.up_proj
transformer.blocks.0.ffn.down_proj
transformer.blocks.0.resid_attn_dropout
transformer.blocks.0.resid_ffn_dropout
transformer.blocks.1.attn
transformer.blocks.1.attn.Wqkv
transformer.blocks.1.attn.out_proj
transformer.blocks.1.ffn
transformer.blocks.1.ffn.up_proj
transformer.blocks.1.ffn.down_proj
transformer.blocks.1.resid_attn_dropout
transformer.blocks.1.resid_ffn_dropout
transformer.blocks.2.attn
transformer.blocks.2.attn.Wqkv
transformer.blocks.2.attn.out_proj
transformer.blocks.2.ffn
transformer.blocks.2.ffn.up_proj
transformer.blocks.2.ffn.down_proj
transformer.blocks.2.resid_attn_dropout
transformer.blocks.2.resid_ffn_dropout
transformer.blocks.3.attn
transformer.blocks.3.attn.Wqkv
transformer.blocks.3.attn.out_proj
transformer.blocks.3.ffn
transformer.blocks.3.ffn.up_proj
transformer.blocks.3.ffn.down_proj
transformer.blocks.3.re

In [None]:
# Load the base causal language model, letting the library choose device
# placement and dtype.
model_load_kwargs = dict(
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)
print("PhoGPT-4B loaded successfully")

# LoRA adapters are attached to the modules named `Wqkv` and `out_proj`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "out_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameter count.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


PhoGPT-4B loaded successfully
trainable params: 4,718,592 || all params: 3,692,795,904 || trainable%: 0.1277783046414471


In [12]:
# --- Tokenization ---
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Sequences longer than 512 tokens are truncated. Returns whatever the
    tokenizer call returns.
    """
    tokenizer_options = {"truncation": True, "max_length": 512}
    return tokenizer(example["text"], **tokenizer_options)

# Tokenize both splits in batches and drop every original column, so only the
# tokenizer's outputs remain.
original_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Map: 100%|██████████| 8152/8152 [00:00<00:00, 23741.90 examples/s]
Map: 100%|██████████| 906/906 [00:00<00:00, 25495.41 examples/s]


In [None]:
# Training hyper-parameters, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints are written and how often.
    output_dir="./PhoGPT-Tiki-Finetuned",
    save_strategy="steps",
    save_steps=1000,
    # Evaluation and logging cadence; metrics are reported to Weights & Biases.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=200,
    report_to="wandb",
    # Batch size 1 with 2 accumulation steps gives 2 samples per optimizer
    # step on each device.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,
    # Optimizer and schedule.
    optim="adamw_torch",
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.05,
    num_train_epochs=3,
    # Mixed-precision training.
    fp16=True,
)




In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the tokenized splits and the language-modelling collator.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [None]:
# Fine-tune, then save the model and tokenizer next to the checkpoints.
# The directory is taken from the Trainer's own arguments so the two locations
# cannot drift apart.
output_dir = trainer.args.output_dir

try:
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
except BaseException:
    # Training or saving failed (or was interrupted): still close the W&B run,
    # recording a non-zero exit code, so it is not left open in the kernel.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()
