<a href="https://colab.research.google.com/github/Cuongz297/cuong/blob/main/my_chatbot_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets peft accelerate trl evaluate bitsandbytes



In [None]:
!pip install fsspec[http]==2025.3.0 datasets==4.0.0 gcsfs==2025.3.0



In [None]:
# Import the required libraries
import os
import torch
# Sets the WANDB_DISABLED environment variable before any trainer is created
# (intended to keep Weights & Biases logging off).
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer
#from trl.utils import get_peft_config  # Optional, only if needed


In [None]:
# Load the local JSON data and split it into train/test subsets.
# NOTE(review): this cell only loads and splits; the printed schema has a single
# 'text' column, so the file is presumably already in the prompt format used for
# bkai-foundation-models/vietnamese-llama2-7b-40GB — confirm.
from datasets import load_dataset, DatasetDict

# Load local JSON file
raw_dataset = load_dataset('json', data_files='/all_data.json')

# Hold out 10% of the rows for testing. The fixed seed makes the split identical
# on every "Restart & Run All", so results are comparable between runs.
train_test_split = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)
transformed_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})


# Preview the resulting splits
print(transformed_dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3006
    })
    test: Dataset({
        features: ['text'],
        num_rows: 334
    })
})


In [None]:
# --- Model names and output location ---
base_model_name = "bkai-foundation-models/vietnamese-llama2-7b-40GB"
finetune_model_name = "my_finetuned_vllama2"
output_dir = "./results"

# --- 4-bit quantization ---
use_4bit = True
use_nested_quant = False
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"  # name of a torch dtype, resolved later with getattr(torch, ...)

# --- LoRA adapter ---
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# --- Mixed-precision flags (both switched off) ---
fp16 = bf16 = False

# --- Batching and gradients ---
per_device_train_batch_size = per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
gradient_checkpointing = True
max_grad_norm = 1.0

# --- Training length, optimizer and learning-rate schedule ---
num_train_epochs = 3
max_steps = -1
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
warmup_ratio = 0.03

# --- Batch grouping, checkpointing and logging cadence ---
group_by_length = True
save_steps = 0
logging_steps = 25

# --- Sequence handling ---
# NOTE(review): none of the three names below is referenced by the cells visible
# in this notebook, and a dict is an unusual value for neftune_noise_alpha — confirm.
max_seq_length = None
packing = False
neftune_noise_alpha = {"": 0}

In [None]:
from huggingface_hub import login
from getpass import getpass

# Ask for the token with getpass so it is not echoed into the cell output.
# Strip surrounding whitespace: a pasted token can carry a stray space or newline.
hf_token = getpass("Paste your Hugging Face token here: ").strip()
login(token=hf_token)

Paste your Hugging Face token here: ··········


In [None]:
# Build the quantization settings from the parameters defined in the config cell,
# so changing a value there actually changes how the model is loaded.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# 4-bit quantization setup
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant   # nested (double) quantization on/off
)
# Load the base model with quantization applied
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",                     # let the loader place weights on the available devices
    trust_remote_code=True,
    attn_implementation="eager"
)
# Load the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True
)

# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Pad on the right so inputs keep a consistent layout
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Baseline check: generate from the base model before the training cells run
prompt = "<|user|>\n[Tác vụ: mô tả]\nBạn có thể kể tên một vài loại phân bón?.\n\n<|assistant|>\n"

# Wrap the already-loaded model and tokenizer in a text-generation pipeline
pipe = pipeline(
    tokenizer=tokenizer,
    model=base_model,
    task="text-generation",
)

# Sample a completion.
#   temperature    - how "creative" the sampling is
#   max_new_tokens - upper bound on the number of generated tokens
result = pipe(
    prompt,
    temperature=0.8,
    do_sample=True,
    max_new_tokens=200,
)

# Print the generated text of the first result
print(result[0]['generated_text'])

In [None]:
# LoRA configuration
# https://discuss.huggingface.co/t/task-type-parameter-of-loraconfig/52879
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training parameters, all taken from the config cell
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,

    per_device_train_batch_size=per_device_train_batch_size,
    # The eval batch size is set in the config cell but was never forwarded
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Also configured but previously never forwarded. Gradient checkpointing
    # recomputes activations during the backward pass instead of storing them,
    # lowering peak GPU memory (the recorded run of this notebook ended in a
    # CUDA out-of-memory error).
    gradient_checkpointing=gradient_checkpointing,

    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,

    learning_rate=learning_rate,
    weight_decay=weight_decay,

    fp16=fp16,
    bf16=bf16,

    max_grad_norm=max_grad_norm,
    max_steps=max_steps,

    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none"  # no external experiment tracker
)

In [None]:
# Supervised fine-tuning: hand the base model, the training split, the LoRA
# config and the training arguments to SFTTrainer.
trainer = SFTTrainer(
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=transformed_dataset["train"],
    model=base_model,
)

# Start training; the call's return value is shown as the cell output
trainer.train()

Adding EOS to train dataset:   0%|          | 0/3006 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3006 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3006 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 4508 has 14.70 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 318.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Build a generation pipeline around base_model for a spot check after training.
# NOTE(review): base_model has not been through merge_and_unload() at this
# point; that call only appears in a later cell.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=base_model,
    device_map="auto",  # alternative left by the author: device=0 (or "cuda:0")
)

# Try a question-answering prompt
output = pipe(
    "<|user|>\n[Tác vụ: trả lời câu hỏi]\ncây cao su có những bệnh gì?\n\n<|assistant|>\n",
    do_sample=True,
    temperature=0.5,
    max_new_tokens=50,
)
print(output[0]['generated_text'])

Device set to use cuda:0


<|user|>
[Tác vụ: trả lời câu hỏi]
cây cao su có những bệnh gì?

<|assistant|>
 Unterscheidung zwischen dem Schweren und dem Lichten.


In [None]:
# Persist the trained model held by the trainer; the directory name comes from
# finetune_model_name and is read back when the adapter is reloaded for merging.
trainer.model.save_pretrained(save_directory=finetune_model_name)

In [None]:
# Merge the fine-tuned LoRA adapter into a freshly loaded base model.
import gc

# Drop every notebook variable that still references the training-time model so
# its memory can be reclaimed. The generation pipeline object is bound to `pipe`;
# `pipeline` is the factory imported from transformers and must not be deleted.
for var in ["base_model", "pipe", "trainer", "model"]:
    if var in globals():
        del globals()[var]
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()  # release cached, now-unused GPU memory before reloading

# Reload the base model, reusing the bnb_config built in the quantization cell
device_map = "auto"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=bnb_config,
    device_map=device_map,
    offload_folder="./offload"
)

# Attach the fine-tuned LoRA adapter saved under finetune_model_name
model = PeftModel.from_pretrained(
    base_model,
    finetune_model_name,
    device_map="auto",
    offload_folder="./offload"
)
# Fold the adapter into the base weights.
# NOTE(review): the base model is loaded with the 4-bit bnb_config here, so the
# merge is applied to quantized weights — confirm this is intended rather than
# merging into a half-precision reload.
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged model
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Write the merged model to disk
model.save_pretrained(save_directory="models/finetuned_model")

# Write the matching tokenizer; the call's return value is shown as the cell output
tokenizer.save_pretrained(save_directory="models/tokenizer")


('models/tokenizer/tokenizer_config.json',
 'models/tokenizer/special_tokens_map.json',
 'models/tokenizer/tokenizer.json')

In [None]:
# Recursively zip the models/ directory into /content/models.zip
!zip -r /content/models.zip models/

  adding: models/ (stored 0%)
  adding: models/finetuned_model/ (stored 0%)
  adding: models/finetuned_model/generation_config.json (deflated 35%)
  adding: models/finetuned_model/config.json (deflated 56%)
  adding: models/finetuned_model/model.safetensors (deflated 10%)
  adding: models/tokenizer/ (stored 0%)
  adding: models/tokenizer/special_tokens_map.json (deflated 72%)
  adding: models/tokenizer/tokenizer_config.json (deflated 68%)
  adding: models/tokenizer/tokenizer.json (deflated 85%)


In [None]:
# Download the zipped models through Colab's files helper
from google.colab import files
files.download('/content/models.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>