In [1]:
import sys

print(sys.version)

3.12.9 | packaged by conda-forge | (main, Mar  4 2025, 22:37:18) [MSC v.1943 64 bit (AMD64)]


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
import os

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

  from .autonotebook import tqdm as notebook_tqdm


Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3070 Laptop GPU
Using device: cuda


In [3]:
# --- 1. Configure Model, Tokenizer, and Quantization (QLoRA) ---

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Path or name of your dataset. The dataset should have a column 'text' formatted with LLaMA chat template
# Example:
# "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|>"
dataset_name = "sample_dataset.jsonl"  # <<< CHANGE THIS
output_dir = "./results/llama3-8b-finetuned-results"

In [4]:
# QLoRA quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = False
model.config.pretraining_tp = 1


Loading checkpoint shards: 100%|██████████| 4/4 [00:19<00:00,  4.79s/it]


In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [6]:
# --- 2. Prepare Model for QLoRA Training ---
model = prepare_model_for_kbit_training(model)

# --- 3. LoRA Configuration ---
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
# Apply PEFT with LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()



# --- 4. Load and Prepare Dataset ---
try:
    dataset = load_dataset("json", data_files={"train": "sample_dataset.jsonl"}, split="train")
    dataset = dataset.shuffle(seed=42)
    split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
except:
    dataset = load_dataset("json", data_files={"train": dataset_name}, split="train")

trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


In [7]:
# --- 5. Training Arguments Configuration ---
sft_config = SFTConfig(
    output_dir=output_dir,
    bf16=True,
    fp16=False,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    save_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    dataset_text_field="text",
    max_length=1024,
    packing=False,
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Vì đây là causal LM (không dùng masked LM như BERT)
)

from transformers.trainer_callback import TrainerCallback

class MemoryClearCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        torch.cuda.empty_cache()


# --- 6. Initialize SFTTrainer ---
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    args=sft_config,
    data_collator=data_collator
)
trainer.add_callback(MemoryClearCallback())
# --- 7. Start Training ---
print("Starting training...")
trainer.train()
print("Training finished.")

# --- 8. Save Adapter and Tokenizer ---
print(f"Saving fine-tuned adapter model to {output_dir}")
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Adapter and tokenizer saved.")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


  return fn(*args, **kwargs)
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Epoch,Training Loss,Validation Loss
1,2.8459,1.241946
2,1.2219,0.919839
3,0.5427,0.805013


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training finished.
Saving fine-tuned adapter model to ./results/llama3-8b-finetuned-results
Adapter and tokenizer saved.


In [8]:
# --- Optional: Clean Up Memory ---
# del model
# del trainer
# import gc
# gc.collect()
# torch.cuda.empty_cache()

In [9]:
# --- 9. Inference with Fine-tuned Model ---
print("\n--- Testing Inference with Fine-tuned Model ---")
logging.set_verbosity(logging.CRITICAL)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)


--- Testing Inference with Fine-tuned Model ---


In [10]:
# Test prompt
user_input_test = "Giới thiệu nội dung triết lý trong tác phẩm 'Chiếc thuyền ngoài xa'."
system_content_test = "You are an expert in analyzing and interpreting philosophical aspects in literature, with a focus on Vietnamese literary works. **Provide your response in Vietnamese.**"

messages_test = [
    {"role": "system", "content": system_content_test},
    {"role": "user", "content": user_input_test},
]

prompt_test = tokenizer.apply_chat_template(messages_test, tokenize=False, add_generation_prompt=True)
result = pipe(prompt_test)

generated_full = result[0]['generated_text']
answer = generated_full[len(prompt_test):].strip()

print("\n--- Inference Result ---")
print(answer)
print("----------------------")


--- Inference Result ---
Tác phẩm 'Chiếc thuyền ngoài xa' là một tác phẩm tiêu biểu của văn học Việt Nam thời kỳ đổi mới, mang đậm tính triết lý nhân sinh. Nó đặt ra vấn đề về sự đối lập giữa nghệ thuật và hiện thực, cũng như cái nhìn đa chiều về con người và cuộc sống. Qua đó, tác phẩm gợi mở một số thought về cái đẹp, sự thật và trách nhiệm của con người. Tóm lại, 'Chiếc thuyền ngoài xa' là một tác phẩm có giá trị triết lý sâu sắc, góp phần vào sự phát triển của văn học Việt Nam thời kỳ đổi mới.
----------------------
