In [None]:
import os
# Pin CUDA_VISIBLE_DEVICES to "0". This runs before torch is imported in the
# following cell, so the setting is already in the environment at import time.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

# Hub identifier of the checkpoint that will be fine-tuned.
model_name = "kormo-lm/KORMo-IFT-step-6000"

# Quantization recipe: 4-bit loading with the "nf4" quant type, double
# quantization switched on, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal LM with the quantization recipe applied; device placement is
# delegated to device_map="auto".
# NOTE(review): trust_remote_code=True presumably allows Python code shipped in
# the checkpoint repository to run locally -- confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# LoRA adapter settings: rank 128 with alpha 256 (alpha / r == 2), 5% dropout,
# no bias terms, targeting every linear layer via "all-linear".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    bias="none",
)

# Training data: only the first 1000 rows of the Capybara train split.
dataset = load_dataset(
    "trl-lib/Capybara",
    split="train[:1000]",
)

In [None]:
# Fix the random seed before the trainer is built so repeated runs start from
# the same RNG state. peft_config is only handed to SFTTrainer below, so the
# adapter is presumably created (and randomly initialised) during trainer
# construction -- seeding torch first makes that step repeatable as well.
SEED = 42
torch.manual_seed(SEED)

# Training arguments.
# NOTE(review): the output paths say "qwen3-4b" although model_name points at a
# KORMo checkpoint; the paths are kept unchanged so existing artifacts and
# downstream cells still resolve -- consider renaming them.
training_args = SFTConfig(
    output_dir="./qwen3-4b-qlora",
    num_train_epochs=3,
    # 1 sample per device x 2 accumulation steps = 2 samples per optimizer step
    # on each device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",
    logging_steps=1,
    save_strategy="epoch",
    learning_rate=2e-4,
    # bf16/tf32 match the bfloat16 compute dtype chosen for quantization.
    # NOTE(review): both presumably require a GPU generation that supports
    # them -- confirm on the target hardware.
    bf16=True,
    tf32=True,
    warmup_ratio=0.01,
    lr_scheduler_type="cosine",
    packing=True,
    max_length=1024,
    # Explicit seed shared with torch.manual_seed above.
    seed=SEED,
)

# Build the SFT trainer; the LoRA configuration is passed in rather than
# wrapping the model manually beforehand.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_args,
)

# Run training.
print("Starting training...")
trainer.train()

# Save the trained model.
# NOTE(review): with a PEFT-wrapped model this presumably writes only the
# adapter weights; the tokenizer is not saved by this call -- confirm that is
# what downstream loading expects.
trainer.model.save_pretrained("./qwen3-4b-qlora-final")