# Qwen3: Direct Preference Optimization (DPO) with TRL

This notebook demonstrates how to align a language model with Direct Preference Optimization (DPO) using the TRL library, starting from a previously trained Supervised Fine-Tuning (SFT) LoRA adapter.

## Install required libraries

In [None]:
!pip install -U transformers torch torchaudio torchvision trl peft bitsandbytes accelerate datasets

Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting torchvision
  Downloading torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.m

In [None]:
# Authenticate with Weights & Biases; the training configuration further down reports to wandb.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwoosh98[0m ([33mwoosh98-inha-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



## Import libraries

In [None]:
import torch
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# prompt: /content/prepared_finetuning_dataset_splits_arrow_dpo.zip Ïù¥ ÏïïÏ∂ïÌååÏùºÏùÑ Ìë∏Îäî ÏΩîÎìúÎ•º ÏûëÏÑ±Ìï¥ Ï§ò

!unzip /content/prepared_finetuning_dataset_splits_arrow_dpo.zip -d /content/

Archive:  /content/prepared_finetuning_dataset_splits_arrow_dpo.zip
   creating: /content/prepared_finetuning_dataset_splits_arrow_dpo/
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/dataset_dict.json  
   creating: /content/prepared_finetuning_dataset_splits_arrow_dpo/test/
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/test/data-00000-of-00001.arrow  
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/test/dataset_info.json  
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/test/state.json  
   creating: /content/prepared_finetuning_dataset_splits_arrow_dpo/train/
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/train/data-00000-of-00001.arrow  
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/train/dataset_info.json  
  inflating: /content/prepared_finetuning_dataset_splits_arrow_dpo/train/state.json  
   creating: /content/prepared_finetuning_dataset_splits_arrow_dpo/valida

## Load dataset

In [None]:
# Directory of the extracted dataset, relative to the current working directory.
dataset_path = "prepared_finetuning_dataset_splits_arrow_dpo"

# Read the Arrow-format dataset back from disk and pick the two splits used for training.
dataset = load_from_disk(dataset_path)
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# Report that loading finished, followed by the size of each split.
print("Îç∞Ïù¥ÌÑ∞ÏÖã Î°úÎìú ÏôÑÎ£å!")
print(f"ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: {len(train_dataset)}")
print(f"Í≤ÄÏ¶ù Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: {len(eval_dataset)}")

Îç∞Ïù¥ÌÑ∞ÏÖã Î°úÎìú ÏôÑÎ£å!
ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: 610
Í≤ÄÏ¶ù Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: 305


## Configuration

3. Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä ÏÑ§Ï†ï (QLoRA Ï†ÅÏö©)
A100 GPUÏùò Î©îÎ™®Î¶¨Î•º Ìö®Ïú®Ï†ÅÏúºÎ°ú ÏÇ¨Ïö©ÌïòÍ∏∞ ÏúÑÌï¥, 4ÎπÑÌä∏ ÏñëÏûêÌôî(Quantization)ÏôÄ LoRA(Low-Rank Adaptation)Î•º Ìï®Íªò ÏÇ¨Ïö©ÌïòÎäî QLoRA Î∞©ÏãùÏúºÎ°ú Î™®Îç∏ÏùÑ Î°úÎìúÌï©ÎãàÎã§.

In [None]:
# Model configuration
model_name = "Qwen/Qwen3-4B"  # You can change this to any model you want to fine-tune

# LoRA (PEFT) settings for the adapter that is trained during DPO.
# The target modules are the projection layers of the Qwen3 decoder blocks listed below.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM",
)

# 4-bit QLoRA quantization settings.
# bfloat16 is used as the compute dtype (the original note targets an A100 GPU, which supports it).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The model itself is loaded in the "Load model and tokenizer" section. Loading a 4-bit copy
# here as well would only be discarded when `model` is reassigned there, costing a full
# checkpoint download and GPU memory for nothing. Pass `quantization_config` to that later
# `from_pretrained` call if a QLoRA run is wanted.

# Training configuration
# NOTE(review): these values are not referenced by the DPO cells visible in this notebook
# (DPOConfig sets its own values); they look like leftovers from an SFT template.
use_peft = True  # Set to True to use Parameter-Efficient Fine-Tuning (PEFT)
output_dir = "./output/sft-model"
num_train_epochs = 1
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
learning_rate = 2e-4 if use_peft else 2e-5  # Higher learning rate for PEFT

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Í≤ÄÏ¶ùÎêú ÏÑ§Ï†ïÏúºÎ°ú Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎî© ÏôÑÎ£å!


## Load model and tokenizer

In [None]:
from peft import PeftModel

# Base checkpoint in bfloat16. The KV cache is turned off because it is not used while training.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_cache=False,
)

# Put the SFT LoRA adapter (published on the Hugging Face Hub) on top of the base model.
sft_model = PeftModel.from_pretrained(
    model,
    "totalcream/Qwen3-4B-finetunning-Adapter",  # Hub repo id of the SFT adapter
    is_trainable=True,  # keep the adapter weights trainable for the DPO stage
    device_map="auto",
)
# From here on `model` is the adapter-wrapped model; this is what DPOTrainer receives.
model = sft_model

# Tokenizer of the base checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/845 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/66.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
# Final hyperparameters for DPO training.
training_args = DPOConfig(
    output_dir="./qwen3-dpo-final-results",  # where checkpoints and results are written
    num_train_epochs=2,                   # the training split has 610 examples; 2-3 epochs is the intended range
    per_device_train_batch_size=2,        # batch size chosen with A100 memory in mind
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,        # effective batch size 16 (2 * 8) for more stable updates
    gradient_checkpointing=True,          # trade compute for lower memory use
    optim="paged_adamw_8bit",             # memory-efficient optimizer

    logging_steps=10,                     # log the training loss every 10 steps
    save_steps=50,                        # save the adapter every 50 steps
    # `eval_steps` is only honoured when the evaluation strategy is "steps"; with the default
    # strategy ("no") the validation split is never evaluated, which is why the recorded run
    # shows a Training Loss column only.
    eval_strategy="steps",
    eval_steps=50,                        # run validation every 50 steps

    learning_rate=5e-7,                   # deliberately low learning rate for DPO stability
    beta=0.1,                             # strength of the DPO loss; 0.1 is the usual starting point

    lr_scheduler_type="linear",           # linear learning-rate schedule
    warmup_ratio=0.1,                     # ramp the learning rate up over the first 10% of training

    report_to="wandb",                    # enable Weights & Biases logging
    run_name="qwen3-4b-dpo-custom-data",  # run name shown in W&B
)

print("Í≤ÄÏ¶ùÎêú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞Î°ú DPO ÌïôÏäµ ÏÑ§Ï†ï ÏôÑÎ£å!")

Í≤ÄÏ¶ùÎêú ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞Î°ú DPO ÌïôÏäµ ÏÑ§Ï†ï ÏôÑÎ£å!


In [None]:
# Build the DPO trainer. The tokenizer is handed over through the `processing_class` argument.
# No separate reference model is supplied (`ref_model=None`).
# NOTE(review): `model` is already a PeftModel and a `peft_config` is passed as well — check how
# the installed TRL version combines the two (presumably the existing adapter is merged into the
# base weights before the new LoRA adapter is added).
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    peft_config=peft_config,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Announce the start, run DPO training, then announce completion.
print("Qwen3 Î™®Îç∏ DPO ÌååÏù∏ÌäúÎãùÏùÑ ÏãúÏûëÌï©ÎãàÎã§...")
trainer.train()
print("DPO ÌïôÏäµÏù¥ ÏÑ±Í≥µÏ†ÅÏúºÎ°ú ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§!")



Extracting prompt in train dataset:   0%|          | 0/610 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/610 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/610 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/305 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/305 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Qwen3 Î™®Îç∏ DPO ÌååÏù∏ÌäúÎãùÏùÑ ÏãúÏûëÌï©ÎãàÎã§...


[34m[1mwandb[0m: Currently logged in as: [33mwoosh98[0m ([33mwoosh98-inha-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.6996
20,0.6969
30,0.7129
40,0.6586
50,0.6878
60,0.7041
70,0.6759




DPO ÌïôÏäµÏù¥ ÏÑ±Í≥µÏ†ÅÏúºÎ°ú ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§!


## Save the fine-tuned model

In [None]:
# Save the final model (the LoRA adapter held by the trainer).
final_adapter_path = "./qwen3-dpo-final-adapter"
trainer.save_model(final_adapter_path)

# Tell the user where the adapter went and outline how to test it.
# The first step previously named "Qwen2"; the base model used in this notebook is Qwen3.
print(f"ÌïôÏäµÎêú LoRA Ïñ¥ÎåëÌÑ∞Í∞Ä '{final_adapter_path}' Í≤ΩÎ°úÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.")
print("\n--- ÌïôÏäµÎêú Î™®Îç∏ ÌÖåÏä§Ìä∏ Î∞©Î≤ï ÏòàÏãú ---")
print("1. ÏõêÎ≥∏ Qwen3 Î™®Îç∏ÏùÑ Î°úÎìúÌï©ÎãàÎã§.")
print("2. PeftModel.from_pretrained()Î•º ÏÇ¨Ïö©Ìï¥ Ï†ÄÏû•Îêú Ïñ¥ÎåëÌÑ∞Î•º Ï†ÅÏö©Ìï©ÎãàÎã§.")
print("3. Î≥ëÌï©Îêú Î™®Îç∏Î°ú Ï∂îÎ°†ÏùÑ ÏàòÌñâÌï©ÎãàÎã§.")

# Example code for loading and testing the model later.
# NOTE(review): the adapter saved above was trained starting from the SFT adapter; confirm
# whether the SFT adapter has to be applied/merged first before loading this one.
# from peft import PeftModel
#
# # 1. Reload the original base model (in bfloat16)
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )
#
# # 2. Combine it with the trained LoRA adapter
# tuned_model = PeftModel.from_pretrained(base_model, final_adapter_path)
#
# # 3. Run inference
# prompt = "Enter the prompt to test here."
# inputs = tokenizer(prompt, return_tensors="pt").to(tuned_model.device)
# outputs = tuned_model.generate(**inputs, max_new_tokens=100)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

ÌïôÏäµÎêú LoRA Ïñ¥ÎåëÌÑ∞Í∞Ä './qwen3-dpo-final-adapter' Í≤ΩÎ°úÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.

--- ÌïôÏäµÎêú Î™®Îç∏ ÌÖåÏä§Ìä∏ Î∞©Î≤ï ÏòàÏãú ---
1. ÏõêÎ≥∏ Qwen2 Î™®Îç∏ÏùÑ Î°úÎìúÌï©ÎãàÎã§.
2. PeftModel.from_pretrained()Î•º ÏÇ¨Ïö©Ìï¥ Ï†ÄÏû•Îêú Ïñ¥ÎåëÌÑ∞Î•º Ï†ÅÏö©Ìï©ÎãàÎã§.
3. Î≥ëÌï©Îêú Î™®Îç∏Î°ú Ï∂îÎ°†ÏùÑ ÏàòÌñâÌï©ÎãàÎã§.


In [None]:
# prompt: write code that saves the SFT adapter

# `os` is first imported in a later cell; import it here so this cell also works
# on a fresh kernel instead of raising NameError.
import os

# Target directory for the adapter.
# NOTE(review): despite the variable name this is a path relative to the working directory,
# not a location under the mounted /content/drive — confirm the intended destination.
drive_save_path = "output/Adapter"
# Ensure the directory exists
os.makedirs(drive_save_path, exist_ok=True)
# Save the adapter weights of the trainer's model
trainer.model.save_pretrained(drive_save_path)
print(f"Adapter model saved to: {drive_save_path}")

Adapter model saved to: output/Adapter


In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import PeftModel
from huggingface_hub import HfApi, login
from google.colab import userdata
import os

# Merge the SFT adapter and then the DPO adapter into a 4-bit quantized base model,
# save the result locally and upload the folder to the Hugging Face Hub.

# --- 0) Install bitsandbytes (one time only) ---
# !pip install bitsandbytes

# --- Settings ---
# The Hub token is read from the Colab user secrets and used to log in.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)
api = HfApi()

model_name = "Qwen/Qwen3-4B"
sft_adapter_repo = "totalcream/Qwen3-4B-finetunning-Adapter"
final_dpo_adapter_path = "./qwen3-dpo-final-adapter"
hf_repo_name_merged = "totalcream/Qwen3-4B-SFT-DPO-Merged"
local_save_path = "./qwen3-sft-dpo-merged-final"

# --- 1) 4-bit quantization settings ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,              # load weights in 4-bit
    bnb_4bit_quant_type="nf4",      # NF4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# --- 2) Load the base model (4-bit) ---
print("1) 4-bit ÏñëÏûêÌôîÎêú Î≤†Ïù¥Ïä§ Î™®Îç∏ Î°úÎìú")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# --- 3) Apply the SFT adapter -> merge ---
# NOTE(review): merging LoRA weights into a 4-bit quantized model presumably goes through
# dequantize/requantize and may lose precision; the bf16 merge cell later in this notebook
# avoids quantization — confirm which artifact is wanted.
print("2) SFT Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö© Î∞è Î≥ëÌï©")
sft_model = PeftModel.from_pretrained(base_model, sft_adapter_repo)
sft_merged = sft_model.merge_and_unload()  # author's note: plain AutoModelForCausalLM, still 4-bit

# --- 4) Apply the DPO adapter -> merge ---
print("3) DPO Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö© Î∞è Î≥ëÌï©")
dpo_model = PeftModel.from_pretrained(sft_merged, final_dpo_adapter_path)
final_model = dpo_model.merge_and_unload()  # author's note: plain AutoModelForCausalLM, again 4-bit

# --- 5) Save the final model (.safetensors) ---
print(f"4) ÏµúÏ¢Ö Î™®Îç∏ Ï†ÄÏû•: {local_save_path}")
os.makedirs(local_save_path, exist_ok=True)
final_model.save_pretrained(
    local_save_path,
    safe_serialization=True  # write the weights as model.safetensors
)
# The base tokenizer is saved next to the weights so the folder is self-contained.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(local_save_path)

# --- 6) Upload to the Hub ---
print(f"5) ÌóàÎ∏åÏóê ÏóÖÎ°úÎìú: {hf_repo_name_merged}")
api.create_repo(hf_repo_name_merged, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path=local_save_path,
    repo_id=hf_repo_name_merged,
    repo_type="model",
    commit_message="Upload 4bit-quantized SFT+DPO merged model"
)

print("üéâ ÏôÑÎ£å! ÌóàÎ∏åÏóêÏÑú ÏñëÏûêÌôîÎêú model.safetensors ÌôïÏù∏ÌïòÏÑ∏Ïöî.")


1) 4-bit ÏñëÏûêÌôîÎêú Î≤†Ïù¥Ïä§ Î™®Îç∏ Î°úÎìú


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2) SFT Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö© Î∞è Î≥ëÌï©




3) DPO Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö© Î∞è Î≥ëÌï©
4) ÏµúÏ¢Ö Î™®Îç∏ Ï†ÄÏû•: ./qwen3-sft-dpo-merged-final
5) ÌóàÎ∏åÏóê ÏóÖÎ°úÎìú: totalcream/Qwen3-4B-SFT-DPO-Merged


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

üéâ ÏôÑÎ£å! ÌóàÎ∏åÏóêÏÑú ÏñëÏûêÌôîÎêú model.safetensors ÌôïÏù∏ÌïòÏÑ∏Ïöî.


In [None]:
# Push `model` to the Hub repo below; the recorded output shows an adapter_model.safetensors upload.
model.push_to_hub("totalcream/Qwen3-4B-DPO-model")

adapter_model.safetensors:   0%|          | 0.00/66.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/totalcream/Qwen3-4B-DPO-model/commit/aad33aa7dbbf9c0c1f31a52ef69db5157fd35297', commit_message='Upload model', commit_description='', oid='aad33aa7dbbf9c0c1f31a52ef69db5157fd35297', pr_url=None, repo_url=RepoUrl('https://huggingface.co/totalcream/Qwen3-4B-DPO-model', endpoint='https://huggingface.co', repo_type='model', repo_id='totalcream/Qwen3-4B-DPO-model'), pr_revision=None, pr_num=None)

## Test the fine-tuned model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# --- Paths for the model and the adapter ---
model_name = "Qwen/Qwen3-4B"
adapter_path = "./qwen3-dpo-final-adapter"  # adapter directory written after DPO training

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- Models ---
# 1. The original base model.
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# 2. The DPO-tuned model: base model plus LoRA adapter.
# NOTE: PeftModel.from_pretrained wraps `original_model` in place (the printed module tree
# shows lora.Linear layers inside the same Qwen3ForCausalLM), so both names share one set of
# modules. Use `tuned_model.disable_adapter()` when a true base-model output is needed.
tuned_model = PeftModel.from_pretrained(original_model, adapter_path)

print("ÏõêÎ≥∏ Î™®Îç∏Í≥º DPO ÌäúÎãù Î™®Îç∏ Î°úÎî© ÏôÑÎ£å!")

# Switch both handles to inference mode.
original_model.eval()
tuned_model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

ÏõêÎ≥∏ Î™®Îç∏Í≥º DPO ÌäúÎãù Î™®Îç∏ Î°úÎî© ÏôÑÎ£å!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(


In [None]:
# --- Test prompt (sent to the model verbatim) ---
prompt = """
ÎãπÏã†ÏùÄ Ï†ÑÍ∏∞Ï†ÑÏûêÍ≥µÌïôÎ∂Ä Ï†ÑÎ¨∏Í∞ÄÏûÖÎãàÎã§. ÌöåÎ°úÏù¥Î°†Ïóê Í¥ÄÌï¥ÏÑú Í∞ùÍ¥ÄÏãù Î¨∏Ï†úÎ•º ÌïòÎÇò ÏÉùÏÑ±ÌïòÏÑ∏Ïöî. ÏÉùÏÑ±Ìï† ÎÇ¥Ïö©ÏùÄ Îã§ÏùåÍ≥º Í∞ôÏäµÎãàÎã§.

ÏßàÎ¨∏:
Î≥¥Í∏∞1:
Î≥¥Í∏∞2:
Î≥¥Í∏∞3:
Î≥¥Í∏∞4:
Î≥¥Í∏∞5:
Ï†ïÎãµ:
Ìï¥ÏÑ§:

Î∞òÎìúÏãú 'Ìï¥ÏÑ§' Î∂ÄÎ∂ÑÍπåÏßÄ Î™®Îëê Ìè¨Ìï®ÌïòÏó¨ ÏúÑÏùò ÌòïÏãùÏóê ÎßûÏ∂∞ Ï†ÑÏ≤¥ Î¨∏Ï†úÎ•º ÏôÑÏÑ±Ìï¥ Ï£ºÏÑ∏Ïöî.
"""

# --- Wrap the prompt with the tokenizer's chat template ---
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
# Tokenize the rendered chat and move the tensors to the model's device.
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(original_model.device)

# --- Sampling parameters shared by both generations ---
generation_args = dict(
    max_new_tokens=2048,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)


# ==========================================================
# 1. Response of the original (un-adapted) model
# ==========================================================
print("‚úÖ 1. ÏõêÎ≥∏ Qwen3 Î™®Îç∏Ïùò ÏùëÎãµ")
print("==========================================================")
# `tuned_model` was created by wrapping `original_model`, and PEFT injects the LoRA layers
# into that same module tree. Without disabling the adapter, `original_model.generate` would
# therefore also run through the DPO adapter and the comparison would be meaningless.
with torch.no_grad(), tuned_model.disable_adapter():
    original_outputs = original_model.generate(**inputs, **generation_args)
# Decode only the newly generated tokens (everything after the prompt).
original_response = tokenizer.decode(original_outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print(original_response)


# ==========================================================
# 2. Response of the DPO-tuned model (adapter active)
# ==========================================================
print("\n\n‚úÖ 2. DPO ÌäúÎãùÎêú Î™®Îç∏Ïùò ÏùëÎãµ")
print("==========================================================")
with torch.no_grad():
    tuned_outputs = tuned_model.generate(**inputs, **generation_args)
# Decode only the newly generated tokens (everything after the prompt).
tuned_response = tokenizer.decode(tuned_outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print(tuned_response)

‚úÖ 1. ÏõêÎ≥∏ Qwen3 Î™®Îç∏Ïùò ÏùëÎãµ
<think>
Okay, the user wants me to create an objective question about circuit theory for an electrical engineering student. Let me start by recalling some key concepts in circuit theory. Maybe something related to Kirchhoff's laws or Ohm's law? Oh, Kirchhoff's Voltage Law (KVL) is a good topic. It states that the sum of all voltages around a closed loop is zero.

I need to make sure the question is clear and the answer is unambiguous. Let me think of a scenario where KVL is applied. For example, a simple loop with a battery and two resistors. The question could ask for the voltage drop across a resistor. 

Wait, the options should be different but plausible. Let me set up the problem. Suppose a loop with a 12V battery, a 2Œ© resistor, and a 4Œ© resistor. The total resistance is 6Œ©, so the current would be 2A. Then the voltage drop across the 2Œ© resistor is 4V, and across the 4Œ© resistor is 8V. 

But the question should be phrased as a multiple-ch

In [None]:
# NOTE(review): this targets the "burtenshaw" namespace and a "Qwen3-30B-A3B" repo name, while the
# rest of the notebook works with Qwen/Qwen3-4B under "totalcream" — looks like a leftover from a
# template; confirm the intended repo before running.
model.push_to_hub("burtenshaw/Qwen3-30B-A3B-python-code")

In [None]:
from huggingface_hub import notebook_login

# Opens the Hugging Face login widget; enter an access token when it appears.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# --- Paths and ids ---
# 1. Starting point for everything: the original base model.
base_model_name = "Qwen/Qwen3-4B"

# 2. Hub id of the SFT adapter trained in the first stage.
#    This used to be a placeholder; it now matches the adapter repo used for training above.
my_sft_model_hub_id = "totalcream/Qwen3-4B-finetunning-Adapter"

# 3. Local path of the final DPO adapter from the second stage.
#    This must be the directory written by `trainer.save_model(...)` earlier in the notebook.
final_dpo_adapter_path = "./qwen3-dpo-final-adapter"

# --- Load the model and merge the adapters one after the other ---
print("Î™®Îç∏ Î≥ëÌï©ÏùÑ ÏãúÏûëÌï©ÎãàÎã§...")

# 1. Load the full base model in bfloat16 without quantization
#    (the whole model has to be in memory for merging).
print("1Îã®Í≥Ñ: ÏõêÎ≥∏ Î≤†Ïù¥Ïä§ Î™®Îç∏ Î°úÎî©...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# 2. Apply the SFT adapter to the base model and fold it into the weights.
#    Merging before the next adapter is loaded keeps the two adapters from being stacked as
#    nested PeftModel wrappers, and mirrors the order used in the 4-bit merge cell above.
print("2Îã®Í≥Ñ: SFT Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö©...")
sft_model = PeftModel.from_pretrained(base_model, my_sft_model_hub_id)
sft_merged = sft_model.merge_and_unload()

# 3. Apply the final DPO adapter on top of the SFT-merged model.
print("3Îã®Í≥Ñ: ÏµúÏ¢Ö DPO Ïñ¥ÎåëÌÑ∞ Ï†ÅÏö©...")
dpo_model = PeftModel.from_pretrained(sft_merged, final_dpo_adapter_path)


# 4. Fold the remaining adapter weights into the model so no PEFT wrapper is left.
print("4Îã®Í≥Ñ: Î™®Îì† Ïñ¥ÎåëÌÑ∞ Í∞ÄÏ§ëÏπò Î≥ëÌï©...")
merged_model = dpo_model.merge_and_unload()

print("‚úÖ Î™®Îì† Î™®Îç∏ Î≥ëÌï© ÏôÑÎ£å!")

# Load the tokenizer as well (needed when uploading).
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token