<a href="https://colab.research.google.com/github/ChintPatel/CMPE258-HW5/blob/main/HW5_Unsloth_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 🚀 D–G on facebook/opt-125m — setup
!pip install --upgrade pip
!pip install unsloth transformers accelerate bitsandbytes datasets trl

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig, ORPOTrainer, ORPOConfig, SFTTrainer

MODEL = "facebook/opt-125m"
MAX_LEN = 512
DEVICE  = "cuda"

# Load & quantize lightly
base, tok = FastLanguageModel.from_pretrained(
    model_name     = MODEL,
    max_seq_length = MAX_LEN,
    load_in_4bit   = True,
    dtype          = torch.float16,
    device_map     = "auto",
)


Collecting pip
  Downloading pip-25.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1
Collecting unsloth
  Downloading unsloth-2025.4.1-py3-none-any.whl.metadata (46 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting unsloth_zoo>=2025.4.1 (from unsloth)
  Downloading unsloth_zoo-2025.4.1-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.2


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.1: Fast Opt patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [3]:
# ─── Part D: DPO Reward Modeling ─────────────────────────────────────────────
from datasets import load_dataset, Dataset
from trl import DPOTrainer, DPOConfig

# 1) Load your own DPO data:
#    It must have columns: "prompt", "chosen", "rejected".
#    Example JSONL format:
#      {"prompt":"Tell me a joke","chosen":"Why did...","rejected":"Here’s a story..."}
try:
    ds_dpo = load_dataset("json", data_files="dpo_data.jsonl", split="train")
except FileNotFoundError:
    # Only a *missing* file triggers the fallback; a malformed file or any other
    # error still surfaces instead of silently training on the dummy data.
    print("dpo_data.jsonl not found - falling back to a 2-example dummy dataset.")
    ds_dpo = Dataset.from_list([
        {"prompt":"What is 2+2?","chosen":"4","rejected":"5"},
        {"prompt":"Greet me","chosen":"Hello there!","rejected":"Hi."}
    ])

# 2) Reload a clean base model and attach fresh LoRA adapters
#    (rank 4 on the attention q/v projections only).
m_dpo, t_dpo = FastLanguageModel.from_pretrained(
    MODEL, max_seq_length=MAX_LEN,
    load_in_4bit=True, dtype=torch.float16, device_map="auto"
)
m_dpo = FastLanguageModel.get_peft_model(
    m_dpo, r=4, target_modules=["q_proj","v_proj"],
    lora_alpha=8, lora_dropout=0.1, bias="none"
)

# 3) DPOConfig & Trainer
cfg = DPOConfig(
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    fp16=True,                  # matches the float16 model load above
    output_dir="./dpo_opt125",
    report_to="none",           # no experiment-tracker logging
)
trainer = DPOTrainer(
    model     = m_dpo,
    # No separate reference model is passed. Per the TRL docs, with a PEFT model
    # the reference log-probs come from the same model with the adapter disabled.
    ref_model = None,
    train_dataset = ds_dpo,
    tokenizer = t_dpo,
    args      = cfg
)

# 4) Launch
trainer.train()



==((====))==  Unsloth 2025.4.1: Fast Opt patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Making `model.base_model.model.model.decoder` require gradients


Extracting prompt in train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 147,456/82,919,424 (0.18% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.3594,0.028109,0.061186,0.5,-0.033076,-23.773354,-21.229143,-4.628906,-4.375197,0,0,0,0


TrainOutput(global_step=1, training_loss=0.3593853712081909, metrics={'train_runtime': 24.186, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.041, 'total_flos': 0.0, 'train_loss': 0.3593853712081909, 'epoch': 1.0})

In [5]:
# ─── Part D (continued): ORPO Reward Modeling ────────────────────────────────
from datasets import load_dataset, Dataset
from trl import ORPOTrainer, ORPOConfig

# 1) Load your ORPO data: expect columns like "prompt", "chosen", "rejected"
#    or whatever format your ORPO JSONL uses.
try:
    ds_orpo = load_dataset("json", data_files="orpo_data.jsonl", split="train")
except FileNotFoundError:
    # Only a *missing* file triggers the fallback; a malformed file or any other
    # error still surfaces instead of silently training on the dummy data.
    print("orpo_data.jsonl not found - falling back to a 2-example dummy dataset.")
    ds_orpo = Dataset.from_list([
        {"prompt":"What’s 3+3?","chosen":"6","rejected":"5"},
        {"prompt":"Say hello","chosen":"Hello there!","rejected":"Hi."},
    ])

# 2) Reload a clean base model and attach fresh LoRA adapters
#    (same adapter settings as the DPO run so the two are comparable).
m_orpo, t_orpo = FastLanguageModel.from_pretrained(
    MODEL,
    max_seq_length = MAX_LEN,
    load_in_4bit   = True,
    dtype          = torch.float16,
    device_map     = "auto",
)
m_orpo = FastLanguageModel.get_peft_model(
    m_orpo,
    r              = 4,
    target_modules = ["q_proj","v_proj"],
    lora_alpha     = 8,
    lora_dropout   = 0.1,
    bias           = "none",
)

# 3) ORPOConfig & Trainer (no ref_model argument is passed to ORPOTrainer)
orpo_cfg = ORPOConfig(
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    fp16=True,                  # matches the float16 model load above
    output_dir="./orpo_opt125",
    report_to="none",           # no experiment-tracker logging
)
trainer_orpo = ORPOTrainer(
    model         = m_orpo,
    tokenizer     = t_orpo,
    train_dataset = ds_orpo,
    args          = orpo_cfg
)

# 4) Launch
trainer_orpo.train()


==((====))==  Unsloth 2025.4.1: Fast Opt patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Making `model.base_model.model.model.decoder` require gradients




Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 147,456/82,919,424 (0.18% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen,log_odds_ratio,log_odds_chosen,eval_logits / chosen,eval_logits / rejected,nll_loss
1,5.7888,-0.763952,-0.850799,1.0,0.086846,-8.507986,-7.639523,-2.963972,-3.482432,-0.401684,0.869049,0,0,5.74861


TrainOutput(global_step=1, training_loss=5.788778305053711, metrics={'train_runtime': 6.0245, 'train_samples_per_second': 0.332, 'train_steps_per_second': 0.166, 'total_flos': 0.0, 'train_loss': 5.788778305053711, 'epoch': 1.0})

In [10]:
from pathlib import Path
from peft import PeftModel

# Resume supervised fine-tuning from a previously saved LoRA adapter.
# The directory must contain 'adapter_config.json' (the recorded run failed with
# "Can't find 'adapter_config.json' at 'lora_chat'" because it was missing).
ckpt_dir = Path("lora_chat")  # no leading dot in the string

# is_trainable=True is required when resuming: PeftModel.from_pretrained loads
# adapters frozen (inference mode) by default, which would leave nothing to train.
model_ckpt = PeftModel.from_pretrained(base, ckpt_dir, is_trainable=True)

resume_args = TrainingArguments(
    output_dir="./resume_opt125",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    fp16=True,
    report_to="none",
)

# NOTE(review): `ds` is not defined in any cell visible here - it must be created
# (with a "text" column, see dataset_text_field below) before this cell runs.
# Clamp the eval subset so datasets with fewer than 50 rows do not raise in select().
eval_rows = min(50, len(ds))

trainer_resume = SFTTrainer(
    model=model_ckpt,
    tokenizer=tok,
    train_dataset=ds,         # e.g. reuse ds from DPO or any other
    eval_dataset=ds.select(range(eval_rows)),
    dataset_text_field="text",
    max_seq_length=MAX_LEN,
    args=resume_args,
)
trainer_resume.train()


ValueError: Can't find 'adapter_config.json' at 'lora_chat'

In [11]:
# Finetune on a small mental-health JSONL
# Each record is expected to provide "prompt" and "response" fields (read below).
MH_TRAIN_ROWS = 200   # upper bound on training rows taken from the file
MH_EVAL_ROWS  = 50    # upper bound on evaluation rows

ds_mh_raw = load_dataset("json", data_files="mental_health.jsonl", split="train")
# Clamp to the file's actual size so a short file does not raise in select().
ds_mh_raw = ds_mh_raw.select(range(min(MH_TRAIN_ROWS, len(ds_mh_raw))))

# Load a fresh base model instead of attaching LoRA to the shared `base`, so this
# cell can be re-run (and run after other adapter cells) without stacking adapters.
base_mh, t_mh = FastLanguageModel.from_pretrained(
    MODEL, max_seq_length=MAX_LEN,
    load_in_4bit=True, dtype=torch.float16, device_map="auto"
)


def format_mh_example(example):
    """Render one record as an instruction/input/response prompt ending in EOS."""
    return {"text": f"### Instruction:\nSupport kindly\n### Input:\n{example['prompt']}\n### Response:\n{example['response']}{t_mh.eos_token}"}


ds_mh = ds_mh_raw.map(format_mh_example, batched=False)

m_mh = FastLanguageModel.get_peft_model(
    base_mh, r=4, target_modules=["q_proj","v_proj"], lora_alpha=8, lora_dropout=0.1, bias="none"
)
mh_args = TrainingArguments(
    output_dir="./mh_opt125",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    fp16=True,
    report_to="none",
)
# NOTE(review): the eval rows are a subset of the training rows, so any eval
# metric here measures fit on seen data, not generalization.
trainer_mh = SFTTrainer(
    model=m_mh, tokenizer=t_mh, train_dataset=ds_mh,
    eval_dataset=ds_mh.select(range(min(MH_EVAL_ROWS, len(ds_mh)))),
    dataset_text_field="text", max_seq_length=MAX_LEN, args=mh_args
)
trainer_mh.train()

# Save the LoRA adapter and tokenizer side by side.
# This is not yet an Ollama export: `ollama create -f` expects a Modelfile path,
# so the adapter directory cannot be passed to it directly.
m_mh.save_pretrained("mh_opt125_adapter")
t_mh.save_pretrained("mh_opt125_adapter")
# !ollama create mh_opt125 -f mh_opt125_adapter


FileNotFoundError: Unable to find '/content/mental_health.jsonl'