In [None]:
# 🚀 D–G on facebook/opt-125m — setup
!pip install --upgrade pip
!pip install unsloth transformers accelerate bitsandbytes datasets trl

from pathlib import Path

# Unsloth is imported before transformers/trl so its patches are applied first.
from unsloth import FastLanguageModel

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer, DPOConfig, ORPOTrainer, ORPOConfig, SFTTrainer

# Shared settings for every part of the notebook:
#   MODEL   – Hugging Face model id that each section loads
#   MAX_LEN – maximum sequence length passed to Unsloth and the trainers
#   DEVICE  – target device name (not referenced by the cells shown here)
MODEL, MAX_LEN, DEVICE = "facebook/opt-125m", 512, "cuda"

# Base model + tokenizer, loaded with 4-bit weights and fp16 compute dtype.
base, tok = FastLanguageModel.from_pretrained(
    MODEL,
    max_seq_length=MAX_LEN,
    load_in_4bit=True,
    dtype=torch.float16,
    device_map="auto",
)


In [None]:
# ─── Part D: DPO Reward Modeling ─────────────────────────────────────────────
from datasets import load_dataset, Dataset
from trl import DPOTrainer, DPOConfig

# 1) Load your own DPO data.
#    It must have columns: "prompt", "chosen", "rejected".
#    Example JSONL format:
#      {"prompt":"Tell me a joke","chosen":"Why did...","rejected":"Here’s a story..."}
DPO_DATA_FILE = Path("dpo_data.jsonl")

if DPO_DATA_FILE.is_file():
    # A file that exists but cannot be parsed now raises here, instead of being
    # silently replaced by the dummy rows below.
    ds_dpo = load_dataset("json", data_files=str(DPO_DATA_FILE), split="train")
else:
    # Only a *missing* file falls back to the dummy data, so the cell still
    # runs end-to-end as a smoke test; say so loudly to avoid mistaking the
    # result for a real training run.
    print(f"[DPO] {DPO_DATA_FILE} not found - training on the 2-row dummy dataset.")
    ds_dpo = Dataset.from_list([
        {"prompt":"What is 2+2?","chosen":"4","rejected":"5"},
        {"prompt":"Greet me","chosen":"Hello there!","rejected":"Hi."}
    ])

# 2) Reload a clean model and patch it with LoRA adapters
#    (rank 4 on the attention query/value projections).
m_dpo, t_dpo = FastLanguageModel.from_pretrained(
    MODEL, max_seq_length=MAX_LEN,
    load_in_4bit=True, dtype=torch.float16, device_map="auto"
)
m_dpo = FastLanguageModel.get_peft_model(
    m_dpo, r=4, target_modules=["q_proj","v_proj"],
    lora_alpha=8, lora_dropout=0.1, bias="none"
)

# 3) DPOConfig & Trainer
cfg = DPOConfig(
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    fp16=True,                  # matches the float16 dtype the model was loaded with
    output_dir="./dpo_opt125",
    report_to="none",           # no external experiment tracker
)
trainer = DPOTrainer(
    model     = m_dpo,
    ref_model = None,           # no explicit reference model; TRL supplies the reference policy itself
    train_dataset = ds_dpo,
    tokenizer = t_dpo,
    args      = cfg
)

# 4) Launch
trainer.train()



In [None]:
# ─── Part D (continued): ORPO Reward Modeling ────────────────────────────────
from datasets import load_dataset, Dataset
from trl import ORPOTrainer, ORPOConfig

# 1) Load your ORPO data: expect columns like "prompt", "chosen", "rejected"
#    or whatever format your ORPO JSONL uses.
ORPO_DATA_FILE = Path("orpo_data.jsonl")

if ORPO_DATA_FILE.is_file():
    # A file that exists but cannot be parsed now raises here, instead of being
    # silently replaced by the dummy rows below.
    ds_orpo = load_dataset("json", data_files=str(ORPO_DATA_FILE), split="train")
else:
    # Minimal fallback so the code runs end-to-end when no data file is present;
    # announce it so the run is not mistaken for real training.
    print(f"[ORPO] {ORPO_DATA_FILE} not found - training on the 2-row dummy dataset.")
    ds_orpo = Dataset.from_list([
        {"prompt":"What’s 3+3?","chosen":"6","rejected":"5"},
        {"prompt":"Say hello","chosen":"Hello there!","rejected":"Hi."},
    ])

# 2) Reload a fresh model and patch it with LoRA adapters
#    (rank 4 on the attention query/value projections).
m_orpo, t_orpo = FastLanguageModel.from_pretrained(
    MODEL,
    max_seq_length = MAX_LEN,
    load_in_4bit   = True,
    dtype          = torch.float16,
    device_map     = "auto",
)
m_orpo = FastLanguageModel.get_peft_model(
    m_orpo,
    r              = 4,
    target_modules = ["q_proj","v_proj"],
    lora_alpha     = 8,
    lora_dropout   = 0.1,
    bias           = "none",
)

# 3) ORPOConfig & Trainer
orpo_cfg = ORPOConfig(
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    fp16=True,                  # matches the float16 dtype the model was loaded with
    output_dir="./orpo_opt125",
    report_to="none",           # no external experiment tracker
)
trainer_orpo = ORPOTrainer(
    model         = m_orpo,
    tokenizer     = t_orpo,
    train_dataset = ds_orpo,
    args          = orpo_cfg
)

# 4) Launch
trainer_orpo.train()


In [None]:
from pathlib import Path
from peft import PeftModel

# ─── Resume LoRA fine-tuning from a saved adapter ────────────────────────────
ckpt_dir = Path("lora_chat")  # no leading dot in the string

# is_trainable=True is required to *continue training*: by default
# PeftModel.from_pretrained loads the adapter frozen (inference only), which
# would leave the trainer without any parameter to update.
model_ckpt = PeftModel.from_pretrained(base, ckpt_dir, is_trainable=True)

resume_args = TrainingArguments(
    output_dir="./resume_opt125",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    fp16=True,
    report_to="none",
)

# NOTE(review): `ds` is not defined in any cell visible here. It must already
# exist and expose a "text" column (see dataset_text_field) before this runs.
# The eval rows are taken from the head of the training set, so the eval loss
# is not a held-out metric. The slice is clamped so that datasets with fewer
# than RESUME_EVAL_ROWS rows do not raise.
RESUME_EVAL_ROWS = 50

trainer_resume = SFTTrainer(
    model=model_ckpt,
    tokenizer=tok,
    train_dataset=ds,         # e.g. reuse ds from DPO or any other
    eval_dataset=ds.select(range(min(RESUME_EVAL_ROWS, len(ds)))),
    dataset_text_field="text",
    max_seq_length=MAX_LEN,
    args=resume_args,
)
trainer_resume.train()


In [None]:
# ─── Fine-tune on a small mental-health JSONL ────────────────────────────────
MH_DATA_FILE = "mental_health.jsonl"
MH_MAX_ROWS  = 200   # upper bound on the number of rows used from the file
MH_EVAL_ROWS = 50    # target size of the held-out evaluation split
MH_SEED      = 42    # makes the train/eval split reproducible

# Load a fresh model instead of re-wrapping `base`: the resume cell attaches the
# lora_chat adapter to `base`, so adding another LoRA on top of it would stack
# adapters. This mirrors the "reload a clean model" step used in Part D.
m_mh, t_mh = FastLanguageModel.from_pretrained(
    MODEL, max_seq_length=MAX_LEN,
    load_in_4bit=True, dtype=torch.float16, device_map="auto"
)
m_mh = FastLanguageModel.get_peft_model(
    m_mh, r=4, target_modules=["q_proj","v_proj"], lora_alpha=8, lora_dropout=0.1, bias="none"
)


def format_mh_example(example):
    """Render one record ("prompt" and "response" keys) as a single training string.

    The result is stored under "text", the column SFTTrainer is told to read,
    and ends with the tokenizer's EOS token.
    """
    return {"text": f"### Instruction:\nSupport kindly\n### Input:\n{example['prompt']}\n### Response:\n{example['response']}{t_mh.eos_token}"}


# Take at most MH_MAX_ROWS rows; clamping avoids an IndexError on smaller files.
ds_mh_all = load_dataset("json", data_files=MH_DATA_FILE)["train"]
ds_mh = ds_mh_all.select(range(min(MH_MAX_ROWS, len(ds_mh_all))))
ds_mh = ds_mh.map(format_mh_example, batched=False)

# Hold out eval rows rather than re-using training rows, so the eval loss is
# measured on unseen examples. With 200 rows this yields 150 train / 50 eval.
n_eval = min(MH_EVAL_ROWS, len(ds_mh) // 4)
if n_eval > 0:
    mh_split = ds_mh.train_test_split(test_size=n_eval, seed=MH_SEED)
    ds_mh_train, ds_mh_eval = mh_split["train"], mh_split["test"]
else:
    # Too few rows to spare any for evaluation.
    ds_mh_train, ds_mh_eval = ds_mh, None

mh_args = TrainingArguments(
    output_dir="./mh_opt125",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    fp16=True,
    report_to="none",
)
trainer_mh = SFTTrainer(
    model=m_mh, tokenizer=t_mh, train_dataset=ds_mh_train, eval_dataset=ds_mh_eval,
    dataset_text_field="text", max_seq_length=MAX_LEN, args=mh_args
)
trainer_mh.train()

# Save the LoRA adapter and the tokenizer files into the same directory.
m_mh.save_pretrained("mh_opt125_adapter")
t_mh.save_pretrained("mh_opt125_adapter")
# Ollama export (not run here): `ollama create <name> -f <Modelfile>` expects a
# Modelfile, not the adapter directory, so a Modelfile must be written first.
# TODO confirm Ollama can import this architecture before relying on it.
