In [None]:
# Mount Google Drive so the /content/drive/MyDrive paths used in later cells
# are available to this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install "unsloth[torch]" trl datasets accelerate


In [None]:
# Drive location of the SFT-stage model; it is passed as `model_name` when the
# model and tokenizer are loaded for DPO training.
sft_model_path = "/content/drive/MyDrive/tinyllama-lora-sft-tuned-model"


In [None]:
from pathlib import Path

# Keep unsloth ahead of trl: Unsloth asks to be imported before the libraries
# it patches, so this group is intentionally not alphabetized.
from unsloth import FastLanguageModel
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset
import torch

# Load the SFT-stage model and its tokenizer from `sft_model_path`, requesting
# float16 weights with 4-bit loading enabled.
sft_load_kwargs = dict(
    model_name=sft_model_path,
    dtype=torch.float16,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**sft_load_kwargs)


In [None]:
# Preference data for DPO: take the first NUM_TRAIN_EXAMPLES rows of the
# train split, then print one record to inspect its fields.
PREFERENCE_DATASET = "yitingxie/rlhf-reward-datasets"
NUM_TRAIN_EXAMPLES = 10000

full_train_split = load_dataset(PREFERENCE_DATASET, split="train")
dataset = full_train_split.select(range(NUM_TRAIN_EXAMPLES))
print(dataset[0])

In [None]:
# DPO training configuration. Only the options below are set explicitly;
# every other option keeps its DPOConfig default.
dpo_config_kwargs = dict(
    output_dir="/content/drive/MyDrive/tinyllama-dpo-trained",  # checkpoints go to Drive
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=100,
    bf16=False,  # bfloat16 explicitly disabled
)
training_args = DPOConfig(**dpo_config_kwargs)


In [None]:
# Wire the loaded model, tokenizer, preference dataset and config into a DPO trainer.
# NOTE(review): no `ref_model` is passed. Confirm which weights TRL uses as the
# DPO reference policy when `model` carries LoRA adapters (base vs. SFT weights).
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)


In [None]:
# Run DPO training and show the returned result as this cell's output.
train_result = trainer.train()
train_result


In [None]:
# Merge the DPO-trained LoRA adapters into the base weights and export the result.

# Find the newest checkpoint instead of hard-coding a step number: the final
# step count changes with dataset size, batch size and number of epochs, so a
# fixed "checkpoint-3750" silently breaks when any of those change.
dpo_output_dir = Path("/content/drive/MyDrive/tinyllama-dpo-trained")
checkpoint_dirs = sorted(
    (
        candidate
        for candidate in dpo_output_dir.glob("checkpoint-*")
        if candidate.is_dir() and candidate.name.rpartition("-")[2].isdecimal()
    ),
    key=lambda candidate: int(candidate.name.rpartition("-")[2]),
)
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoint-<step> directories found in {dpo_output_dir}; run training first."
    )
final_checkpoint = checkpoint_dirs[-1]  # highest step number = latest checkpoint
print(f"Loading adapters from: {final_checkpoint}")

# Reload with adapters attached; 4-bit loading is disabled so the merge happens
# on unquantized float16 weights.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=str(final_checkpoint),
    dtype=torch.float16,
    load_in_4bit=False,  # Set to False before merging
)

# ✅ Merge the LoRA adapters into the base model
model = model.merge_and_unload()

# ✅ Save the merged model and tokenizer
save_path = "/content/drive/MyDrive/tinyllama-dpo-merged-final"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Final merged model saved to: {save_path}")

"""
Summary of the pipeline:

1. The SFT model was trained with LoRA adapters.
2. In the DPO stage, the same SFT model was trained further on reward
   (preference) data, still using LoRA.
3. Finally, the adapters were merged into the base model.

So the final model has:

- base TinyLlama knowledge
- SFT instruction knowledge
- reward-based DPO tuning
"""