## More Sources:

https://medium.com/@mauryaanoop3/dpo-fine-tuning-for-enhanced-language-model-performance-466fec349a5e

https://huggingface.co/Qwen/Qwen2.5-3B-Instruct

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_(7B)-DPO.ipynb#scrollTo=EWGFqAo5Q2me

In [None]:
# import libraries
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
import torch
from datasets import load_dataset

from vllm import SamplingParams

## Load dataset

In [None]:
# Download the Intel Orca DPO preference pairs and keep the "train" split.
dataset_splits = load_dataset("Intel/orca_dpo_pairs")
dataset = dataset_splits['train']

# Remember the raw column names so the formatting step can drop them later.
original_columns = dataset.column_names

# Notebook display: the bare name shows the dataset summary as cell output.
dataset

## Model and LoRA adapter

In [None]:
# Hyperparameters shared by the model loader and the LoRA adapter.
max_seq_length = 1024  # raise for longer reasoning traces
lora_rank = 64  # larger rank = smarter, but slower

model_name = "Qwen/Qwen2.5-3B-Instruct"

# Load the model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # switch to False for 16-bit LoRA
    fast_inference=True,  # enables vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.5,  # lower this when running out of memory
)

# Only the query and key projections receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj"]

# Wrap the loaded model with the LoRA adapter.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # enables long-context finetuning
    random_state=3407,
)

## Data Prep

In [None]:
# Helper function to format the dataset
def chatml_format(example, default_system = "You are a helpful assistant."):
    """Convert one raw preference record into a DPO training row.

    Args:
        example: Mapping with the keys 'system', 'question', 'chosen' and
            'rejected'.
        default_system: System message used when the record's own 'system'
            entry is empty or None.

    Returns:
        A dict with three keys: 'prompt' (the system and user turns rendered
        by the module-level ``tokenizer``'s chat template, with the
        generation prompt appended), plus 'chosen' and 'rejected' copied
        unchanged from ``example``.
    """
    # Truthiness covers both "" and None; a len() check would raise
    # TypeError on a None system field.
    system_message = example['system'] or default_system

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": example['question']},
    ]

    # tokenize=False keeps the prompt as text; the trainer tokenizes later.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    return {
        "prompt": prompt,
        "chosen": example['chosen'],
        "rejected": example['rejected'],
    }

In [None]:
# Run chatml_format over every record and drop the raw input columns, so the
# rows keep only what chatml_format returns.
dataset = dataset.map(chatml_format, remove_columns=original_columns)

In [None]:
# Peek at one formatted record, printing each field under its own label.
sample = dataset[1]
for field in ('prompt', 'chosen', 'rejected'):
    print(f'{field}:\n', sample[field])

In [None]:
# Set up text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda')

# Generate output.
# max_new_tokens budgets the completion only. A total-length cap
# (max_length) also counts the prompt tokens, so a long system + question
# prompt could be cut short and leave no room for an answer.
outputs = generator(
    sample['prompt'],
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,  # make sure the temperature below is actually applied
    temperature=0.7,
)

# generated_text holds the prompt followed by the model's continuation.
print(outputs[0]['generated_text'])

In [None]:
# Hold out 10% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
split_dataset = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

In [None]:
# Example prompt taken from the validation split
sample = val_dataset[1]

# Set up text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda')

# Generate output.
# max_new_tokens budgets the completion only. A total-length cap
# (max_length) also counts the prompt tokens, so a long system + question
# prompt could be cut short and leave no room for an answer.
outputs = generator(
    sample['prompt'],
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,  # make sure the temperature below is actually applied
    temperature=0.7,
)

# generated_text holds the prompt followed by the model's continuation.
print(outputs[0]['generated_text'])

In [None]:
# Print the summary of the training split, then the validation split.
for split in (train_dataset, val_dataset):
    print(split)

## Training process

In [None]:
# One must patch the DPO Trainer first!
# Unsloth's patch hook is called here, before the DPOTrainer instance is
# constructed further down in the notebook.
PatchDPOTrainer()

In [None]:
# training arguments

# Output directory derived from the model id, e.g.
# "Qwen/Qwen2.5-3B-Instruct" -> "Qwen2.5-3B-DPO". Taking the last path
# component also works for ids that have no organisation prefix.
ft_model_name = model_name.split('/')[-1].replace("Instruct", "DPO")

# Choose the mixed-precision mode the GPU supports instead of assuming
# bfloat16; fall back to fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = DPOConfig(
    output_dir=ft_model_name,
    logging_steps=25,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device
    bf16=use_bf16,
    fp16=not use_bf16,
    num_train_epochs=1,
    # Keep the checkpoint with the lowest validation loss. Saving and
    # evaluating both happen once per epoch, so the two strategies match.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="epoch",
    # eval_steps is only read with eval_strategy="steps", so it is not set.
    eval_strategy="epoch",
    report_to="none",
)

# CUDA device handle; not referenced by the training arguments above.
device = torch.device('cuda')

In [None]:
# Build the DPO trainer from the LoRA-wrapped model, the training
# configuration and the train/validation splits; the tokenizer is passed as
# the processing class.
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

In [None]:
# Start training
# Runs the trainer built above with `training_args`, using `train_dataset`
# for training and `val_dataset` for evaluation.
dpo_trainer.train()

## Inference

In [None]:
# Take the model held by the trainer after training.
# NOTE(review): nothing is loaded from disk here; this is presumably the same
# object as `model`, which was passed to DPOTrainer. Confirm before treating
# `model` as an untouched baseline.
ft_model = dpo_trainer.model

In [None]:
# Example prompt: the first row of the validation split, reused by the
# generation cells that follow.
sample = val_dataset[0]

In [None]:
# base model results
#
# `model` is the LoRA-wrapped object that was handed to the trainer, so its
# adapter weights have been updated by training. To see what the base model
# produces, generation runs with the adapter temporarily disabled.

# Set up text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda')

# Generate output with the LoRA adapter switched off; it is re-enabled when
# the `with` block exits.
with model.disable_adapter():
    outputs = generator(sample['prompt'], max_length=max_seq_length, truncation=True, num_return_sequences=1, temperature=0.7)

print(outputs[0]['generated_text'])

In [None]:
# fine-tune model results

# Build a text-generation pipeline around the model taken from the trainer.
generator = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    device='cuda',
)

# Produce a single completion for the selected validation prompt.
generation_options = {
    "max_length": max_seq_length,
    "truncation": True,
    "num_return_sequences": 1,
    "temperature": 0.7,
}
outputs = generator(sample['prompt'], **generation_options)

first_result = outputs[0]
print(first_result['generated_text'])