Enterprise-Grade Implementation: DPO with TRL
To implement this, we use Hugging Face's TRL (Transformer Reinforcement Learning) library.

In [None]:
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# 1. Load Model, Reference, and Tokenizer
# In DPO, the policy and the reference usually start from the same SFT checkpoint.
model_id = "your-org/sft-llama-3-8b"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
ref_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# The trainer below is handed `tokenizer`, so it must be defined before that call.
# Load it from the same checkpoint so its vocabulary matches the model's.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Llama-family tokenizers often ship without a pad token, which batched
# preference training needs; fall back to EOS only when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Load Preference Data
# Must have columns: 'prompt', 'chosen', 'rejected'
# Without `split=`, load_dataset returns a DatasetDict ({"train": ...}) rather
# than a single Dataset; request the "train" split explicitly so the result
# can be used directly as a training set.
dataset = load_dataset("json", data_files="internal_prefs.json", split="train")

# 3. Define DPO Configuration
# Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 4 = 8.
dpo_args = DPOConfig(
    beta=0.1,                 # The "Temperature" of DPO. 0.1 is standard.
                              # Higher beta = LESS deviation from ref allowed
                              # (stronger pull toward the reference model);
                              # lower beta lets the policy drift further.
    learning_rate=5e-7,       # VERY small LR is crucial for DPO
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    output_dir="./dpo-aligned-model",
)

# 4. Build the trainer
# Collect the keyword arguments first so each input is labelled in one place,
# then unpack them into the DPOTrainer constructor.
trainer_kwargs = {
    "model": model,            # policy being optimized
    "ref_model": ref_model,    # reference model passed alongside the policy
    "args": dpo_args,
    "train_dataset": dataset,
    # NOTE(review): newer TRL releases appear to rename this keyword to
    # `processing_class` — confirm against the installed TRL version.
    "tokenizer": tokenizer,
}
trainer = DPOTrainer(**trainer_kwargs)

# 5. Run training
trainer.train()