# Complete SFT → DPO Pipeline with Model Comparison



## 1. Installation & Setup

In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth  # Do this in local & cloud setups
else:
    import torch; v = re.match(r'[\d]{1,}\.[\d]{1,}', str(torch.__version__)).group(0)
    xformers = 'xformers==' + {'2.10':'0.0.34','2.9':'0.0.33.post1','2.8':'0.0.32.post2'}.get(v, "0.0.34")
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install rouge-score bert-score

In [2]:
print("✅ All packages installed successfully!")

✅ All packages installed successfully!


## 2. Load Base Model

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 4096
dtype = None  # None for auto detection
load_in_4bit = True

# Load base model
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Set chat template
if base_tokenizer.chat_template is None:
    base_tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% endif %}"

print("✅ Base model loaded!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

✅ Base model loaded!


## 3. Data Preparation & Splitting

In [4]:
import pandas as pd
from datasets import Dataset

# Load dataset
df = pd.read_csv("data 4.csv")

# Clean dataset - remove rows with null or empty values
def is_blank(x):
    return not isinstance(x, str) or x.strip() == ""

df_clean = df[
    df["text"].apply(lambda x: isinstance(x, str) and x.strip() != "") &
    df["summary"].apply(lambda x: isinstance(x, str) and x.strip() != "") &
    df["generated_summary"].apply(lambda x: isinstance(x, str) and x.strip() != "")
].reset_index(drop=True)

print(f"Original dataset size: {len(df)}")
print(f"Cleaned dataset size: {len(df_clean)}")
print(f"Dropped rows: {len(df) - len(df_clean)}")

Original dataset size: 5000
Cleaned dataset size: 5000
Dropped rows: 0


In [5]:
# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df_clean)

# Split: 60% SFT, 30% DPO, 10% Test
# First split: 90% train, 10% test
split1 = dataset.train_test_split(test_size=0.02, seed=42)
test_dataset = split1['test']

# Second split: From remaining 90%, split into 60% SFT and 30% DPO
# This means 66.67% of the 90% goes to SFT, 33.33% goes to DPO
split2 = split1['train'].train_test_split(test_size=0.3333, seed=42)
sft_dataset = split2['train']
dpo_dataset = split2['test']

print("\n" + "="*60)
print("DATASET SPLIT SUMMARY")
print("="*60)
print(f"Total samples: {len(dataset)}")
print(f"SFT training: {len(sft_dataset)} ({len(sft_dataset)/len(dataset)*100:.1f}%)")
print(f"DPO training: {len(dpo_dataset)} ({len(dpo_dataset)/len(dataset)*100:.1f}%)")
print(f"Test: {len(test_dataset)} ({len(test_dataset)/len(dataset)*100:.1f}%)")
print("="*60)


DATASET SPLIT SUMMARY
Total samples: 5000
SFT training: 3266 (65.3%)
DPO training: 1634 (32.7%)
Test: 100 (2.0%)


## 4. Prepare SFT Dataset

In [6]:
def prepare_sft_format(examples):
    """Format data for supervised fine-tuning"""
    messages_list = []
    
    for text, summary in zip(examples["text"], examples["summary"]):
        user_prompt = (
            "You are an engaging writer.\n\n"
            "A spotlight is a short narrative teaser written as a single paragraph. "
            "It highlights ONE intriguing angle and sparks curiosity without summarizing.\n\n"
            "Write a spotlight (1-2 sentences).\n\n"
            f"### Document:\n{text}"
        )
        
        messages = [
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": summary.strip()},
        ]
        
        messages_list.append(messages)
    
    return {"messages": messages_list}

# Apply formatting
sft_formatted = sft_dataset.map(
    prepare_sft_format,
    batched=True,
    remove_columns=["text", "summary", "generated_summary"],
    desc="Formatting SFT data",
)

print(f"\n✅ SFT dataset prepared: {len(sft_formatted)} samples")
print("\nSample:")
print(sft_formatted[0]['messages'])

Formatting SFT data:   0%|          | 0/3266 [00:00<?, ? examples/s]


✅ SFT dataset prepared: 3266 samples

Sample:
[{'content': 'You are an engaging writer.\n\nA spotlight is a short narrative teaser written as a single paragraph. It highlights ONE intriguing angle and sparks curiosity without summarizing.\n\nWrite a spotlight (1-2 sentences).\n\n### Document:\nLos Angeles (CNN) -- In the final two hours of a dramatic standoff with rogue ex-Los Angeles police officer Christopher Dorner, San Bernardino County sheriff\'s deputies did not fire a single gunshot during their raid of a compound where he barricaded himself after killing one deputy and seriously wounding another, according to dispatch logs. When the SWAT team arrived on February 12, a robot-controlled tractor tore down blood-spattered walls of the vacated home near Big Bear, offering tactical teams a clean view inside the cabin, logs show. The redacted transcripts detail the chase that began after a 911 call from a Big Bear couple whom Dorner had held hostage at gunpoint and hogtied before fle

In [7]:
# Apply chat template for SFT
def apply_sft_chat_template(example, tokenizer):
    """Apply chat template to messages"""
    messages = example["messages"]
    
    # Add empty system message if needed
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    
    return {"text": text}

sft_formatted = sft_formatted.map(
    lambda x: apply_sft_chat_template(x, base_tokenizer),
    remove_columns=["messages"],
    desc="Applying chat template",
)

print("\n✅ Chat template applied")
print("\nSample formatted text:")
print(sft_formatted[0]['text'][:500] + "...")

Applying chat template:   0%|          | 0/3266 [00:00<?, ? examples/s]


✅ Chat template applied

Sample formatted text:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an engaging writer.

A spotlight is a short narrative teaser written as a single paragraph. It highlights ONE intriguing angle and sparks curiosity without summarizing.

Write a spotlight (1-2 sentences).

### Document:
Los Angeles (CNN) -- In the final two hours of a dramatic standoff with rogue ex-Los Angeles police officer Christopher Dorner, San Bernardino County sheriff's...


## 5. Stage 1: Supervised Fine-Tuning (SFT)

In [8]:
# Prepare model for SFT
sft_model, sft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Set chat template
if sft_tokenizer.chat_template is None:
    sft_tokenizer.chat_template = base_tokenizer.chat_template

# Add LoRA adapters
sft_model = FastLanguageModel.get_peft_model(
    sft_model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("\n✅ SFT model prepared with LoRA adapters")

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.



✅ SFT model prepared with LoRA adapters


In [9]:
from trl import SFTTrainer, SFTConfig

sft_trainer = SFTTrainer(
    model=sft_model,
    tokenizer=sft_tokenizer,
    train_dataset=sft_formatted,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        num_train_epochs=2,
        learning_rate=2e-4,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="sft_outputs",
        report_to="none",
    ),
)

print("\n" + "="*60)
print("STARTING SFT TRAINING")
print("="*60)

Unsloth: Tokenizing ["text"] (num_proc=64):   0%|          | 0/3266 [00:00<?, ? examples/s]



🦥 Unsloth: Padding-free auto-enabled, enabling faster training.

STARTING SFT TRAINING


In [10]:
# Train SFT model
sft_stats = sft_trainer.train()

print("\n" + "="*60)
print("SFT TRAINING COMPLETED")
print("="*60)
print(f"Training time: {sft_stats.metrics['train_runtime']:.2f} seconds")
print(f"Training time: {sft_stats.metrics['train_runtime']/60:.2f} minutes")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,266 | Num Epochs = 2 | Total steps = 410
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.407
20,2.1892
30,1.9843
40,1.9204
50,1.9298
60,1.8964
70,1.9137
80,1.8986
90,1.897
100,1.8766



SFT TRAINING COMPLETED
Training time: 1807.74 seconds
Training time: 30.13 minutes


In [11]:
# Save SFT model
sft_model.save_pretrained("llama3_spotlight_sft")
sft_tokenizer.save_pretrained("llama3_spotlight_sft")

print("\n✅ SFT model saved to 'llama3_spotlight_sft'")


✅ SFT model saved to 'llama3_spotlight_sft'


## 6. Prepare DPO Dataset

In [12]:
import re
from typing import Literal

def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"] = "sft",
    assistant_prefix="<|assistant|>\n",
):
    def _strip_prefix(s, pattern):
        return re.sub(f"^{re.escape(pattern)}", "", s)

    if task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            # Extract prompt messages
            prompt_messages = [
                [msg for msg in example["chosen"] if msg["role"] == "user"][0]
            ]
            
            # Insert system message if needed
            if example["chosen"][0]["role"] != "system":
                prompt_messages.insert(0, {"role": "system", "content": ""})
            else:
                prompt_messages.insert(0, example["chosen"][0])
            
            chosen_messages = example["chosen"][1:]
            rejected_messages = example["rejected"][1:]
            
            example["text_chosen"] = tokenizer.apply_chat_template(
                chosen_messages, tokenize=False
            )
            example["text_rejected"] = tokenizer.apply_chat_template(
                rejected_messages, tokenize=False
            )
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
            
            example["text_chosen"] = _strip_prefix(
                example["text_chosen"], assistant_prefix
            )
            example["text_rejected"] = _strip_prefix(
                example["text_rejected"], assistant_prefix
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    
    return example

In [13]:
def prepare_dpo_format(examples):
    """Prepare data for DPO training"""
    chosen_messages = []
    rejected_messages = []

    for text, summary, generated_summary in zip(
        examples["text"],
        examples["summary"],
        examples["generated_summary"]
    ):
        generated_summary_cleaned = generated_summary.replace("[SUMMARY]", "").strip()

        user_prompt = (
            "You are an engaging writer.\n\n"
            "A spotlight is a short narrative teaser written as a single paragraph. "
            "It highlights ONE intriguing angle and sparks curiosity without summarizing.\n\n"
            "Write a spotlight (1-2 sentences).\n\n"
            f"### Document:\n{text}"
        )

        # Chosen: human-written summary (better)
        chosen_messages.append([
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": summary.strip()},
        ])

        # Rejected: generated summary (worse)
        rejected_messages.append([
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": generated_summary_cleaned},
        ])

    return {
        "chosen": chosen_messages,
        "rejected": rejected_messages,
    }

# Apply DPO formatting
dpo_formatted = dpo_dataset.map(
    prepare_dpo_format,
    batched=True,
    num_proc=4,
    remove_columns=["text", "summary", "generated_summary"],
    desc="Preparing DPO format",
)

print(f"\n✅ DPO dataset prepared: {len(dpo_formatted)} samples")

Preparing DPO format (num_proc=4):   0%|          | 0/1634 [00:00<?, ? examples/s]


✅ DPO dataset prepared: 1634 samples


In [14]:
# Apply chat template for DPO
column_names = list(dpo_formatted.features)
dpo_formatted = dpo_formatted.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": sft_tokenizer, "task": "dpo"},
    num_proc=4,
    remove_columns=column_names,
    desc="Applying chat template for DPO",
)

# Rename columns
dpo_formatted = dpo_formatted.rename_columns(
    {
        "text_prompt": "prompt",
        "text_chosen": "chosen",
        "text_rejected": "rejected",
    }
)

print("\n✅ DPO dataset formatted")
print("\nSample DPO data:")
print(f"Prompt length: {len(dpo_formatted[0]['prompt'])}")
print(f"Chosen length: {len(dpo_formatted[0]['chosen'])}")
print(f"Rejected length: {len(dpo_formatted[0]['rejected'])}")

Applying chat template for DPO (num_proc=4):   0%|          | 0/1634 [00:00<?, ? examples/s]


✅ DPO dataset formatted

Sample DPO data:
Prompt length: 3122
Chosen length: 278
Rejected length: 345


## 7. Stage 2: Direct Preference Optimization (DPO)

In [15]:
# Load the SFT model for DPO training
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

dpo_model, dpo_tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama3_spotlight_sft",  # Load our saved SFT model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Set chat template
if dpo_tokenizer.chat_template is None:
    dpo_tokenizer.chat_template = sft_tokenizer.chat_template

print("\n✅ SFT model loaded for DPO training")

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

✅ SFT model loaded for DPO training


In [16]:
# Add LoRA for DPO (on top of SFT)
dpo_model = FastLanguageModel.get_peft_model(
    dpo_model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("✅ LoRA adapters added for DPO")

Unsloth: Already have LoRA adapters! We shall skip this step.


✅ LoRA adapters added for DPO


In [17]:
from trl import DPOTrainer, DPOConfig

dpo_trainer = DPOTrainer(
    model=dpo_model,
    ref_model=None,  # Use implicit reference model
    args=DPOConfig(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        num_train_epochs=2,
        learning_rate=5e-6,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.0,
        lr_scheduler_type="linear",
        seed=42,
        output_dir="dpo_outputs",
        report_to="none",
    ),
    beta=0.1,
    train_dataset=dpo_formatted,
    tokenizer=dpo_tokenizer,
    max_length=1024,
    max_prompt_length=512,
)

print("\n" + "="*60)
print("STARTING DPO TRAINING")
print("="*60)

Extracting prompt in train dataset (num_proc=64):   0%|          | 0/1634 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=64):   0%|          | 0/1634 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=64):   0%|          | 0/1634 [00:00<?, ? examples/s]




STARTING DPO TRAINING


In [18]:
# Train DPO model
dpo_stats = dpo_trainer.train()

print("\n" + "="*60)
print("DPO TRAINING COMPLETED")
print("="*60)
print(f"Training time: {dpo_stats.metrics['train_runtime']:.2f} seconds")
print(f"Training time: {dpo_stats.metrics['train_runtime']/60:.2f} minutes")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,634 | Num Epochs = 2 | Total steps = 206
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.2096,1.830216,-0.013214,0.9625,1.84343,-196.439774,-182.500626,-0.564625,-0.539653,0,0,0
20,0.1584,1.677019,-0.505755,0.9875,2.182774,-200.013397,-192.230576,-0.551604,-0.513899,No Log,No Log,No Log
30,0.0798,1.997966,-1.362199,0.9875,3.360165,-199.11853,-216.445068,-0.529569,-0.513943,No Log,No Log,No Log
40,0.0158,2.127985,-2.78575,1.0,4.913735,-199.669098,-225.341553,-0.557298,-0.558136,No Log,No Log,No Log
50,0.0078,1.998144,-4.298892,1.0,6.297035,-199.584793,-231.169479,-0.53304,-0.501671,No Log,No Log,No Log
60,0.0099,1.623006,-5.300343,1.0,6.923349,-205.200729,-250.791183,-0.552743,-0.545036,No Log,No Log,No Log
70,0.0042,1.780026,-5.800466,1.0,7.580492,-202.441803,-248.02066,-0.599831,-0.532425,No Log,No Log,No Log
80,0.003,1.534922,-6.475748,1.0,8.01067,-202.425674,-249.31572,-0.559006,-0.517942,No Log,No Log,No Log
90,0.0482,1.288625,-6.994647,0.99375,8.28327,-211.590485,-290.934631,-0.595162,-0.576117,No Log,No Log,No Log
100,0.0048,1.560372,-7.058969,1.0,8.619341,-208.779877,-269.996307,-0.578668,-0.545053,No Log,No Log,No Log



DPO TRAINING COMPLETED
Training time: 1461.68 seconds
Training time: 24.36 minutes


In [19]:
# Save DPO model
dpo_model.save_pretrained("llama3_spotlight_sft_dpo")
dpo_tokenizer.save_pretrained("llama3_spotlight_sft_dpo")

print("\n✅ SFT+DPO model saved to 'llama3_spotlight_sft_dpo'")


✅ SFT+DPO model saved to 'llama3_spotlight_sft_dpo'


## 8. Prepare Test Dataset

In [20]:
# Prepare test dataset in DPO format (to get prompts and references)
test_formatted = test_dataset.map(
    prepare_dpo_format,
    batched=True,
    num_proc=4,
    remove_columns=["text", "summary", "generated_summary"],
    desc="Preparing test data",
)

column_names = list(test_formatted.features)
test_formatted = test_formatted.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": dpo_tokenizer, "task": "dpo"},
    num_proc=4,
    remove_columns=column_names,
    desc="Applying chat template for test",
)

test_formatted = test_formatted.rename_columns(
    {
        "text_prompt": "prompt",
        "text_chosen": "chosen",
        "text_rejected": "rejected",
    }
)

print(f"\n✅ Test dataset prepared: {len(test_formatted)} samples")

Preparing test data (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template for test (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]


✅ Test dataset prepared: 100 samples


## 9. Model Comparison: Base vs SFT vs SFT+DPO

In [21]:
print("\n" + "="*80)
print("LOADING ALL MODELS FOR COMPARISON")
print("="*80)

# 1. Load Base Model
print("\n1. Loading Base Model...")
base_model_eval, base_tokenizer_eval = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
if base_tokenizer_eval.chat_template is None:
    base_tokenizer_eval.chat_template = dpo_tokenizer.chat_template
FastLanguageModel.for_inference(base_model_eval)
print("✅ Base model loaded")

# 2. Load SFT Model
print("\n2. Loading SFT Model...")
sft_model_eval, sft_tokenizer_eval = FastLanguageModel.from_pretrained(
    model_name="llama3_spotlight_sft",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(sft_model_eval)
print("✅ SFT model loaded")

# 3. Load SFT+DPO Model
print("\n3. Loading SFT+DPO Model...")
sft_dpo_model_eval, sft_dpo_tokenizer_eval = FastLanguageModel.from_pretrained(
    model_name="llama3_spotlight_sft_dpo",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(sft_dpo_model_eval)
print("✅ SFT+DPO model loaded")

print("\n" + "="*80)
print("ALL MODELS LOADED AND READY FOR EVALUATION")
print("="*80)


LOADING ALL MODELS FOR COMPARISON

1. Loading Base Model...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Base model loaded

2. Loading SFT Model...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.494 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore download

In [22]:
from tqdm import tqdm

def generate_spotlight(model, tokenizer, prompt_text, max_new_tokens=256):
    """Generate spotlight given the formatted prompt"""
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    generated_text = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    )
    
    return generated_text.strip()

In [23]:
print("\n" + "="*80)
print("RUNNING INFERENCE ON TEST SET")
print("="*80)

num_samples = len(test_formatted)
print(f"\nEvaluating on {num_samples} test samples...\n")

base_predictions = []
sft_predictions = []
sft_dpo_predictions = []
references = []

for i in tqdm(range(num_samples), desc="Generating predictions"):
    example = test_formatted[i]
    prompt_text = example["prompt"]
    ref_text = example["chosen"]
    
    # Generate with base model
    base_pred = generate_spotlight(base_model_eval, base_tokenizer_eval, prompt_text)
    
    # Generate with SFT model
    sft_pred = generate_spotlight(sft_model_eval, sft_tokenizer_eval, prompt_text)
    
    # Generate with SFT+DPO model
    sft_dpo_pred = generate_spotlight(sft_dpo_model_eval, sft_dpo_tokenizer_eval, prompt_text)
    
    references.append(ref_text)
    base_predictions.append(base_pred)
    sft_predictions.append(sft_pred)
    sft_dpo_predictions.append(sft_dpo_pred)

print(f"\n✅ Generated {len(base_predictions)} predictions for each model!")


RUNNING INFERENCE ON TEST SET

Evaluating on 100 test samples...



Generating predictions:   0%|                                                          | 0/100 [00:00<?, ?it/s]Generating predictions:   1%|▌                                                 | 1/100 [00:26<43:34, 26.41s/it]Generating predictions:   2%|█                                                 | 2/100 [01:01<51:40, 31.64s/it]Generating predictions:   3%|█▌                                                | 3/100 [01:36<53:33, 33.13s/it]Generating predictions:   4%|██                                                | 4/100 [02:02<48:38, 30.40s/it]Generating predictions:   5%|██▌                                               | 5/100 [02:33<48:00, 30.32s/it]Generating predictions:   6%|███                                               | 6/100 [03:07<49:59, 31.91s/it]Generating predictions:   7%|███▌                                              | 7/100 [03:43<51:17, 33.09s/it]Generating predictions:   8%|████                                              | 8/100 [04:18<51:50, 33


✅ Generated 100 predictions for each model!





## 10. Calculate Metrics

In [24]:
print("\n" + "="*80)
print("CALCULATING ROUGE SCORES")
print("="*80)

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def calculate_rouge_scores(predictions, references):
    """Calculate average ROUGE scores"""
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    
    return {
        'rouge1': sum(rouge1_scores) / len(rouge1_scores),
        'rouge2': sum(rouge2_scores) / len(rouge2_scores),
        'rougeL': sum(rougeL_scores) / len(rougeL_scores),
    }

base_rouge = calculate_rouge_scores(base_predictions, references)
sft_rouge = calculate_rouge_scores(sft_predictions, references)
sft_dpo_rouge = calculate_rouge_scores(sft_dpo_predictions, references)

print("\n📊 ROUGE SCORES COMPARISON:")
print("-" * 80)
print(f"{'Metric':<15} {'Base':<15} {'SFT':<15} {'SFT+DPO':<15}")
print("-" * 80)
for metric in ['rouge1', 'rouge2', 'rougeL']:
    print(f"{metric:<15} {base_rouge[metric]:<15.4f} {sft_rouge[metric]:<15.4f} {sft_dpo_rouge[metric]:<15.4f}")
print("-" * 80)

print("\n📈 IMPROVEMENT OVER BASE:")
print("-" * 80)
print(f"{'Metric':<15} {'SFT Improvement':<20} {'SFT+DPO Improvement':<25}")
print("-" * 80)
for metric in ['rouge1', 'rouge2', 'rougeL']:
    sft_imp = ((sft_rouge[metric] - base_rouge[metric]) / base_rouge[metric]) * 100
    dpo_imp = ((sft_dpo_rouge[metric] - base_rouge[metric]) / base_rouge[metric]) * 100
    print(f"{metric:<15} {sft_imp:+.2f}%{'':<15} {dpo_imp:+.2f}%")
print("-" * 80)


CALCULATING ROUGE SCORES

📊 ROUGE SCORES COMPARISON:
--------------------------------------------------------------------------------
Metric          Base            SFT             SFT+DPO        
--------------------------------------------------------------------------------
rouge1          0.1023          0.2405          0.2571         
rouge2          0.0225          0.0846          0.0910         
rougeL          0.0633          0.1490          0.1580         
--------------------------------------------------------------------------------

📈 IMPROVEMENT OVER BASE:
--------------------------------------------------------------------------------
Metric          SFT Improvement      SFT+DPO Improvement      
--------------------------------------------------------------------------------
rouge1          +135.11%                +151.35%
rouge2          +276.12%                +304.42%
rougeL          +135.25%                +149.46%
-------------------------------------------------

In [25]:
print("\n" + "="*80)
print("CALCULATING BERTSCORE")
print("="*80)

from bert_score import score as bert_score

print("\nCalculating BERTScore for base model...")
P_base, R_base, F1_base = bert_score(
    base_predictions, references,
    lang="en", verbose=False,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

print("Calculating BERTScore for SFT model...")
P_sft, R_sft, F1_sft = bert_score(
    sft_predictions, references,
    lang="en", verbose=False,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

print("Calculating BERTScore for SFT+DPO model...")
P_sft_dpo, R_sft_dpo, F1_sft_dpo = bert_score(
    sft_dpo_predictions, references,
    lang="en", verbose=False,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

base_bertscore = {
    'precision': P_base.mean().item(),
    'recall': R_base.mean().item(),
    'f1': F1_base.mean().item(),
}

sft_bertscore = {
    'precision': P_sft.mean().item(),
    'recall': R_sft.mean().item(),
    'f1': F1_sft.mean().item(),
}

sft_dpo_bertscore = {
    'precision': P_sft_dpo.mean().item(),
    'recall': R_sft_dpo.mean().item(),
    'f1': F1_sft_dpo.mean().item(),
}

print("\n📊 BERTSCORE COMPARISON:")
print("-" * 80)
print(f"{'Metric':<15} {'Base':<15} {'SFT':<15} {'SFT+DPO':<15}")
print("-" * 80)
for metric in ['precision', 'recall', 'f1']:
    print(f"{metric:<15} {base_bertscore[metric]:<15.4f} {sft_bertscore[metric]:<15.4f} {sft_dpo_bertscore[metric]:<15.4f}")
print("-" * 80)

print("\n📈 IMPROVEMENT OVER BASE:")
print("-" * 80)
print(f"{'Metric':<15} {'SFT Improvement':<20} {'SFT+DPO Improvement':<25}")
print("-" * 80)
for metric in ['precision', 'recall', 'f1']:
    sft_imp = ((sft_bertscore[metric] - base_bertscore[metric]) / base_bertscore[metric]) * 100
    dpo_imp = ((sft_dpo_bertscore[metric] - base_bertscore[metric]) / base_bertscore[metric]) * 100
    print(f"{metric:<15} {sft_imp:+.2f}%{'':<15} {dpo_imp:+.2f}%")
print("-" * 80)


CALCULATING BERTSCORE

Calculating BERTScore for base model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating BERTScore for SFT model...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating BERTScore for SFT+DPO model...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 BERTSCORE COMPARISON:
--------------------------------------------------------------------------------
Metric          Base            SFT             SFT+DPO        
--------------------------------------------------------------------------------
precision       0.6507          0.8008          0.8025         
recall          0.6352          0.8045          0.8049         
f1              0.6423          0.8024          0.8035         
--------------------------------------------------------------------------------

📈 IMPROVEMENT OVER BASE:
--------------------------------------------------------------------------------
Metric          SFT Improvement      SFT+DPO Improvement      
--------------------------------------------------------------------------------
precision       +23.06%                +23.32%
recall          +26.64%                +26.71%
f1              +24.93%                +25.10%
--------------------------------------------------------------------------------


## 11. Save Results & Create Summary

In [26]:
import pandas as pd

# Create detailed results DataFrame
results_df = pd.DataFrame({
    'reference': references,
    'base_prediction': base_predictions,
    'sft_prediction': sft_predictions,
    'sft_dpo_prediction': sft_dpo_predictions,
})

results_df.to_csv('model_comparison_results.csv', index=False)
print("✅ Detailed results saved to 'model_comparison_results.csv'")

# Create summary DataFrame
summary_data = {
    'Metric': [
        'ROUGE-1', 'ROUGE-2', 'ROUGE-L',
        'BERTScore-Precision', 'BERTScore-Recall', 'BERTScore-F1'
    ],
    'Base Model': [
        base_rouge['rouge1'], base_rouge['rouge2'], base_rouge['rougeL'],
        base_bertscore['precision'], base_bertscore['recall'], base_bertscore['f1']
    ],
    'SFT Model': [
        sft_rouge['rouge1'], sft_rouge['rouge2'], sft_rouge['rougeL'],
        sft_bertscore['precision'], sft_bertscore['recall'], sft_bertscore['f1']
    ],
    'SFT+DPO Model': [
        sft_dpo_rouge['rouge1'], sft_dpo_rouge['rouge2'], sft_dpo_rouge['rougeL'],
        sft_dpo_bertscore['precision'], sft_dpo_bertscore['recall'], sft_dpo_bertscore['f1']
    ],
}

summary_df = pd.DataFrame(summary_data)
summary_df['SFT Improvement (%)'] = (
    (summary_df['SFT Model'] - summary_df['Base Model']) / summary_df['Base Model'] * 100
).round(2)
summary_df['SFT+DPO Improvement (%)'] = (
    (summary_df['SFT+DPO Model'] - summary_df['Base Model']) / summary_df['Base Model'] * 100
).round(2)

print("\n" + "="*100)
print("FINAL EVALUATION SUMMARY")
print("="*100)
print(summary_df.to_string(index=False))
print("="*100)

summary_df.to_csv('model_comparison_summary.csv', index=False)
print("\n✅ Summary saved to 'model_comparison_summary.csv'")

✅ Detailed results saved to 'model_comparison_results.csv'

FINAL EVALUATION SUMMARY
             Metric  Base Model  SFT Model  SFT+DPO Model  SFT Improvement (%)  SFT+DPO Improvement (%)
            ROUGE-1    0.102273   0.240453       0.257063               135.11                   151.35
            ROUGE-2    0.022505   0.084644       0.091014               276.12                   304.42
            ROUGE-L    0.063325   0.148968       0.157968               135.25                   149.46
BERTScore-Precision    0.650731   0.800776       0.802454                23.06                    23.32
   BERTScore-Recall    0.635228   0.804472       0.804909                26.64                    26.71
       BERTScore-F1    0.642252   0.802378       0.803486                24.93                    25.10

✅ Summary saved to 'model_comparison_summary.csv'


## 12. Sample Predictions Comparison

In [None]:
print("\n" + "="*100)
print("SAMPLE PREDICTIONS COMPARISON")
print("="*100)

num_samples_to_show = min(3, len(references))

for i in range(num_samples_to_show):
    print(f"\n{'='*100}")
    print(f"SAMPLE {i+1}")
    print("="*100)
    
    print(f"\n📝 REFERENCE (Human-written):")
    print(f"{references[i]}")
    
    print(f"\n🤖 BASE MODEL:")
    print(f"{base_predictions[i]}")
    
    print(f"\n🎯 SFT MODEL:")
    print(f"{sft_predictions[i]}")
    
    print(f"\n🏆 SFT+DPO MODEL:")
    print(f"{sft_dpo_predictions[i]}")
    print()

## 13. Optional: Push Models to Hugging Face Hub

In [None]:
# Uncomment and fill in your details to push models to HF Hub

# YOUR_HF_USERNAME = "your_username"
# YOUR_HF_TOKEN = "your_token"

# # Push SFT model
# sft_model.push_to_hub(
#     f"{YOUR_HF_USERNAME}/llama3-spotlight-sft",
#     token=YOUR_HF_TOKEN,
# )
# sft_tokenizer.push_to_hub(
#     f"{YOUR_HF_USERNAME}/llama3-spotlight-sft",
#     token=YOUR_HF_TOKEN,
# )

# # Push SFT+DPO model
# dpo_model.push_to_hub(
#     f"{YOUR_HF_USERNAME}/llama3-spotlight-sft-dpo",
#     token=YOUR_HF_TOKEN,
# )
# dpo_tokenizer.push_to_hub(
#     f"{YOUR_HF_USERNAME}/llama3-spotlight-sft-dpo",
#     token=YOUR_HF_TOKEN,
# )

# print("✅ Models pushed to Hugging Face Hub!")

## 14. Training Summary

In [None]:
print("\n" + "="*100)
print("COMPLETE PIPELINE SUMMARY")
print("="*100)

print("\n📊 DATASET SPLIT:")
print(f"  Total samples: {len(dataset)}")
print(f"  SFT training: {len(sft_dataset)} ({len(sft_dataset)/len(dataset)*100:.1f}%)")
print(f"  DPO training: {len(dpo_dataset)} ({len(dpo_dataset)/len(dataset)*100:.1f}%)")
print(f"  Test: {len(test_dataset)} ({len(test_dataset)/len(dataset)*100:.1f}%)")

print("\n⏱️ TRAINING TIME:")
print(f"  SFT: {sft_stats.metrics['train_runtime']/60:.2f} minutes")
print(f"  DPO: {dpo_stats.metrics['train_runtime']/60:.2f} minutes")
print(f"  Total: {(sft_stats.metrics['train_runtime'] + dpo_stats.metrics['train_runtime'])/60:.2f} minutes")

print("\n💾 SAVED MODELS:")
print("  1. llama3_spotlight_sft (SFT only)")
print("  2. llama3_spotlight_sft_dpo (SFT + DPO)")

print("\n📈 BEST PERFORMING MODEL:")
# Determine best model based on ROUGE-L F1
scores = {
    'Base': base_rouge['rougeL'],
    'SFT': sft_rouge['rougeL'],
    'SFT+DPO': sft_dpo_rouge['rougeL']
}
best_model = max(scores, key=scores.get)
print(f"  🏆 {best_model} (ROUGE-L: {scores[best_model]:.4f})")

print("\n✅ PIPELINE COMPLETED SUCCESSFULLY!")
print("="*100)