### Installation

In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth  # Do this in local & cloud setups
else:
    import torch; v = re.match(r'[\d]{1,}\.[\d]{1,}', str(torch.__version__)).group(0)
    xformers = 'xformers==' + {'2.10':'0.0.34','2.9':'0.0.33.post1','2.8':'0.0.32.post2'}.get(v, "0.0.34")
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [2]:
print("hi")

hi


### Unsloth

In [2]:
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3-8B", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "YOUR_HF_TOKEN", # HF Token for gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100 80GB PCIe. Num GPUs = 1. Max memory: 79.252 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
# @title Alignment Handbook utils
import os
import re
from typing import List, Literal, Optional

from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError


DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"] = "sft",
    assistant_prefix = "<|assistant|>\n",
):
    def _strip_prefix(s, pattern):
        # Use re.escape to escape any special characters in the pattern
        return re.sub(f"^{re.escape(pattern)}", "", s)

    if task in ["sft", "generation"]:
        messages = example["messages"]
        # We add an empty system message if there is none
        if messages[0]["role"] != "system":
            messages.insert(0, {"role": "system", "content": ""})
        example["text"] = tokenizer.apply_chat_template(
            messages,
            tokenize = False,
            add_generation_prompt = True if task == "generation" else False,
        )
    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
            # We add an empty system message if there is none
            if chosen_messages[0]["role"] != "system":
                chosen_messages.insert(0, {"role": "system", "content": ""})
            if rejected_messages[0]["role"] != "system":
                rejected_messages.insert(0, {"role": "system", "content": ""})
            example["text_chosen"] = tokenizer.apply_chat_template(
                chosen_messages, tokenize = False
            )
            example["text_rejected"] = tokenizer.apply_chat_template(
                rejected_messages, tokenize = False
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            # Compared to reward modeling, we filter out the prompt, so the text is everything after the last assistant token
            prompt_messages = [
                [msg for msg in example["chosen"] if msg["role"] == "user"][0]
            ]
            # Insert system message
            if example["chosen"][0]["role"] != "system":
                prompt_messages.insert(0, {"role": "system", "content": ""})
            else:
                prompt_messages.insert(0, example["chosen"][0])
            # TODO: handle case where chosen/rejected also have system messages
            chosen_messages = example["chosen"][1:]
            rejected_messages = example["rejected"][1:]
            example["text_chosen"] = tokenizer.apply_chat_template(
                chosen_messages, tokenize = False
            )
            example["text_rejected"] = tokenizer.apply_chat_template(
                rejected_messages, tokenize = False
            )
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize = False, add_generation_prompt = True
            )
            example["text_chosen"] = _strip_prefix(
                example["text_chosen"], assistant_prefix
            )
            example["text_rejected"] = _strip_prefix(
                example["text_rejected"], assistant_prefix
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )
    return example


def get_datasets(
    data_config: dict,
    splits: List[str] = ["train", "test"],
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
    """

    if type(data_config) is dict:
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset1": 0.3,
        #             "dataset1": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    raw_datasets = mix_datasets(dataset_mixer, splits = splits, shuffle = shuffle)
    return raw_datasets


def mix_datasets(
    dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle = True
) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    fracs = []
    for ds, frac in dataset_mixer.items():
        fracs.append(frac)
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split = split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(
                    f"Split type {split} not recognized as one of test or train."
                )

    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    if len(raw_train_datasets) > 0:
        train_subsets = []
        for dataset, frac in zip(raw_train_datasets, fracs):
            train_subset = dataset.select(range(int(frac * len(dataset))))
            train_subsets.append(train_subset)
        if shuffle:
            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed = 42)
        else:
            raw_datasets["train"] = concatenate_datasets(train_subsets)
    # No subsampling for test datasets to enable fair comparison across models
    if len(raw_val_datasets) > 0:
        if shuffle:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(
                seed = 42
            )
        else:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)

    if len(raw_datasets) == 0:
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {split}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

<a name="Data"></a>
### Data Prep
We follow Hugging Face's [Alignment Handbook](https://github.com/huggingface/alignment-handbook) for [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) and use the [Ultra Feedback dataset](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized), and sample 0.5% of it to speed things up. You can sample the full dataset for a full run.

In [4]:
import pandas as pd

df = pd.read_csv("SCITLDR.csv")

# Count nulls
print("Null counts:")
print(df[["text", "summary", "generated_summary"]].isnull().sum())

# Count empty or whitespace-only strings
def is_blank(x):
    return not isinstance(x, str) or x.strip() == ""

print("\nBlank (null or empty) counts:")
for col in ["text", "summary", "generated_summary"]:
    print(col, df[col].apply(is_blank).sum())


Null counts:
text                  0
summary               0
generated_summary    39
dtype: int64

Blank (null or empty) counts:
text 0
summary 0
generated_summary 39


In [5]:
df_clean = df[
    df["text"].apply(lambda x: isinstance(x, str) and x.strip() != "") &
    df["summary"].apply(lambda x: isinstance(x, str) and x.strip() != "") &
    df["generated_summary"].apply(lambda x: isinstance(x, str) and x.strip() != "")
].reset_index(drop=True)

print("Rows before cleaning:", len(df))
print("Rows after cleaning:", len(df_clean))
print("Dropped rows:", len(df) - len(df_clean))


Rows before cleaning: 973
Rows after cleaning: 934
Dropped rows: 39


In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_clean)

dataset = dataset.train_test_split(test_size=0.05, seed=42)


In [7]:


def prepare_dpo_format(examples):
    chosen_messages = []
    rejected_messages = []

    for text, summary, generated_summary in zip(
        examples["text"],
        examples["summary"],
        examples["generated_summary"]
    ):
        generated_summary_cleaned = generated_summary.replace("[SUMMARY]", "").strip()

        user_prompt = (
            "You are an engaging writer.\n\n"
            "A spotlight is a short narrative teaser written as a single paragraph. "
            "It highlights ONE intriguing angle and sparks curiosity without summarizing.\n\n"
            "Write a spotlight ( 1-2 sentences).\n\n"
            f"### Document:\n{text}"
        )

        chosen_messages.append([
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": summary.strip()},
        ])

        rejected_messages.append([
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": generated_summary_cleaned},
        ])

    return {
        "chosen": chosen_messages,
        "rejected": rejected_messages,
    }


# Apply the preparation
raw_datasets = dataset.map(
    prepare_dpo_format,
    batched=True,
    num_proc=12,
    remove_columns=["text", "summary", "generated_summary"],
    desc="Preparing data for DPO format",
)

# Set Llama 3 chat template if not already set
if tokenizer.chat_template is None:
    tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

# Apply chat template
column_names = list(raw_datasets["train"].features)
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
    num_proc=12,
    remove_columns=column_names,
    desc="Formatting comparisons with prompt template",
)

# Rename columns to what TRL expects
for split in ["train", "test"]:
    raw_datasets[split] = raw_datasets[split].rename_columns(
        {
            "text_prompt": "prompt",
            "text_chosen": "chosen",
            "text_rejected": "rejected",
        }
    )

Preparing data for DPO format (num_proc=12):   0%|          | 0/887 [00:00<?, ? examples/s]

Preparing data for DPO format (num_proc=12):   0%|          | 0/47 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/887 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/47 [00:00<?, ? examples/s]

We shall print a random item from the dataset

In [8]:
import pprint

row = raw_datasets["train"][8]
pprint.pprint(row["prompt"])
pprint.pprint(row["chosen"])
pprint.pprint(row["rejected"])

('<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n'
 '\n'
 '<|eot_id|><|start_header_id|>user<|end_header_id|>\n'
 '\n'
 'You are an engaging writer.\n'
 '\n'
 'A spotlight is a short narrative teaser written as a single paragraph. It '
 'highlights ONE intriguing angle and sparks curiosity without summarizing.\n'
 '\n'
 'Write a spotlight ( 1-2 sentences).\n'
 '\n'
 '### Document:\n'
 'This paper focuses on the synthetic generation of human mobility data in '
 'urban areas. We present a novel and scalable application of Generative '
 'Adversarial Networks (GANs) for modeling and generating human mobility data. '
 'We leverage actual ride requests from ride sharing/hailing services from '
 'four major cities in the US to train our GANs model. Our model captures the '
 'spatial and temporal variability of the ride-request patterns observed for '
 'all four cities on any typical day and over any typical week. Previous works '
 'have succinctly characterized the spatial and t

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Train"></a>
### Train the DPO model
Now let's train our model. We do 3 epochs on 0.5% of the dataset to speed things up.

In [10]:
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

In [11]:
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 2,
        learning_rate = 5e-6,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
    beta = 0.1,
    train_dataset = raw_datasets["train"],
    # eval_dataset = raw_datasets["test"],
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt in train dataset (num_proc=47):   0%|          | 0/887 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=47):   0%|          | 0/887 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=47):   0%|          | 0/887 [00:00<?, ? examples/s]



In [12]:
dpo_trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 887 | Num Epochs = 2 | Total steps = 112
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6931,0.0,0.0,0.0,0.0,-159.232971,-364.198975,-1.270524,-1.297501,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-138.449753,-379.281158,-1.308092,-1.277941,No Log,No Log,No Log
3,0.6915,0.005856,0.002223,0.6875,0.003633,-145.173264,-288.661377,-1.291631,-1.24763,No Log,No Log,No Log
4,0.6963,-0.004272,0.001386,0.375,-0.005658,-154.167648,-450.785645,-1.36173,-1.288816,No Log,No Log,No Log
5,0.6785,0.009127,-0.02108,0.6875,0.030207,-150.53714,-290.371887,-1.312384,-1.057185,No Log,No Log,No Log
6,0.6863,0.013015,-0.000916,0.5625,0.013931,-143.910324,-324.714294,-1.226626,-1.228027,No Log,No Log,No Log
7,0.6963,0.005242,0.010958,0.5625,-0.005716,-147.54364,-327.657501,-1.325947,-1.265301,No Log,No Log,No Log
8,0.6962,-0.00112,0.00462,0.375,-0.00574,-158.785034,-318.522552,-1.318402,-1.22786,No Log,No Log,No Log
9,0.6938,-0.008678,-0.008133,0.375,-0.000546,-155.159943,-414.361328,-1.346766,-1.34006,No Log,No Log,No Log
10,0.6909,-0.001413,-0.006243,0.4375,0.00483,-146.806824,-361.603943,-1.289413,-1.26318,No Log,No Log,No Log


TrainOutput(global_step=112, training_loss=0.451698374136218, metrics={'train_runtime': 988.4785, 'train_samples_per_second': 1.795, 'train_steps_per_second': 0.113, 'total_flos': 0.0, 'train_loss': 0.451698374136218, 'epoch': 2.0})

In [13]:
%%capture
!pip install rouge-score bert-score

In [14]:
print("="*80)
print("LOADING TEST DATASET")
print("="*80)

# The test dataset is already in raw_datasets from the data prep step
test_dataset = raw_datasets["test"]

print(f"Test dataset size: {len(test_dataset)}")
print(f"Sample from test set:")
print(test_dataset[0])


LOADING TEST DATASET
Test dataset size: 47
Sample from test set:
{'chosen': '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|>\n\nSide-tuning adapts a pre-trained network by training a lightweight "side" network that is fused with the (unchanged) pre-trained network using a simple additive process.<|eot_id|>', 'rejected': '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|>\n\n1/4, 1/2, and the full parameters of a ResNet-50 for both the base and side networks. We find that the best performance is had when the side network is large and the base is small. This is because the base is only used to provide a coarse estimate, and a large base network will produce a coarse estimate that requires a larger side network to overcome. The optimal α. In the supplementary material we show that the optimal α is not always 0.5. For instance, when the base is a denoising network and the target is curvature, α = 0.6 performs better than 0.5. This is because the optimal α is task

In [15]:
print("\n" + "="*80)
print("LOADING BASE MODEL FOR COMPARISON")
print("="*80)

from unsloth import FastLanguageModel
import torch

# Load the base model (without LoRA adapters)
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Set chat template for base model
if base_tokenizer.chat_template is None:
    base_tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

# Enable inference mode
FastLanguageModel.for_inference(base_model)

print("Base model loaded successfully!")


LOADING BASE MODEL FOR COMPARISON
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100 80GB PCIe. Num GPUs = 1. Max memory: 79.252 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model loaded successfully!


In [16]:

print("\n" + "="*80)
print("PREPARING FINE-TUNED MODEL FOR INFERENCE")
print("="*80)

# The model variable already contains our fine-tuned model
# Just enable inference mode
FastLanguageModel.for_inference(model)

print("Fine-tuned model ready for inference!")


PREPARING FINE-TUNED MODEL FOR INFERENCE
Fine-tuned model ready for inference!


In [17]:
from tqdm import tqdm

def generate_spotlight(model, tokenizer, prompt_text, max_new_tokens=256):
    """Generate spotlight given the formatted prompt"""
    
    # Tokenize the prompt
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode only the generated part (skip the prompt)
    generated_text = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:], 
        skip_special_tokens=True
    )
    
    return generated_text.strip()


In [18]:
print("\n" + "="*80)
print("RUNNING INFERENCE ON TEST SET")
print("="*80)


num_samples = len(test_dataset)
print(f"\nEvaluating on {num_samples} samples from test set...")

base_predictions = []
ft_predictions = []
references = []

for i in tqdm(range(num_samples), desc="Generating predictions"):
    example = test_dataset[i]
    
    # Get the prompt and reference
    prompt_text = example["prompt"]
    ref_text = example["chosen"]
    
    # Generate with base model
    base_pred = generate_spotlight(base_model, base_tokenizer, prompt_text)
    
    # Generate with fine-tuned model
    ft_pred = generate_spotlight(model, tokenizer, prompt_text)
    
    # Store results
    references.append(ref_text)
    base_predictions.append(base_pred)
    ft_predictions.append(ft_pred)

print(f"\nGenerated {len(base_predictions)} predictions for each model!")



RUNNING INFERENCE ON TEST SET

Evaluating on 47 samples from test set...


Generating predictions:   0%|                                                           | 0/47 [00:00<?, ?it/s]Generating predictions:   2%|█                                                  | 1/47 [00:18<13:54, 18.14s/it]Generating predictions:   4%|██▏                                                | 2/47 [00:35<13:05, 17.46s/it]Generating predictions:   6%|███▎                                               | 3/47 [00:52<12:38, 17.25s/it]Generating predictions:   9%|████▎                                              | 4/47 [01:09<12:23, 17.28s/it]Generating predictions:  11%|█████▍                                             | 5/47 [01:26<12:01, 17.18s/it]Generating predictions:  13%|██████▌                                            | 6/47 [01:43<11:42, 17.14s/it]Generating predictions:  15%|███████▌                                           | 7/47 [01:55<10:18, 15.47s/it]Generating predictions:  17%|████████▋                                          | 8/47 [02:12<10:27, 16


Generated 47 predictions for each model!





In [19]:
print("\n" + "="*80)
print("CALCULATING ROUGE SCORES")
print("="*80)

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def calculate_rouge_scores(predictions, references):
    """Calculate average ROUGE scores"""
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    
    return {
        'rouge1': sum(rouge1_scores) / len(rouge1_scores),
        'rouge2': sum(rouge2_scores) / len(rouge2_scores),
        'rougeL': sum(rougeL_scores) / len(rougeL_scores),
    }

base_rouge = calculate_rouge_scores(base_predictions, references)
ft_rouge = calculate_rouge_scores(ft_predictions, references)

print("\n📊 ROUGE SCORES:")
print("-" * 60)
print(f"{'Metric':<15} {'Base Model':<15} {'Fine-tuned':<15} {'Improvement':<15}")
print("-" * 60)
for metric in ['rouge1', 'rouge2', 'rougeL']:
    base_score = base_rouge[metric]
    ft_score = ft_rouge[metric]
    improvement = ((ft_score - base_score) / base_score) * 100
    print(f"{metric:<15} {base_score:<15.4f} {ft_score:<15.4f} {improvement:+.2f}%")
print("-" * 60)



CALCULATING ROUGE SCORES

📊 ROUGE SCORES:
------------------------------------------------------------
Metric          Base Model      Fine-tuned      Improvement    
------------------------------------------------------------
rouge1          0.1142          0.1167          +2.20%
rouge2          0.0300          0.0285          -4.85%
rougeL          0.0828          0.0844          +1.95%
------------------------------------------------------------


In [20]:
print("\n" + "="*80)
print("CALCULATING BERTSCORE")
print("="*80)

from bert_score import score as bert_score

print("\nCalculating BERTScore for base model...")
P_base, R_base, F1_base = bert_score(
    base_predictions,
    references,
    lang="en",
    verbose=False,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

print("Calculating BERTScore for fine-tuned model...")
P_ft, R_ft, F1_ft = bert_score(
    ft_predictions,
    references,
    lang="en",
    verbose=False,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

base_bertscore = {
    'precision': P_base.mean().item(),
    'recall': R_base.mean().item(),
    'f1': F1_base.mean().item(),
}

ft_bertscore = {
    'precision': P_ft.mean().item(),
    'recall': R_ft.mean().item(),
    'f1': F1_ft.mean().item(),
}

print("\n📊 BERTSCORE:")
print("-" * 60)
print(f"{'Metric':<15} {'Base Model':<15} {'Fine-tuned':<15} {'Improvement':<15}")
print("-" * 60)
for metric in ['precision', 'recall', 'f1']:
    base_score = base_bertscore[metric]
    ft_score = ft_bertscore[metric]
    improvement = ((ft_score - base_score) / base_score) * 100
    print(f"{metric:<15} {base_score:<15.4f} {ft_score:<15.4f} {improvement:+.2f}%")
print("-" * 60)


CALCULATING BERTSCORE

Calculating BERTScore for base model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating BERTScore for fine-tuned model...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 BERTSCORE:
------------------------------------------------------------
Metric          Base Model      Fine-tuned      Improvement    
------------------------------------------------------------
precision       0.7879          0.7910          +0.39%
recall          0.7759          0.7757          -0.03%
f1              0.7816          0.7830          +0.18%
------------------------------------------------------------


In [21]:

import pandas as pd

# Create results dataframe
results_df = pd.DataFrame({
    'reference': references,
    'base_prediction': base_predictions,
    'ft_prediction': ft_predictions,
})

# Save to CSV
results_df.to_csv('evaluation_results.csv', index=False)
print("\n Results saved to 'evaluation_results.csv'")

# Create summary
summary_data = {
    'Metric': ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore-P', 'BERTScore-R', 'BERTScore-F1'],
    'Base Model': [
        base_rouge['rouge1'],
        base_rouge['rouge2'],
        base_rouge['rougeL'],
        base_bertscore['precision'],
        base_bertscore['recall'],
        base_bertscore['f1'],
    ],
    'Fine-tuned Model': [
        ft_rouge['rouge1'],
        ft_rouge['rouge2'],
        ft_rouge['rougeL'],
        ft_bertscore['precision'],
        ft_bertscore['recall'],
        ft_bertscore['f1'],
    ],
}

summary_df = pd.DataFrame(summary_data)
summary_df['Improvement (%)'] = ((summary_df['Fine-tuned Model'] - summary_df['Base Model']) / summary_df['Base Model'] * 100).round(2)

print("\n📊 EVALUATION SUMMARY:")
print(summary_df.to_string(index=False))

summary_df.to_csv('evaluation_summary.csv', index=False)
print("\n✅ Summary saved to 'evaluation_summary.csv'")



 Results saved to 'evaluation_results.csv'

📊 EVALUATION SUMMARY:
      Metric  Base Model  Fine-tuned Model  Improvement (%)
     ROUGE-1    0.114179          0.116696             2.20
     ROUGE-2    0.029973          0.028521            -4.85
     ROUGE-L    0.082760          0.084376             1.95
 BERTScore-P    0.787899          0.790958             0.39
 BERTScore-R    0.775877          0.775679            -0.03
BERTScore-F1    0.781605          0.783043             0.18

✅ Summary saved to 'evaluation_summary.csv'


In [22]:
print("\n" + "="*80)
print("SAVING FINE-TUNED MODEL")
print("="*80)

# Save the fine-tuned model in 16-bit format
model.save_pretrained("llama3_spotlight_dpo_lora")
tokenizer.save_pretrained("llama3_spotlight_dpo_lora")

print("✅ Model saved to 'llama3_spotlight_dpo_lora' directory")


SAVING FINE-TUNED MODEL
✅ Model saved to 'llama3_spotlight_dpo_lora' directory


In [None]:
# Push LoRA adapters (lightweight, recommended)
model.push_to_hub(
    "Abhishekkk3/llama3-spotlight-dpo-lora_SCITLD_test2",  # Change to your username
    token="",  # Or leave blank if already logged in
)

tokenizer.push_to_hub(
    "Abhishekkk3/llama3-spotlight-dpo_lora_SCITLD_test2",
    token="",
)

print("✅ LoRA adapters pushed to Hugging Face Hub!")
print("📦 Model: YOUR_USERNAME/llama3-spotlight-dpo-lora_test2")