In [None]:
import re
import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [None]:
import unsloth
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/zephyr-sft-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.9.4: Fast Mistral patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
import os
import re
from typing import List, Literal, Optional

from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError


DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"] = "sft",
    assistant_prefix="<|assistant|>\n",
):
    def _strip_prefix(s, pattern):
        # Use re.escape to escape any special characters in the pattern
        return re.sub(f"^{re.escape(pattern)}", "", s)

    if task in ["sft", "generation"]:
        messages = example["messages"]
        # We add an empty system message if there is none
        if messages[0]["role"] != "system":
            messages.insert(0, {"role": "system", "content": ""})
        example["text"] = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True if task == "generation" else False,
        )
    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
            # We add an empty system message if there is none
            if chosen_messages[0]["role"] != "system":
                chosen_messages.insert(0, {"role": "system", "content": ""})
            if rejected_messages[0]["role"] != "system":
                rejected_messages.insert(0, {"role": "system", "content": ""})
            example["text_chosen"] = tokenizer.apply_chat_template(
                chosen_messages, tokenize=False
            )
            example["text_rejected"] = tokenizer.apply_chat_template(
                rejected_messages, tokenize=False
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            # Compared to reward modeling, we filter out the prompt, so the text is everything after the last assistant token
            prompt_messages = [
                [msg for msg in example["chosen"] if msg["role"] == "user"][0]
            ]
            # Insert system message
            if example["chosen"][0]["role"] != "system":
                prompt_messages.insert(0, {"role": "system", "content": ""})
            else:
                prompt_messages.insert(0, example["chosen"][0])
            # TODO: handle case where chosen/rejected also have system messages
            chosen_messages = example["chosen"][1:]
            rejected_messages = example["rejected"][1:]
            example["text_chosen"] = tokenizer.apply_chat_template(
                chosen_messages, tokenize=False
            )
            example["text_rejected"] = tokenizer.apply_chat_template(
                rejected_messages, tokenize=False
            )
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
            example["text_chosen"] = _strip_prefix(
                example["text_chosen"], assistant_prefix
            )
            example["text_rejected"] = _strip_prefix(
                example["text_rejected"], assistant_prefix
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )
    return example


def get_datasets(
    data_config: dict,
    splits: List[str] = ["train", "test"],
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
    """

    if type(data_config) is dict:
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset1": 0.3,
        #             "dataset1": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
    return raw_datasets


def mix_datasets(
    dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True
) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    fracs = []
    for ds, frac in dataset_mixer.items():
        fracs.append(frac)
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(
                    f"Split type {split} not recognized as one of test or train."
                )

    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    if len(raw_train_datasets) > 0:
        train_subsets = []
        for dataset, frac in zip(raw_train_datasets, fracs):
            train_subset = dataset.select(range(int(frac * len(dataset))))
            train_subsets.append(train_subset)
        if shuffle:
            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
        else:
            raw_datasets["train"] = concatenate_datasets(train_subsets)
    # No subsampling for test datasets to enable fair comparison across models
    if len(raw_val_datasets) > 0:
        if shuffle:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(
                seed=42
            )
        else:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)

    if len(raw_datasets) == 0:
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {split}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

In [None]:
raw_datasets = get_datasets(
    {"HuggingFaceH4/ultrafeedback_binarized" : 0.005}, # 0.5% sampled
    splits = ["train_prefs", "test_prefs"],
)
column_names = list(raw_datasets["train"].features)

raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "dpo"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

# Replace column names with what TRL needs, text_chosen -> chosen and text_rejected -> rejected
for split in ["train", "test"]:
    raw_datasets[split] = raw_datasets[split].rename_columns(
        {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
    )

README.md: 0.00B [00:00, ?B/s]

data/train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

data/train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

data/test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/305 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.9.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
model.print_trainable_parameters()

trainable params: 167,772,160 || all params: 7,409,504,256 || trainable%: 2.2643


In [None]:
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

In [None]:
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 3,
        learning_rate = 5e-6,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
        report_to = "none",
    ),
    beta = 0.1,
    train_dataset = raw_datasets["train"],
    # eval_dataset = raw_datasets["test"],
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt in train dataset (num_proc=6):   0%|          | 0/305 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=6):   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=6):   0%|          | 0/305 [00:00<?, ? examples/s]

In [9]:
dpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 305 | Num Epochs = 3 | Total steps = 117
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 167,772,160 of 7,409,504,256 (2.26% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6931,0.0,0.0,0.0,0.0,-181.23053,-195.662674,-2.371401,-2.58547,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-282.245056,-377.400726,-2.836131,-2.922761,No Log,No Log,No Log
3,0.6931,0.0,0.0,0.0,0.0,-319.422821,-232.848312,-2.783862,-2.940019,No Log,No Log,No Log
4,0.6932,-0.000198,-0.000112,0.5,-8.5e-05,-228.858322,-290.574341,-2.777873,-2.856565,No Log,No Log,No Log
5,0.6946,-0.004169,-0.001329,0.25,-0.002841,-232.954086,-239.218445,-2.917544,-2.643948,No Log,No Log,No Log
6,0.6908,0.003763,-0.000911,0.625,0.004674,-418.959595,-378.518127,-2.078197,-2.589293,No Log,No Log,No Log
7,0.6931,-0.003744,-0.003945,0.375,0.0002,-543.7146,-441.002197,-2.996009,-2.95872,No Log,No Log,No Log
8,0.6858,-0.001403,-0.016435,0.75,0.015032,-300.7789,-270.482483,-2.792776,-3.01326,No Log,No Log,No Log
9,0.673,-0.000694,-0.041845,0.75,0.041151,-169.909668,-327.924744,-3.004963,-3.031399,No Log,No Log,No Log
10,0.6797,-0.030462,-0.057705,0.875,0.027243,-510.957825,-457.702087,-2.635879,-2.828956,No Log,No Log,No Log


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6931,0.0,0.0,0.0,0.0,-181.23053,-195.662674,-2.371401,-2.58547,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-282.245056,-377.400726,-2.836131,-2.922761,No Log,No Log,No Log
3,0.6931,0.0,0.0,0.0,0.0,-319.422821,-232.848312,-2.783862,-2.940019,No Log,No Log,No Log
4,0.6932,-0.000198,-0.000112,0.5,-8.5e-05,-228.858322,-290.574341,-2.777873,-2.856565,No Log,No Log,No Log
5,0.6946,-0.004169,-0.001329,0.25,-0.002841,-232.954086,-239.218445,-2.917544,-2.643948,No Log,No Log,No Log
6,0.6908,0.003763,-0.000911,0.625,0.004674,-418.959595,-378.518127,-2.078197,-2.589293,No Log,No Log,No Log
7,0.6931,-0.003744,-0.003945,0.375,0.0002,-543.7146,-441.002197,-2.996009,-2.95872,No Log,No Log,No Log
8,0.6858,-0.001403,-0.016435,0.75,0.015032,-300.7789,-270.482483,-2.792776,-3.01326,No Log,No Log,No Log
9,0.673,-0.000694,-0.041845,0.75,0.041151,-169.909668,-327.924744,-3.004963,-3.031399,No Log,No Log,No Log
10,0.6797,-0.030462,-0.057705,0.875,0.027243,-510.957825,-457.702087,-2.635879,-2.828956,No Log,No Log,No Log


TrainOutput(global_step=117, training_loss=0.24932863431063312, metrics={'train_runtime': 4271.1922, 'train_samples_per_second': 0.214, 'train_steps_per_second': 0.027, 'total_flos': 0.0, 'train_loss': 0.24932863431063312, 'epoch': 3.0})

In [10]:
model.save_pretrained("dpo_zephyr_model")
tokenizer.save_pretrained("dpo_zephyr_model")

('dpo_zephyr_model/tokenizer_config.json',
 'dpo_zephyr_model/special_tokens_map.json',
 'dpo_zephyr_model/chat_template.jinja',
 'dpo_zephyr_model/tokenizer.model',
 'dpo_zephyr_model/added_tokens.json',
 'dpo_zephyr_model/tokenizer.json')

In [12]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

In [14]:
prompt = """<|user|>
Explain quantum computing in simple terms.
<|assistant|>
"""

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("🤖 DPO Trained Response:")
print(response[len(prompt):])

🤖 DPO Trained Response:
Quantum computing is a new type of computing that uses the principles of quantum mechanics to perform calculations. Unlike traditional computers that use bits (0s and 1s) to process information, quantum computers use quantum bits, or qubits, which can exist in a superposition of 0 and 1 at the same time.

This unique property of qubits allows quantum computers to perform certain calculations much faster than classical computers. For example, they can efficiently factor large numbers, which is a crucial step in many encryption algorithms.

In simple terms, a quantum computer works by manipulating qubits using quantum gates, similar to how classical computers manipulate bits using logic gates. However, quantum gates are much more complex due to the superposition property of qubits.

Quantum computing is still in its early stages of development, and current quantum computers can only perform limited operations due to the challenges of controlling and measuring qubi

In [15]:
# Test different types of prompts
test_prompts = [
    "<|user|>\nWhat's the difference between AI and machine learning?\n<|assistant|>\n",
    "<|user|>\nExplain photosynthesis briefly.\n<|assistant|>\n",
    "<|user|>\nHow do I make a good cup of coffee?\n<|assistant|>\n"
]

for i, prompt in enumerate(test_prompts, 1):
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    print(f"\n🧪 Test {i}:")
    print(f"Q: {prompt.split('<|user|>')[1].split('<|assistant|>')[0].strip()}")
    print(f"A: {response[len(prompt):]}")
    print("-" * 50)


🧪 Test 1:
Q: What's the difference between AI and machine learning?
A: Artificial Intelligence (AI) and Machine Learning (ML) are related but distinct fields in computer science. Here's how they differ:

1. AI is a broader concept, while ML is a subfield of AI: AI refers to the ability of a machine to perform tasks that typically require human intelligence, such as visual perception, decision-making, and language translation. Machine Learning is a subfield of AI that focuses on teaching computers to learn and improve from experience without being explicitly programmed.

2. AI encompasses both ML and non-ML techniques: While ML is a critical component of AI, it is not the only way to achieve AI. Some AI applications do not require learning algorithms, such as those that
--------------------------------------------------

🧪 Test 2:
Q: Explain photosynthesis briefly.
A: Photosynthesis is a biological process by which plant cells, as well as some other organisms like algae and bacteria, c