In [None]:
!pip install datasets transformers bitsandbytes peft accelerate trl flash-attn --no-build-isolation

In [None]:
import os, sys
import re
import logging
from copy import deepcopy
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from dataclasses import dataclass
from huggingface_hub import list_repo_files
import datasets, transformers
from accelerate import Accelerator
from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from peft import LoraConfig, PeftConfig, PeftModel
from trl import SFTTrainer, DPOTrainer
from transformers import (
    AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer,
    AutoModelForCausalLM, TrainingArguments
)

# Shared Accelerator instance; later cells use it for main-process checks and barriers.
accelerator = Accelerator()
# Root logger (getLogger is called without a name).
logger = logging.getLogger()

[Zephyr Training Repo](https://github.com/huggingface/alignment-handbook/tree/main)

# 1) Supervised Training

## 1.1 Dataset Loading

In [None]:
def mix_datasets(d_config, splits, shuffle):
    """Load several datasets and mix them into a single ``DatasetDict``.

    Args:
        d_config: Mapping of dataset identifier to the fraction of each of its
            *train* splits to keep, e.g. ``{"org/name": 0.1}``. The identifier
            is first passed to ``load_dataset``; if that fails it is treated as
            a directory containing one ``save_to_disk`` folder per split.
        splits: Split names to load for every dataset. A name containing
            ``"train"`` goes into the training mix, one containing ``"test"``
            into the evaluation mix.
        shuffle: When true, each mixed split is shuffled with a fixed seed (42).

    Returns:
        A ``DatasetDict`` holding a ``"train"`` and/or ``"test"`` entry,
        depending on which kinds of split were requested. Test splits are
        never subsampled.

    Raises:
        ValueError: If a split name contains neither ``"train"`` nor ``"test"``.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    # One fraction per *loaded train split* (not per dataset), so the zip below
    # stays aligned even when a dataset contributes several train splits.
    train_fracs = []

    for ds, frac in d_config.items():
        for split in splits:
            try:
                dataset = load_dataset(ds, split=split)
            except Exception:
                # Not loadable by name: fall back to a dataset saved on disk.
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
                train_fracs.append(frac)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if len(raw_train_datasets) > 0:
        # Keep the leading `frac` portion of every train split.
        train_subsets = [
            dataset.select(range(int(frac * len(dataset))))
            for dataset, frac in zip(raw_train_datasets, train_fracs)
        ]
        train_mix = concatenate_datasets(train_subsets)
        raw_datasets["train"] = train_mix.shuffle(seed=42) if shuffle else train_mix

    if len(raw_val_datasets) > 0:
        test_mix = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = test_mix.shuffle(seed=42) if shuffle else test_mix

    return raw_datasets

In [None]:
# Keep the first 10% of the UltraChat training split (mix_datasets never subsamples test splits).
dataset_config = {
    "HuggingFaceH4/ultrachat_200k": 0.1
}
raw_datasets = mix_datasets(dataset_config, splits=['train_sft', 'test_sft'], shuffle=False)
print(raw_datasets)

# Print every turn of the first evaluation conversation.
for msg in raw_datasets['test'][0]['messages']:
    print("\n")
    print(msg)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 20786
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 23110
    })
})


{'content': 'How does the author propose to fix the problem of science alienation in our educational system? What changes does she suggest should be made to science education? Answer according to: Science education should be split into two tracks.\nSplit K-12 science education into two tracks, for majors and nonmajors.\nThose who want to specialize in science could take math and complex chemistry. Nonmajors would focus on science of the everyday—things like kitchen chemistry and CSI-style crime investigations.\nSome years ago, when I was working as a newspaper science writer in California, I fell into a rather idle conversation with a physicist on the subject of science education. Idle for him, at least, because what he said—the way he defined the American s

## 1.2 Tokenization

In [None]:
# Jinja chat template: every user/system/assistant message is rendered as a `<|role|>` header
# line, the message content and the EOS token; when `add_generation_prompt` is set, an
# `<|assistant|>` header is emitted after the last message.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

def get_tokenizer(model_name_or_path, truncation_side=None):
    """Load the tokenizer for a checkpoint and prepare it for chat formatting.

    Args:
        model_name_or_path: Identifier passed to ``AutoTokenizer.from_pretrained``
            (the ``main`` revision is always requested).
        truncation_side: Optional truncation side to set on the tokenizer;
            left unchanged when falsy.

    Returns:
        The tokenizer with a pad token id, a capped ``model_max_length`` and
        ``DEFAULT_CHAT_TEMPLATE`` installed as its chat template.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path, revision='main')

    # Without a pad token, reuse the end-of-sequence token for padding.
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

    # Only override the truncation side when the caller asked for one.
    if truncation_side:
        tok.truncation_side = truncation_side

    # Values above 100k are replaced with a usable default of 2048.
    max_len_is_unset = tok.model_max_length > 100_000
    if max_len_is_unset:
        tok.model_max_length = 2048

    # The notebook's template always replaces whatever the checkpoint shipped with.
    tok.chat_template = DEFAULT_CHAT_TEMPLATE
    return tok

# Tokenizer of the base model that is fine-tuned in this section.
tokenizer = get_tokenizer('mistralai/Mistral-7B-v0.1')

In [None]:
def apply_chat_template(example, tokenizer, task="sft", assistant_prefix="<|assistant|>\n"):
    """Render one dataset row into plain-text fields with the tokenizer's chat template.

    Args:
        example: Row dictionary. ``"sft"``/``"generation"`` read ``example["messages"]``;
            ``"rm"``/``"dpo"`` read ``example["chosen"]`` and ``example["rejected"]``.
            Each is a list of ``{"role": ..., "content": ...}`` dictionaries.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``.
        task: One of ``"sft"``, ``"generation"``, ``"rm"`` or ``"dpo"``.
        assistant_prefix: Text removed from the start of the DPO completions so they
            begin with the assistant's reply rather than its role header.

    Returns:
        The same ``example`` dictionary with added keys: ``"text"`` (sft/generation),
        ``"text_chosen"``/``"text_rejected"`` (rm), or additionally ``"text_prompt"`` (dpo).
        For sft/generation/rm the message lists are modified in place when an empty
        system message has to be prepended.

    Raises:
        ValueError: If ``task`` is unknown, or ``chosen``/``rejected`` are missing
            for the ``"rm"``/``"dpo"`` tasks.
        IndexError: For ``"dpo"`` when ``chosen`` contains no user message.
    """
    def _strip_prefix(s, pattern):
        # Use re.escape to escape any special characters in the pattern
        return re.sub(f"^{re.escape(pattern)}", "", s)

    def _ensure_system_message(messages):
        # Prepend an empty system message (in place) if the conversation has none.
        if messages[0]["role"] != "system":
            messages.insert(0, {"role": "system", "content": ""})

    if task in ["sft", "generation"]:
        messages = example["messages"]
        _ensure_system_message(messages)
        example["text"] = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=(task == "generation"))

    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
            _ensure_system_message(chosen_messages)
            _ensure_system_message(rejected_messages)
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
        else:
            raise ValueError(f"Require `[chosen, rejected]` keys but found {list(example.keys())}")

    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen = example["chosen"]
            rejected = example["rejected"]

            # Position of the first user turn; raises IndexError when there is none.
            first_user = [i for i, msg in enumerate(chosen) if msg["role"] == "user"][0]

            # The prompt is the (possibly empty) system message plus the first user turn.
            if chosen[0]["role"] != "system":
                system_message = {"role": "system", "content": ""}
            else:
                system_message = chosen[0]
            prompt_messages = [system_message, chosen[first_user]]

            # Completions start right after the prompt's user turn. Slicing from a fixed
            # index 1 would repeat the user prompt inside the completions whenever the
            # conversation starts with a system message.
            # Assumes `rejected` shares the same leading messages as `chosen`.
            chosen_messages = chosen[first_user + 1:]
            rejected_messages = rejected[first_user + 1:]

            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
            # The prompt already ends with the assistant header, so drop it from the completions.
            example["text_chosen"] = _strip_prefix(example["text_chosen"], assistant_prefix)
            example["text_rejected"] = _strip_prefix(example["text_rejected"], assistant_prefix)
        else:
            raise ValueError(f"Require `[chosen, rejected]` keys but found {list(example.keys())}")
    else:
        raise ValueError(f"Ensure provided task is one of {['sft', 'generation', 'rm', 'dpo']}")

    return example

# Render every conversation into a single "text" column using the SFT branch of apply_chat_template.
raw_datasets = raw_datasets.map(
    apply_chat_template, fn_kwargs={"tokenizer": tokenizer, "task": "sft"}
)

print('\n')
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]
# Spot-check one formatted training example.
print(train_dataset[1]['text'])



<|system|>
</s>
<|user|>
Which famous landmarks should I visit in London, beyond the usual ones?</s>
<|assistant|>
1. Leadenhall Market - a beautiful indoor market with stunning Victorian architecture, also used as a filming location in the Harry Potter films.

2. St. Dunstan in the East - a ruined church in the middle of the city that has been turned into a beautiful public garden.

3. The Monument - a 202-foot-tall column commemorating the Great Fire of London, with a staircase leading to a viewing platform offering great views of the city.

4. The Camden Town Markets - an eclectic collection of markets offering food, fashion, and vintage items, plus live music and street performers.

5. Novelist's House - the former home of Charles Dickens, now a museum dedicated to his life and works.

6. The Old Operating Theatre - a museum housed in the oldest surviving operating theatre in Europe, with exhibits on the history of surgery and medical practices.

7. The Churchill War Rooms - an u

## 1.3 Quantization

In [None]:
def get_quantization_config(load_in_4bit=False, load_in_8bit=False):
    """Build the bitsandbytes configuration for the requested precision.

    Args:
        load_in_4bit: Request 4-bit NF4 quantization with float16 compute and
            no double quantization. Takes precedence over ``load_in_8bit``.
        load_in_8bit: Request 8-bit quantization.

    Returns:
        A ``BitsAndBytesConfig``, or ``None`` when neither flag is set.
    """
    if load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type='nf4',  # (fp4 or nf4)
            bnb_4bit_use_double_quant=False,
        )

    if load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)

    # No quantization requested.
    return None

# "choices": ["auto", "bfloat16", "float16", "float32"]
torch_dtype = "bfloat16"
# "auto"/None are passed through unchanged; any other name is resolved to the torch dtype attribute.
torch_dtype = torch_dtype if torch_dtype in ["auto", None] else getattr(torch, torch_dtype)
# 4-bit quantization; see get_quantization_config for the exact settings.
# NOTE(review): the 4-bit compute dtype there is float16 while torch_dtype here is bfloat16 — confirm this mix is intended.
quantization_config = get_quantization_config(load_in_4bit=True)

## 1.4 Model Loading

In [None]:
def get_current_device():
    """Return the local process index when CUDA is available, otherwise ``"cpu"``."""
    if not torch.cuda.is_available():
        return "cpu"
    # A new Accelerator is created just to read this process's local index.
    return Accelerator().local_process_index

def get_kbit_device_map():
    """Return a device map placing the whole model on the current device, or ``None`` without CUDA."""
    if torch.cuda.is_available():
        # The empty-string key maps the entire model to a single device.
        return {"": get_current_device()}
    return None

# Keyword arguments forwarded to from_pretrained for the base model.
model_kwargs = dict(
    revision='main',
    trust_remote_code=False,
    use_flash_attention_2=False,
    torch_dtype=torch_dtype,
    # The KV cache is switched off for training; it is re-enabled on the saved config after training.
    use_cache=False,
    # An explicit device map is only supplied when the model is quantized.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

# Load the base model that will be fine-tuned with LoRA.
model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-v0.1',
    **model_kwargs
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 1.5 Model Training

In [None]:
def get_peft_config(
        use_peft, lora_r, lora_alpha, lora_dropout,
        lora_target_modules, lora_modules_to_save
    ):
    """Create the LoRA configuration, or ``None`` when PEFT is disabled.

    Args:
        use_peft: Only the literal ``False`` disables PEFT (identity check).
        lora_r: Passed as ``r`` to ``LoraConfig``.
        lora_alpha: Passed as ``lora_alpha``.
        lora_dropout: Passed as ``lora_dropout``.
        lora_target_modules: Passed as ``target_modules``.
        lora_modules_to_save: Passed as ``modules_to_save``.

    Returns:
        A causal-LM ``LoraConfig`` without bias training, or ``None``.
    """
    if use_peft is False:
        return None

    lora_settings = {
        "r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "target_modules": lora_target_modules,
        "modules_to_save": lora_modules_to_save,
    }
    return LoraConfig(**lora_settings)

# LoRA hyper-parameters for SFT: rank-4 adapters on the four attention projection layers.
lora_args = dict(
    use_peft=True,
    lora_r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_modules_to_save=None
)

peft_config = get_peft_config(**lora_args)

In [None]:
# Gradient checkpointing significantly reduces memory use at the cost of a small decrease in training speed.
training_args = TrainingArguments(
    bf16=False,
    evaluation_strategy='epoch',
    # 2 samples per device x 128 accumulation steps = 256 samples per optimizer step.
    gradient_accumulation_steps=128,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    learning_rate=2.0e-05,
    log_level='info',
    logging_steps=5,
    logging_strategy='steps',
    lr_scheduler_type='cosine',
    max_steps=-1,
    num_train_epochs=1,
    output_dir='data/zephyr-7b-sft-lora',
    overwrite_output_dir=True,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=2,
    # No intermediate checkpoints; the model is saved explicitly in a later cell.
    save_strategy="no",
    seed=42,
)

# The model is passed already instantiated, so no model_init_kwargs are given.
trainer = SFTTrainer(
    model=model,
    # model_init_kwargs=model_kwargs,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Column produced by apply_chat_template in the tokenization section.
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    packing=True,
    peft_config=peft_config,
)



In [None]:
train_result = trainer.train()
metrics = train_result.metrics
# train_dataset is not subsampled anywhere in this notebook, so there is no cap to apply.
# A hard-coded cap here would record a sample count smaller than what was actually trained on.
max_train_samples = None
metrics["train_samples"] = (
    len(train_dataset)
    if max_train_samples is None
    else min(max_train_samples, len(train_dataset))
)
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** Running training *****
  Num examples = 20,786
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 128
  Total optimization steps = 81
  Number of trainable parameters = 3,407,872
Token indices sequence length is longer than the specified maximum sequence length for this model (3400 > 2048). Running this sequence through the model will result in indexing errors
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


## 1.6 Evaluation and Model Saving

In [None]:
metrics = trainer.evaluate()
# eval_dataset is evaluated in full, so record its real size; a hard-coded cap would
# under-report the number of evaluated samples.
max_eval_samples = None
metrics["eval_samples"] = (
    len(eval_dataset)
    if max_eval_samples is None
    else min(max_eval_samples, len(eval_dataset))
)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
# Write the trained model to the output directory configured in training_args.
trainer.save_model(training_args.output_dir)
logger.info(f"Model saved to {training_args.output_dir}")

In [None]:
# Only the main process writes the model card and the updated config.
if accelerator.is_main_process:
    kwargs = {
        "finetuned_from": 'mistralai/Mistral-7B-v0.1',
        "dataset": list(dataset_config.keys()),
        "dataset_tags": list(dataset_config.keys()),
        "tags": ["alignment-handbook"],
    }
    trainer.create_model_card(**kwargs)

    # Restore k,v cache for fast inference
    trainer.model.config.use_cache = True
    trainer.model.config.save_pretrained(training_args.output_dir)

# Barrier: every process waits here until the main process has finished saving.
accelerator.wait_for_everyone()

# 2) DPO

## 2.1 Dataset Loading

In [None]:
# Checkpoint that DPO starts from; the model-loading cell checks whether it is a LoRA adapter repo.
model_name = 'alignment-handbook/zephyr-7b-sft-lora'
# When True, a LoRA config is built for DPO and no separate reference model is passed to the trainer.
use_peft = True

In [None]:
def mix_datasets(d_config, splits, shuffle):
    """Load several datasets and mix them into a single ``DatasetDict``.

    Args:
        d_config: Mapping of dataset identifier to the fraction of each of its
            *train* splits to keep, e.g. ``{"org/name": 0.1}``. The identifier
            is first passed to ``load_dataset``; if that fails it is treated as
            a directory containing one ``save_to_disk`` folder per split.
        splits: Split names to load for every dataset. A name containing
            ``"train"`` goes into the training mix, one containing ``"test"``
            into the evaluation mix.
        shuffle: When true, each mixed split is shuffled with a fixed seed (42).

    Returns:
        A ``DatasetDict`` holding a ``"train"`` and/or ``"test"`` entry,
        depending on which kinds of split were requested. Test splits are
        never subsampled.

    Raises:
        ValueError: If a split name contains neither ``"train"`` nor ``"test"``.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    # One fraction per *loaded train split* (not per dataset), so the zip below
    # stays aligned even when a dataset contributes several train splits.
    train_fracs = []

    for ds, frac in d_config.items():
        for split in splits:
            try:
                dataset = load_dataset(ds, split=split)
            except Exception:
                # Not loadable by name: fall back to a dataset saved on disk.
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
                train_fracs.append(frac)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if len(raw_train_datasets) > 0:
        # Keep the leading `frac` portion of every train split.
        train_subsets = [
            dataset.select(range(int(frac * len(dataset))))
            for dataset, frac in zip(raw_train_datasets, train_fracs)
        ]
        train_mix = concatenate_datasets(train_subsets)
        raw_datasets["train"] = train_mix.shuffle(seed=42) if shuffle else train_mix

    if len(raw_val_datasets) > 0:
        test_mix = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = test_mix.shuffle(seed=42) if shuffle else test_mix

    return raw_datasets

In [None]:
# Keep 0.1% of the preference training split; the test split is loaded in full.
dataset_config = {
    "HuggingFaceH4/ultrafeedback_binarized": 0.001
}
raw_datasets = mix_datasets(dataset_config, splits=['train_prefs', 'test_prefs'], shuffle=False)
# Remember the original columns so they can be dropped once the text fields are generated.
column_names = list(raw_datasets["train"].features)
print(raw_datasets)
print(column_names)

# Inspect one preference pair: the prompt, both completions (first message skipped) and their scores.
idx = 200
print(raw_datasets['test'][idx]['prompt'])
print(raw_datasets['test'][idx]['chosen'][1:])
print(raw_datasets['test'][idx]['rejected'][1:])
print(raw_datasets['test'][idx]['score_chosen'])
print(raw_datasets['test'][idx]['score_rejected'])

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 1239
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 2000
    })
})
['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected']
Please answer the following question: Here's a logic test: Alan was playing with a ball. He noticed that when he rolled the ball across the tile floor, it moved easier than it did on the carpet. This is because the carpet is (A) rougher (B) smoother.  Choose the answer between "tile floor" and "carpet".
A:
[{'content': 'carpet', 'role': 'assistant'}]
[{'content': 'B:', 'role': 'assistant'}]
7.0
2.0


## 2.2 Tokenization

In [None]:
# Jinja chat template: every user/system/assistant message is rendered as a `<|role|>` header
# line, the message content and the EOS token; when `add_generation_prompt` is set, an
# `<|assistant|>` header is emitted after the last message.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

def get_tokenizer(model_name_or_path, truncation_side=None):
    """Load the tokenizer for a checkpoint and prepare it for chat formatting.

    Args:
        model_name_or_path: Identifier passed to ``AutoTokenizer.from_pretrained``
            (the ``main`` revision is always requested).
        truncation_side: Optional truncation side to set on the tokenizer;
            left unchanged when falsy.

    Returns:
        The tokenizer with a pad token id, a capped ``model_max_length`` and
        ``DEFAULT_CHAT_TEMPLATE`` installed as its chat template.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path, revision='main')

    # Without a pad token, reuse the end-of-sequence token for padding.
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

    # Only override the truncation side when the caller asked for one.
    if truncation_side:
        tok.truncation_side = truncation_side

    # Values above 100k are replaced with a usable default of 2048.
    max_len_is_unset = tok.model_max_length > 100_000
    if max_len_is_unset:
        tok.model_max_length = 2048

    # The notebook's template always replaces whatever the checkpoint shipped with.
    tok.chat_template = DEFAULT_CHAT_TEMPLATE
    return tok

# Tokenizer of the DPO starting checkpoint, configured to truncate from the left.
tokenizer = get_tokenizer(model_name, truncation_side='left')

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [None]:
def apply_chat_template(example, tokenizer, task="sft", assistant_prefix="<|assistant|>\n"):
    """Render one dataset row into plain-text fields with the tokenizer's chat template.

    Args:
        example: Row dictionary. ``"sft"``/``"generation"`` read ``example["messages"]``;
            ``"rm"``/``"dpo"`` read ``example["chosen"]`` and ``example["rejected"]``.
            Each is a list of ``{"role": ..., "content": ...}`` dictionaries.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``.
        task: One of ``"sft"``, ``"generation"``, ``"rm"`` or ``"dpo"``.
        assistant_prefix: Text removed from the start of the DPO completions so they
            begin with the assistant's reply rather than its role header.

    Returns:
        The same ``example`` dictionary with added keys: ``"text"`` (sft/generation),
        ``"text_chosen"``/``"text_rejected"`` (rm), or additionally ``"text_prompt"`` (dpo).
        For sft/generation/rm the message lists are modified in place when an empty
        system message has to be prepended.

    Raises:
        ValueError: If ``task`` is unknown, or ``chosen``/``rejected`` are missing
            for the ``"rm"``/``"dpo"`` tasks.
        IndexError: For ``"dpo"`` when ``chosen`` contains no user message.
    """
    def _strip_prefix(s, pattern):
        # Use re.escape to escape any special characters in the pattern
        return re.sub(f"^{re.escape(pattern)}", "", s)

    def _ensure_system_message(messages):
        # Prepend an empty system message (in place) if the conversation has none.
        if messages[0]["role"] != "system":
            messages.insert(0, {"role": "system", "content": ""})

    if task in ["sft", "generation"]:
        messages = example["messages"]
        _ensure_system_message(messages)
        example["text"] = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=(task == "generation"))

    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
            _ensure_system_message(chosen_messages)
            _ensure_system_message(rejected_messages)
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
        else:
            raise ValueError(f"Require `[chosen, rejected]` keys but found {list(example.keys())}")

    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen = example["chosen"]
            rejected = example["rejected"]

            # Position of the first user turn; raises IndexError when there is none.
            first_user = [i for i, msg in enumerate(chosen) if msg["role"] == "user"][0]

            # The prompt is the (possibly empty) system message plus the first user turn.
            if chosen[0]["role"] != "system":
                system_message = {"role": "system", "content": ""}
            else:
                system_message = chosen[0]
            prompt_messages = [system_message, chosen[first_user]]

            # Completions start right after the prompt's user turn. Slicing from a fixed
            # index 1 would repeat the user prompt inside the completions whenever the
            # conversation starts with a system message.
            # Assumes `rejected` shares the same leading messages as `chosen`.
            chosen_messages = chosen[first_user + 1:]
            rejected_messages = rejected[first_user + 1:]

            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
            # The prompt already ends with the assistant header, so drop it from the completions.
            example["text_chosen"] = _strip_prefix(example["text_chosen"], assistant_prefix)
            example["text_rejected"] = _strip_prefix(example["text_rejected"], assistant_prefix)
        else:
            raise ValueError(f"Require `[chosen, rejected]` keys but found {list(example.keys())}")
    else:
        raise ValueError(f"Ensure provided task is one of {['sft', 'generation', 'rm', 'dpo']}")

    return example

# Format every preference pair with the DPO branch; the original columns are dropped so
# only the generated text_* fields remain.
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
    num_proc=12,
    remove_columns=column_names,
    desc="Formatting comparisons with prompt template",
)

# Rename the generated fields to prompt / chosen / rejected.
for split in ["train", "test"]:
    raw_datasets[split] = raw_datasets[split].rename_columns(
        {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
    )

print('\n')
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]

# Spot-check one formatted training triple.
idx = 1010
print("Prompt: ", train_dataset[idx]['prompt'])
print("Choose Example: ", train_dataset[idx]['chosen'])
print("Rejected Example: ",train_dataset[idx]['rejected'])

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/1239 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/2000 [00:00<?, ? examples/s]



Prompt:  <|system|>
</s>
<|user|>
Can you summarize the features and specifications of the Double Head Nibble Metal Cutting Sheet Nibbler Saw Cutter 360 Degree Adjustable Drill Attachment Power Tool Accessories Cutting Tools - intl?: Recent double head nibble metal cutting sheet nibbler saw cutter 360 degree adjustable drill attachment power tool accessories cutting tools intl, the product is a popular item this coming year. the product is really a new item sold by Ttech store and shipped from China. Double Head Nibble Metal Cutting Sheet Nibbler Saw Cutter 360 Degree Adjustable Drill Attachment Power Tool Accessories Cutting Tools - intl is sold at lazada.sg having a inexpensive cost of SGD16.34 (This price was taken on 15 June 2018, please check the latest price here). what are features and specifications this Double Head Nibble Metal Cutting Sheet Nibbler Saw Cutter 360 Degree Adjustable Drill Attachment Power Tool Accessories Cutting Tools - intl, let's see the facts below.
Strai

## 2.3 Quantization

In [None]:
def get_quantization_config(load_in_4bit=False, load_in_8bit=False):
    """Build the bitsandbytes configuration for the requested precision.

    Args:
        load_in_4bit: Request 4-bit NF4 quantization with float16 compute and
            no double quantization. Takes precedence over ``load_in_8bit``.
        load_in_8bit: Request 8-bit quantization.

    Returns:
        A ``BitsAndBytesConfig``, or ``None`` when neither flag is set.
    """
    if load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type='nf4',  # (fp4 or nf4)
            bnb_4bit_use_double_quant=False,
        )

    if load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)

    # No quantization requested.
    return None

# "choices": ["auto", "bfloat16", "float16", "float32"]
torch_dtype = "auto"
# "auto"/None are passed through unchanged; any other name is resolved to the torch dtype attribute.
torch_dtype = torch_dtype if torch_dtype in ["auto", None] else getattr(torch, torch_dtype)
# 4-bit quantization; see get_quantization_config for the exact settings.
quantization_config = get_quantization_config(load_in_4bit=True)

## 2.4 Model Loading

In [None]:
def get_current_device():
    """Return the local process index when CUDA is available, otherwise ``"cpu"``."""
    if not torch.cuda.is_available():
        return "cpu"
    # A new Accelerator is created just to read this process's local index.
    return Accelerator().local_process_index

def get_kbit_device_map():
    """Return a device map placing the whole model on the current device, or ``None`` without CUDA."""
    if torch.cuda.is_available():
        # The empty-string key maps the entire model to a single device.
        return {"": get_current_device()}
    return None

# Keyword arguments forwarded to from_pretrained when the DPO model is loaded.
model_kwargs = dict(
    revision='main',
    trust_remote_code=False,
    use_flash_attention_2=False,
    torch_dtype=torch_dtype,
    # The KV cache is switched off for training; it is re-enabled on the saved config after training.
    use_cache=False,
    # An explicit device map is only supplied when the model is quantized.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

In [None]:
# Note: to run QLoRA on top of an SFT adapter, the adapter must first be merged into its base model.
# Below: load the base model, attach the adapter, merge its weights into the base model and unload the adapter.

def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
    """Tell whether a checkpoint contains PEFT adapter weights.

    Args:
        model_name_or_path: Hub repository id, or a local directory.
        revision: Hub revision whose file list is inspected.

    Returns:
        True if ``adapter_model.safetensors`` or ``adapter_model.bin`` is among
        the checkpoint's files.

    Raises:
        OSError: If the Hub lookup fails and ``model_name_or_path`` is not a
            readable local directory either.
    """
    try:
        repo_files = list_repo_files(model_name_or_path, revision=revision)
    except Exception:
        # Hub lookup failed: treat the argument as a local directory. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        repo_files = os.listdir(model_name_or_path)
    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files

if is_adapter_model(model_name, 'main'):
    # The checkpoint holds only adapter weights: load the base model it was trained on,
    # attach the adapter, then merge it into the base weights and drop the PEFT wrapper.
    peft_config = PeftConfig.from_pretrained(model_name, revision='main')
    base_model = AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,
        **model_kwargs,
    )
    model = PeftModel.from_pretrained(base_model, model_name, revision='main')
    model.eval()
    model = model.merge_and_unload()
else:
    # Full checkpoint: load it directly, so this cell never depends on a `model`
    # variable left over from Section 1 (or fails with NameError on a fresh kernel).
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# The model is now instantiated, so the trainer must not receive init kwargs for it.
# Note: this makes the cell single-shot; re-run the `model_kwargs` cell before re-running it.
model_kwargs = None

ref_model = model
ref_model_kwargs = model_kwargs

# When training with PEFT no separate reference model is passed to the trainer:
# the model with its adapters turned off serves as the reference.
if use_peft is True:
    ref_model = None
    ref_model_kwargs = None

## 2.5 Model Training

In [None]:
def get_peft_config(
        use_peft, lora_r, lora_alpha, lora_dropout,
        lora_target_modules, lora_modules_to_save
    ):
    """Create the LoRA configuration, or ``None`` when PEFT is disabled.

    Args:
        use_peft: Only the literal ``False`` disables PEFT (identity check).
        lora_r: Passed as ``r`` to ``LoraConfig``.
        lora_alpha: Passed as ``lora_alpha``.
        lora_dropout: Passed as ``lora_dropout``.
        lora_target_modules: Passed as ``target_modules``.
        lora_modules_to_save: Passed as ``modules_to_save``.

    Returns:
        A causal-LM ``LoraConfig`` without bias training, or ``None``.
    """
    if use_peft is False:
        return None

    lora_settings = {
        "r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "target_modules": lora_target_modules,
        "modules_to_save": lora_modules_to_save,
    }
    return LoraConfig(**lora_settings)

# LoRA hyper-parameters for DPO: rank-16 adapters on the four attention projection layers.
lora_args = dict(
    use_peft=use_peft,
    lora_r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_modules_to_save=None
)

peft_config = get_peft_config(**lora_args)
# NOTE(review): presumably needed so gradients reach the adapters with gradient checkpointing
# on a frozen/quantized base model — confirm.
model.enable_input_require_grads()

In [None]:
training_args = TrainingArguments(
    bf16=False,
    evaluation_strategy='epoch',
    # NOTE(review): eval_steps likely has no effect with evaluation_strategy='epoch' — confirm.
    eval_steps=100,
    # 2 samples per device x 32 accumulation steps = 64 samples per optimizer step.
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    learning_rate=5.0e-7,
    log_level='info',
    logging_steps=10,
    lr_scheduler_type='linear',
    num_train_epochs=1,
    optim='rmsprop',
    output_dir='data/zephyr-7b-dpo-lora',
    per_device_eval_batch_size=4,
    per_device_train_batch_size=2,
    # No intermediate checkpoints; the model is saved explicitly in a later cell.
    save_strategy="no",
    seed=42,
    warmup_ratio=0.1,
)

# model / ref_model and their init kwargs come from the model-loading cell above;
# with use_peft=True, ref_model and both kwargs are None there.
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    model_init_kwargs=model_kwargs,
    ref_model_init_kwargs=ref_model_kwargs,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_length=1024,
    max_prompt_length=None,
    peft_config=peft_config,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
train_result = dpo_trainer.train()
metrics = train_result.metrics
# The training split is used in full, so record its real size. A hard-coded cap would
# under-report the sample count as soon as the split grows beyond it.
max_train_samples = None
metrics["train_samples"] = (
    len(raw_datasets["train"])
    if max_train_samples is None
    else min(max_train_samples, len(raw_datasets["train"]))
)
dpo_trainer.log_metrics("train", metrics)
dpo_trainer.save_metrics("train", metrics)
dpo_trainer.save_state()

## 2.6 Evaluation

In [None]:
logger.info("*** Evaluate ***")
metrics = dpo_trainer.evaluate()
# The whole test split is evaluated, so record its real size; a hard-coded cap of 200
# would under-report the number of evaluated samples.
max_eval_samples = None
metrics["eval_samples"] = (
    len(raw_datasets["test"])
    if max_eval_samples is None
    else min(max_eval_samples, len(raw_datasets["test"]))
)
dpo_trainer.log_metrics("eval", metrics)
dpo_trainer.save_metrics("eval", metrics)

In [None]:
# Write the trained model to the output directory configured in training_args.
dpo_trainer.save_model(training_args.output_dir)
# Save everything else on main process
if accelerator.is_main_process:
    kwargs = {
        "finetuned_from": model_name,
        "dataset": list(dataset_config.keys()),
        "dataset_tags": list(dataset_config.keys()),
        "tags": ["alignment-handbook"],
    }
    dpo_trainer.create_model_card(**kwargs)

    # Restore k,v cache for fast inference
    dpo_trainer.model.config.use_cache = True
    dpo_trainer.model.config.save_pretrained(training_args.output_dir)

# Ensure we don't timeout on model save / push to Hub
accelerator.wait_for_everyone()

Saving model checkpoint to data/zephyr-7b-dpo-lora
tokenizer config file saved in data/zephyr-7b-dpo-lora/tokenizer_config.json
Special tokens file saved in data/zephyr-7b-dpo-lora/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'dataset': {'name': 'HuggingFaceH4/ultrafeedback_binarized', 'type': 'HuggingFaceH4/ultrafeedback_binarized', 'config': None, 'split': 'None'}}
Configuration saved in data/zephyr-7b-dpo-lora/config.json


# 3) DPO Implementation from scratch

[Original Code](https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py#L194)

## 3.1 Data Loading

This section uses the same dataset as Section 2.

In [None]:
# Model used for the from-scratch DPO implementation in this section.
model_name = 'microsoft/phi-2'
# Rebinds the module-level `accelerator` created in the imports cell.
accelerator = Accelerator()

In [None]:
def mix_datasets(d_config, splits, shuffle):
    """Load several datasets and mix them into a single ``DatasetDict``.

    Args:
        d_config: Mapping of dataset identifier to the fraction of each of its
            *train* splits to keep, e.g. ``{"org/name": 0.1}``. The identifier
            is first passed to ``load_dataset``; if that fails it is treated as
            a directory containing one ``save_to_disk`` folder per split.
        splits: Split names to load for every dataset. A name containing
            ``"train"`` goes into the training mix, one containing ``"test"``
            into the evaluation mix.
        shuffle: When true, each mixed split is shuffled with a fixed seed (42).

    Returns:
        A ``DatasetDict`` holding a ``"train"`` and/or ``"test"`` entry,
        depending on which kinds of split were requested. Test splits are
        never subsampled.

    Raises:
        ValueError: If a split name contains neither ``"train"`` nor ``"test"``.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    # One fraction per *loaded train split* (not per dataset), so the zip below
    # stays aligned even when a dataset contributes several train splits.
    train_fracs = []

    for ds, frac in d_config.items():
        for split in splits:
            try:
                dataset = load_dataset(ds, split=split)
            except Exception:
                # Not loadable by name: fall back to a dataset saved on disk.
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
                train_fracs.append(frac)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if len(raw_train_datasets) > 0:
        # Keep the leading `frac` portion of every train split.
        train_subsets = [
            dataset.select(range(int(frac * len(dataset))))
            for dataset, frac in zip(raw_train_datasets, train_fracs)
        ]
        train_mix = concatenate_datasets(train_subsets)
        raw_datasets["train"] = train_mix.shuffle(seed=42) if shuffle else train_mix

    if len(raw_val_datasets) > 0:
        test_mix = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = test_mix.shuffle(seed=42) if shuffle else test_mix

    return raw_datasets

# Load the preference data from the Hugging Face Hub, keeping only 0.02% of the
# training split; the test split is loaded in full.
dataset_config = {
    "HuggingFaceH4/ultrafeedback_binarized": 0.0002
}
raw_datasets = mix_datasets(dataset_config, splits=['train_prefs', 'test_prefs'], shuffle=False)
# Remember the original columns of the training split.
column_names = list(raw_datasets["train"].features)
print(raw_datasets)
print(column_names)

# Inspect one preference pair: the prompt, both completions (first message skipped) and their scores.
idx = 200
print(raw_datasets['test'][idx]['prompt'])
print(raw_datasets['test'][idx]['chosen'][1:])
print(raw_datasets['test'][idx]['rejected'][1:])
print(raw_datasets['test'][idx]['score_chosen'])
print(raw_datasets['test'][idx]['score_rejected'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 12
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 2000
    })
})
['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected']
Please answer the following question: Here's a logic test: Alan was playing with a ball. He noticed that when he rolled the ball across the tile floor, it moved easier than it did on the carpet. This is because the carpet is (A) rougher (B) smoother.  Choose the answer between "tile floor" and "carpet".
A:
[{'content': 'carpet', 'role': 'assistant'}]
[{'content': 'B:', 'role': 'assistant'}]
7.0
2.0


## 3.2 Data Processing

In [None]:
# Jinja chat template: every user/system/assistant message is rendered as a `<|role|>` header
# line, the message content and the EOS token; when `add_generation_prompt` is set, an
# `<|assistant|>` header is emitted after the last message.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

def get_tokenizer(model_name_or_path, truncation_side=None):
    """Load the `main`-revision tokenizer and prepare it for chat formatting.

    Args:
        model_name_or_path: identifier passed to `AutoTokenizer.from_pretrained`.
        truncation_side: when truthy, stored as the tokenizer's `truncation_side`.

    Returns:
        The tokenizer, with a pad token id guaranteed to be set (EOS is reused
        when none exists), `model_max_length` capped to 2048 when the stored
        value exceeds 100_000, and `DEFAULT_CHAT_TEMPLATE` installed as its
        chat template (overwriting any template it had).
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path, revision='main')

    # No dedicated pad token: reuse EOS so batches can still be padded.
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

    if truncation_side:
        tok.truncation_side = truncation_side

    # Values above 100_000 are treated as "no real limit configured".
    if tok.model_max_length > 100_000:
        tok.model_max_length = 2048

    tok.chat_template = DEFAULT_CHAT_TEMPLATE
    return tok

# Notebook-level tokenizer used by the following cells; `get_tokenizer` stores
# `truncation_side='left'` on it and installs DEFAULT_CHAT_TEMPLATE.
tokenizer = get_tokenizer(model_name, truncation_side='left')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def apply_chat_template(example, tokenizer, task="sft", assistant_prefix="<|assistant|>\n"):
    """Render one preference example into prompt / chosen / rejected text.

    The prompt is built from a system message (the conversation's own leading
    system message, or an empty one) plus the first user message of
    `example["chosen"]`, rendered with a generation prompt. The chosen and
    rejected texts are the remaining messages of each conversation with the
    leading `assistant_prefix` removed.

    Args:
        example: mapping with `chosen` and `rejected` lists of
            `{"role": ..., "content": ...}` messages. It is updated in place.
        tokenizer: object exposing `apply_chat_template(messages, tokenize=False,
            add_generation_prompt=...)`.
        task: accepted for interface compatibility; not consulted here.
        assistant_prefix: text stripped from the start of the rendered responses.

    Returns:
        `example`, with `text_chosen`, `text_rejected` and `text_prompt` added.

    Raises:
        ValueError: if `example` lacks the `chosen` or `rejected` key.
    """
    if not all(k in example.keys() for k in ("chosen", "rejected")):
        raise ValueError(f"Require `[chosen, rejected]` keys but found {list(example.keys())}")

    def _strip_prefix(text, prefix):
        # Remove `prefix` only when it sits at the very start of `text`.
        return text[len(prefix):] if text.startswith(prefix) else text

    def _response_messages(messages):
        # The response starts after the first message, or after the second one
        # when the conversation opens with a system message. Without this, a
        # leading system message leaves the user turn inside the "response",
        # duplicating it after the prompt and defeating the prefix strip below.
        start = 2 if messages[0]["role"] == "system" else 1
        return messages[start:]

    chosen = example["chosen"]
    rejected = example["rejected"]

    # compared to reward modeling, filter out the prompt
    first_user_message = [msg for msg in chosen if msg["role"] == "user"][0]
    if chosen[0]["role"] != "system":
        system_message = {"role": "system", "content": ""}
    else:
        system_message = chosen[0]
    prompt_messages = [system_message, first_user_message]

    example["text_chosen"] = tokenizer.apply_chat_template(_response_messages(chosen), tokenize=False)
    example["text_rejected"] = tokenizer.apply_chat_template(_response_messages(rejected), tokenize=False)
    example["text_prompt"] = tokenizer.apply_chat_template(
        prompt_messages, tokenize=False, add_generation_prompt=True
    )
    example["text_chosen"] = _strip_prefix(example["text_chosen"], assistant_prefix)
    example["text_rejected"] = _strip_prefix(example["text_rejected"], assistant_prefix)

    return example

raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs=dict(tokenizer=tokenizer, task="dpo"),
    num_proc=12,
    remove_columns=column_names,
    desc="Formatting comparisons with prompt template",
)

# Expose the rendered text under the plain `prompt` / `chosen` / `rejected` names.
text_column_renames = {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
for split in ("train", "test"):
    raw_datasets[split] = raw_datasets[split].rename_columns(text_column_renames)

print('\n')
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

idx = 11
formatted_example = train_dataset[idx]
print("Prompt: ", formatted_example['prompt'])
print("Choose Example: ", formatted_example['chosen'])
print("Rejected Example: ", formatted_example['rejected'])



Prompt:  <|system|>
<|endoftext|>
<|user|>
What are cardigans made of? Leather or wood?<|endoftext|>
<|assistant|>

Choose Example:  Cardigans are not typically made of leather or wood. Instead, they are made from various types of fabrics, such as wool, cotton, and synthetic materials like polyester or nylon. Some cardigans may also be made from a blend of different materials for added comfort and durability. The choice of fabric depends on factors such as the desired texture, weight, and seasonality (e.g., thicker, warmer materials for colder weather).<|endoftext|>

Rejected Example:  Cardigans are typically made of wool, cotton, or a combination of both.<|endoftext|>



## 3.3 Tokenization

In [None]:
def build_tokenized_answer(tokenizer, prompt, answer):
    """
    Tokenize `prompt + answer` once and split the result into prompt and answer.

    Tokenizers do not in general satisfy `enc(a + b) = enc(a) + enc(b)`, so the
    two pieces are cut out of the joint encoding instead of being encoded
    separately. If the prompt encoded on its own disagrees with the head of the
    joint encoding, the split point is moved one token to the left.

    Returns a dict with `prompt_input_ids`, `prompt_attention_mask`,
    `input_ids` and `attention_mask` (the last two describe the answer).
    Raises ValueError when the standalone prompt encoding is longer than the
    joint encoding.
    """
    joint = tokenizer(prompt + answer, add_special_tokens=False)
    standalone_prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    joint_ids = joint["input_ids"]
    joint_mask = joint["attention_mask"]

    split_at = len(standalone_prompt_ids)

    # Sanity check: standalone prompt ids + tail of the joint encoding must be
    # exactly as long as the joint encoding itself.
    rebuilt_ids = np.concatenate([standalone_prompt_ids, joint_ids[split_at:]])
    if len(np.array(joint_ids)) != len(rebuilt_ids):
        raise ValueError("Prompt input ids and answer input ids should have the same length.")

    # A mismatch with the head of the joint encoding means the last prompt
    # token was merged with the answer, so it is handed over to the answer.
    if standalone_prompt_ids != joint_ids[:split_at]:
        split_at -= 1

    return dict(
        prompt_input_ids=joint_ids[:split_at],
        prompt_attention_mask=joint_mask[:split_at],
        input_ids=joint_ids[split_at:],
        attention_mask=joint_mask[split_at:],
    )

def tokenize_row(
    feature, tokenizer, model=None, max_length=4096, is_encoder_decoder=False,
    truncation_mode='keep_end', max_prompt_length=2048, max_target_length=2048):
    """
    Tokenize a single row from a DPO specific dataset.

    Decoder-only path (`is_encoder_decoder=False`):
      * the prompt and both responses are tokenized without special tokens;
      * BOS is prepended to every prompt and EOS appended to every response,
        unless the token is already there or the tokenizer defines no such id;
      * if prompt + longer response exceeds `max_length`, the prompt is cut to
        `max_prompt_length` (from the start or the end, per `truncation_mode`);
        if that is still too long, responses are cut to
        `max_length - max_prompt_length`;
      * labels copy the input ids with the prompt positions set to -100.

    Encoder-decoder path: prompt and responses are tokenized with truncation to
    `max_prompt_length` / `max_target_length`; the responses become the labels
    and, when `model` provides `prepare_decoder_input_ids_from_labels`, decoder
    input ids are derived from them.

    Args:
        feature: mapping with string entries `prompt`, `chosen`, `rejected`.
        tokenizer: callable tokenizer exposing `bos_token_id` / `eos_token_id`.
        model: optional model, only consulted on the encoder-decoder path.

    Returns:
        dict with `chosen_*`, `rejected_*` and `prompt_*` entries.

    Raises:
        ValueError: if truncation is required and `truncation_mode` is neither
            `keep_start` nor `keep_end`.
    """

    batch = {}
    prompt = feature["prompt"]
    chosen = feature["chosen"]
    rejected = feature["rejected"]

    if not is_encoder_decoder:

        # will generate 'attention mask' and 'input ids'
        prompt_tokens = tokenizer(prompt, add_special_tokens=False)
        prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
        chosen_tokens = build_tokenized_answer(tokenizer, prompt, chosen)
        rejected_tokens = build_tokenized_answer(tokenizer, prompt, rejected)

        # add BOS token to head of prompt (skipped when the tokenizer has no
        # BOS id, or the prompt already starts with it)
        bos_token_id = tokenizer.bos_token_id
        if bos_token_id is not None:
            for tokens in (prompt_tokens, chosen_tokens, rejected_tokens):
                prompt_ids = tokens["prompt_input_ids"]
                if len(prompt_ids) == 0 or prompt_ids[0] != bos_token_id:
                    tokens["prompt_input_ids"] = [bos_token_id] + prompt_ids
                    tokens["prompt_attention_mask"] = [1] + tokens["prompt_attention_mask"]

        # add EOS token to end of answer; chat-templated answers normally end
        # with EOS already, and appending a second one would train the model
        # to emit EOS twice
        eos_token_id = tokenizer.eos_token_id
        if eos_token_id is not None:
            for tokens in (chosen_tokens, rejected_tokens):
                answer_ids = tokens["input_ids"]
                if len(answer_ids) == 0 or answer_ids[-1] != eos_token_id:
                    answer_ids.append(eos_token_id)
                    tokens["attention_mask"].append(1)

        longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))

        # if combined sequence is too long, truncate the prompt
        for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
            if len(answer_tokens["prompt_input_ids"]) + longer_response_length > max_length:
                if truncation_mode == "keep_start":
                    for k in ["prompt_input_ids", "prompt_attention_mask"]:
                        answer_tokens[k] = answer_tokens[k][: max_prompt_length]
                elif truncation_mode == "keep_end":
                    for k in ["prompt_input_ids", "prompt_attention_mask"]:
                        answer_tokens[k] = answer_tokens[k][-max_prompt_length: ]
                else:
                    # an unrecognised mode used to leave the prompt untouched
                    raise ValueError(f"Unknown truncation mode: {truncation_mode}")

        # if that's still too long, truncate the response
        for answer_tokens in [chosen_tokens, rejected_tokens]:
            if len(answer_tokens["prompt_input_ids"]) + longer_response_length > max_length:
                for k in ["input_ids", "attention_mask"]:
                    answer_tokens[k] = answer_tokens[k][: max_length - max_prompt_length]

        # combine prompt_input_ids and input_ids
        chosen_sequence_tokens = {
            k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
        }
        rejected_sequence_tokens = {
            k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
        }

        # create labels: -100 for prompt token (to be ignored during loss)
        for sequence_tokens, source_tokens in (
            (chosen_sequence_tokens, chosen_tokens),
            (rejected_sequence_tokens, rejected_tokens),
        ):
            prompt_length = len(source_tokens["prompt_input_ids"])
            labels = sequence_tokens["input_ids"][:]
            labels[:prompt_length] = [-100] * prompt_length
            sequence_tokens["labels"] = labels

        # combine all:
        # 'chosen_input_ids', 'chosen_attention_mask', 'chosen_labels'
        # 'rejected_input_ids', 'rejected_attention_mask', 'rejected_labels'
        # 'prompt_input_ids', 'prompt_attention_mask'
        for k, toks in {
            "chosen_": chosen_sequence_tokens,
            "rejected_": rejected_sequence_tokens,
            "": prompt_tokens,
        }.items():
            for type_key, tokens in toks.items():
                if type_key == "token_type_ids":
                    continue
                batch[f"{k}{type_key}"] = tokens

    else:
        chosen_tokens = tokenizer(
            chosen, truncation=True, max_length=max_target_length, add_special_tokens=True
        )
        rejected_tokens = tokenizer(
            rejected, truncation=True, max_length=max_target_length, add_special_tokens=True
        )
        prompt_tokens = tokenizer(
            prompt, truncation=True, max_length=max_prompt_length, add_special_tokens=True
        )

        batch["chosen_labels"] = chosen_tokens["input_ids"]
        batch["rejected_labels"] = rejected_tokens["input_ids"]
        batch["prompt_input_ids"] = prompt_tokens["input_ids"]
        batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]

        if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
            batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
                labels=batch["rejected_labels"]
            )
            batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
                labels=batch["chosen_labels"]
            )

    return batch

In [None]:
# Tokenization settings for the decoder-only path of `tokenize_row`.
tokenize_kwargs = dict(
    tokenizer=tokenizer,
    max_length=1024,
    is_encoder_decoder=False,
    truncation_mode="keep_end",
    max_prompt_length=512,
    max_target_length=512,
)
processed_dataset = train_dataset.map(tokenize_row, fn_kwargs=tokenize_kwargs)

# Dump every column of one processed row for inspection.
for k, v in processed_dataset[1].items():
    print(k, '\n', v)
    print('\n')

chosen 
 Sure, I'd be happy to help you with that! Here are 25 engaging and nutritious activities specifically designed for children throughout the long, hot months of summer:

1. Swimming: Swimming is a great activity for kids during the summer months. It's a great way to cool off, get some exercise, and have fun. You can take your kids to a local pool, beach, or water park. Just make sure to pack plenty of sunscreen and water.
2. Hiking: Hiking is a great way to get some exercise and enjoy the outdoors. Look for local hiking trails that are suitable for children and take plenty of water and snacks.
3. Gardening: Gardening is a great way to get kids outside and teach them about nature. You can plant fruits and vegetables, or even start a small herb garden.
4. Cooking Classes: Cooking classes are a great way to teach kids about healthy eating habits and how to prepare nutritious meals. Look for local cooking classes or camps that cater to children.
5. Arts and Crafts: Arts and crafts a

## 3.4 DPO Data Collator

In [None]:
from torch.nn.utils.rnn import pad_sequence

@dataclass
class DPODataCollatorWithPadding:
    """Pad a list of tokenized DPO rows into one batch.

    Keys ending in `_input_ids`, `_attention_mask` or `_labels` are padded to
    the longest sequence in the batch and returned as LongTensors; keys ending
    in `_logps` are stacked into a tensor; every other key is returned as a
    plain list.

    In decoder-only mode prompts are padded on the left and everything else on
    the right. In encoder-decoder mode everything is padded on the right.
    """

    # id used to pad `*_input_ids`
    pad_token_id: int = 0
    # id used to pad `*_labels` (and chosen/rejected tensors in encoder-decoder mode)
    label_pad_token_id: int = -100
    # selects the encoder-decoder padding rules
    is_encoder_decoder: bool = False

    def __call__(self, features):
        """Collate `features` (a sequence of row dicts) into a padded batch dict."""

        # pad everything to the same length (max length)
        padded_batch = {}
        for k in features[0].keys():

            if k.endswith(("_input_ids", "_attention_mask", "_labels")):
                if self.is_encoder_decoder:
                    to_pad = [torch.LongTensor(ex[k]) for ex in features]

                    if (k.startswith("prompt")) and (k.endswith("input_ids")):
                        # use the collator's own pad id rather than a
                        # notebook-level `tokenizer` global
                        padding_value = self.pad_token_id
                    elif k.endswith("_attention_mask"):
                        padding_value = 0
                    elif (k.startswith("chosen")) or (k.startswith("rejected")) or ("decoder" in k):
                        padding_value = self.label_pad_token_id
                    else:
                        raise ValueError(f"Unexpected key in batch '{k}'")
                    padded_batch[k] = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)

                else:
                    left_pad = "prompt" in k
                    # `pad_sequence` pads on the right, so prompts are reversed
                    # before padding and flipped back afterwards
                    if left_pad:
                        to_pad = [torch.LongTensor(ex[k][::-1]) for ex in features]
                    else:
                        to_pad = [torch.LongTensor(ex[k]) for ex in features]

                    if k.endswith("_input_ids"):
                        padding_value = self.pad_token_id
                    elif k.endswith("_labels"):
                        padding_value = self.label_pad_token_id
                    elif k.endswith("_attention_mask"):
                        padding_value = 0
                    else:
                        raise ValueError(f"Unexpected key in batch '{k}'")

                    padded_batch[k] = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)
                    if left_pad:
                        padded_batch[k] = padded_batch[k].flip(dims=[1])

            elif k.endswith("_logps"):
                # the cached reference model logprobs
                padded_batch[k] = torch.tensor([ex[k] for ex in features])
            else:
                padded_batch[k] = [ex[k] for ex in features]

        return padded_batch

In [None]:
# This collator is handed to the DataLoader as `collate_fn` further down.
data_collator = DPODataCollatorWithPadding(
    is_encoder_decoder=False,
    label_pad_token_id=-100,
    pad_token_id=tokenizer.pad_token_id,
)

# Collate the whole processed dataset as a single batch and show entry 10 of each field.
test_set = data_collator(processed_dataset)
for k, v in test_set.items():
    print(k, '\n', v[10], '\n')

chosen 
 Sure, I'd be happy to help you with your questions about The Giver! Can you please provide more context or specify which aspects of the novel you would like to know more about? For instance, you might be interested in the plot, characters, themes, or historical context. I'll do my best to provide a helpful and informative response.<|endoftext|>
 

rejected 
 Of course! I'd be happy to help you with any questions you have about The Giver. Please go ahead and ask, and I'll do my best to provide accurate and helpful answers.<|endoftext|>
 

prompt 
 <|system|>
<|endoftext|>
<|user|>
I have some questions about The Giver.<|endoftext|>
<|assistant|>
 

chosen_input_ids 
 tensor([50256,    27,    91,  ..., 50256, 50256, 50256]) 

chosen_attention_mask 
 tensor([1, 1, 1,  ..., 0, 0, 0]) 

chosen_labels 
 tensor([-100, -100, -100,  ..., -100, -100, -100]) 

rejected_input_ids 
 tensor([50256,    27,    91, 10057,    91,    29,   198, 50256,   198,    27,
           91,  7220,    91,  

## 3.5 Model Loading

In [None]:
# Policy model: the one whose weights will be updated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

# Reference model: a full copy of the policy with every parameter frozen.
ref_model = deepcopy(model)
parameter_names = [n for n, _ in model.named_parameters()]
for param_name in parameter_names:
    param = ref_model.get_parameter(param_name)
    param.requires_grad = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# disable dropout
def disable_dropout_in_model(model: torch.nn.Module) -> None:
    """Set the drop probability of every `torch.nn.Dropout` submodule of `model` to 0, in place."""
    dropout_layers = (layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout))
    for layer in dropout_layers:
        layer.p = 0

# Zero the dropout probability in both the policy and the reference model.
disable_dropout_in_model(model)
disable_dropout_in_model(ref_model)

## 3.6 Data Loader

In [None]:
def pad_to_length(tensor, length, pad_value, dim=-1):
    """Right-pad `tensor` along `dim` with `pad_value` until it has size `length`.

    The input is returned unchanged when it is already at least `length` long
    along `dim`; the padding is created with the input's dtype and device.
    """
    missing = length - tensor.size(dim)
    if missing <= 0:
        return tensor

    filler_shape = list(tensor.shape)
    filler_shape[dim] = missing
    filler = pad_value * torch.ones(*filler_shape, dtype=tensor.dtype, device=tensor.device)
    return torch.cat([tensor, filler], dim=dim)

def get_batch_logps(
        logits, labels, average_log_prob=False, label_pad_token_id=-100, is_encoder_decoder=False):
    """Compute the log probabilities of the given labels under the given logits.

    Args:
        logits: tensor indexed as (batch, sequence, vocab).
        labels: tensor indexed as (batch, sequence); positions equal to
            `label_pad_token_id` are excluded. The tensor is never modified.
        average_log_prob: return the mean over counted tokens instead of the sum.
        label_pad_token_id: label value marking ignored positions.
        is_encoder_decoder: when False, labels are shifted left by one and the
            last logit position is dropped so position t predicts token t + 1.

    Returns:
        One value per batch row: the summed (or averaged) log-probability of
        the counted label tokens.

    Raises:
        ValueError: if, after the optional shift, the leading two dimensions of
            `logits` do not match the shape of `labels`.
    """
    if not is_encoder_decoder:
        # shift right, since input first token is bos
        # (m, seq_len)
        labels = labels[:, 1:]
        # (m, seq_len, vocab_size)
        logits = logits[:, :-1, :]

    # `gather` accepts an index shorter than its input, so a mismatch would
    # otherwise pass silently
    if logits.shape[:-1] != labels.shape:
        raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")

    loss_mask = labels != label_pad_token_id

    # Ignored positions need a valid index for `gather`; build it out of place
    # so the caller's labels are left untouched (the encoder-decoder path used
    # to overwrite them).
    gather_index = labels.masked_fill(~loss_mask, 0)

    # (m, seq_len)
    per_token_logps = torch.gather(
        logits.log_softmax(-1), dim=2, index=gather_index.unsqueeze(2)).squeeze(2)

    if average_log_prob:
        return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
    else:
        return (per_token_logps * loss_mask).sum(-1)

def concatenated_inputs(
    batch, is_encoder_decoder=False, label_pad_token_id=-100, padding_value=0, device=None):
    """Stack the chosen and rejected tensors of `batch` into `concatenated_*` tensors.

    Every tensor whose key starts with `chosen` is padded along its last
    dimension to the longer of the chosen / rejected sequence lengths and
    stored under the same key with `chosen` replaced by `concatenated`. The
    matching `rejected` tensor is padded the same way, appended along dim 0 and
    the result moved to `device`. In encoder-decoder mode the prompt ids and
    attention mask are additionally repeated twice along dim 0 and stored as
    the concatenated inputs.
    """
    length_key = "labels" if is_encoder_decoder else "input_ids"
    max_length = max(
        batch[f"chosen_{length_key}"].shape[1], batch[f"rejected_{length_key}"].shape[1])

    concatenated_batch = {}

    # First pass handles chosen_*, second pass appends rejected_* underneath.
    for side in ("chosen", "rejected"):
        for key, value in batch.items():
            if not (key.startswith(side) and isinstance(value, torch.Tensor)):
                continue

            if "labels" in key or is_encoder_decoder:
                pad_value = label_pad_token_id
            elif key.endswith("_input_ids"):
                pad_value = padding_value
            elif key.endswith("_attention_mask"):
                pad_value = 0

            target_key = key.replace(side, "concatenated")
            if side == "chosen":
                # (m, max_length)
                concatenated_batch[target_key] = pad_to_length(
                    value, max_length, pad_value=pad_value)
            else:
                # (2 * m, max_length)
                concatenated_batch[target_key] = torch.cat(
                    (
                        concatenated_batch[target_key],
                        pad_to_length(value, max_length, pad_value=pad_value),
                    ),
                    dim=0,
                ).to(device=device)

    if is_encoder_decoder:
        doubled_ids = batch["prompt_input_ids"].repeat(2, 1)
        concatenated_batch["concatenated_input_ids"] = doubled_ids.to(device=device)
        doubled_mask = batch["prompt_attention_mask"].repeat(2, 1)
        concatenated_batch["concatenated_attention_mask"] = doubled_mask.to(device=device)

    return concatenated_batch

def concatenated_forward(model, batch, padding_value, is_encoder_decoder=False):
    """
    Run `model` once over the chosen and rejected inputs stacked together.

    A single forward pass over the stacked batch replaces two separate passes
    (the original note says this is faster for FSDP).

    Returns a 4-tuple `(chosen_logps, rejected_logps, chosen_logits,
    rejected_logits)`; the log-probs are summed over label tokens per sequence.
    """
    # concatenated: input_ids, attention_mask, labels
    stacked = concatenated_inputs(
        batch,
        is_encoder_decoder=is_encoder_decoder,
        label_pad_token_id=-100,
        padding_value=padding_value,
        device=accelerator.device,
    )

    # number of chosen rows, i.e. where the rejected half starts
    num_chosen = batch["chosen_labels"].shape[0]

    extra_model_inputs = {}
    if is_encoder_decoder:
        extra_model_inputs["labels"] = stacked["concatenated_labels"]
        extra_model_inputs["decoder_input_ids"] = stacked.pop("concatenated_decoder_input_ids", None)

    # [2 * m, seq_len, vocab_size]
    model_output = model(
        stacked["concatenated_input_ids"],
        attention_mask=stacked["concatenated_attention_mask"],
        **extra_model_inputs,
    )
    all_logits = model_output.logits

    # (2 * m), summed token log-probs per sequence
    all_logps = get_batch_logps(
        all_logits,
        stacked["concatenated_labels"],
        average_log_prob=False,
        is_encoder_decoder=is_encoder_decoder,
        label_pad_token_id=-100,
    )

    # first half of the stacked batch is chosen, second half is rejected
    return (
        all_logps[:num_chosen],
        all_logps[num_chosen:],
        all_logits[:num_chosen],
        all_logits[num_chosen:],
    )

def compute_reference_log_probs(padded_batch, ref_model):
    """
    Return `(chosen_logps, rejected_logps)` of `ref_model` for one padded DPO batch.

    The forward pass runs with gradients disabled and uses the notebook-level
    `tokenizer.pad_token_id` as the input padding value.
    """
    with torch.no_grad():
        chosen_logps, rejected_logps, _chosen_logits, _rejected_logits = concatenated_forward(
            ref_model, padded_batch, tokenizer.pad_token_id)

    return chosen_logps, rejected_logps

In [None]:
def get_train_dataloader(dataset, dataloader_params, ref_model):
    """Precompute reference log-probs for `dataset` and return a DataLoader over the result.

    The reference model is run over the whole dataset once; the per-row chosen
    and rejected log-probs are added as the columns `reference_chosen_logps`
    and `reference_rejected_logps`, and a DataLoader built with the caller's
    `dataloader_params` is returned over the augmented dataset.

    Args:
        dataset: tokenized dataset supporting `len()` and `add_column`.
        dataloader_params: keyword arguments for `DataLoader`.
        ref_model: model passed to `compute_reference_log_probs`.

    Raises:
        ValueError: if the number of computed log-probs differs from the
            number of dataset rows.
    """
    # The log-probs are attached as columns, so the precompute pass has to
    # visit every row exactly once and in dataset order, whatever shuffling
    # or tail-dropping the caller wants for the returned loader.
    precompute_params = {**dataloader_params, "shuffle": False, "drop_last": False}
    precompute_loader = DataLoader(dataset, **precompute_params)

    reference_chosen_logps = []
    reference_rejected_logps = []

    for padded_batch in tqdm(iterable=precompute_loader, desc="Train dataset reference log probs"):
        # log softmax probability for chosen and rejected based on the label index
        reference_chosen_logp, reference_rejected_logp = compute_reference_log_probs(padded_batch, ref_model)
        reference_chosen_logps.append(reference_chosen_logp.cpu())
        reference_rejected_logps.append(reference_rejected_logp.cpu())

    all_reference_chosen_logps = torch.cat(reference_chosen_logps).float().numpy()
    all_reference_rejected_logps = torch.cat(reference_rejected_logps).float().numpy()

    if len(all_reference_chosen_logps) != len(dataset):
        raise ValueError(
            f"Computed {len(all_reference_chosen_logps)} reference log probs "
            f"for a dataset of {len(dataset)} rows.")

    dataset = dataset.add_column(
        name="reference_chosen_logps", column=all_reference_chosen_logps
    )
    dataset = dataset.add_column(
        name="reference_rejected_logps", column=all_reference_rejected_logps
    )

    return DataLoader(dataset, **dataloader_params)

dataloader_params = dict(
    batch_size=2,
    collate_fn=data_collator,
    num_workers=2,
    pin_memory=False,
    shuffle=False,
)

dataloader = get_train_dataloader(processed_dataset, dataloader_params, ref_model)

# Show how many entries each field of the first batch holds.
first_batch_sizes = {k: len(v) for k, v in next(iter(dataloader)).items()}
print(first_batch_sizes)

Train dataset reference log probs: 100%|██████████| 6/6 [00:08<00:00,  1.37s/it]


{'chosen': 2, 'rejected': 2, 'prompt': 2, 'chosen_input_ids': 2, 'chosen_attention_mask': 2, 'chosen_labels': 2, 'rejected_input_ids': 2, 'rejected_attention_mask': 2, 'rejected_labels': 2, 'prompt_input_ids': 2, 'prompt_attention_mask': 2, 'reference_chosen_logps': 2, 'reference_rejected_logps': 2}


## 3.7 DPO Loss Metric

In [None]:
def dpo_loss(
        policy_chosen_logps, policy_rejected_logps, reference_chosen_logps,
        reference_rejected_logps, loss_type='sigmoid', beta=0.1, label_smoothing=0,
        reference_free=False):
    """Compute the DPO loss for a batch of policy and reference model log probabilities.

    Args:
        policy_chosen_logps / policy_rejected_logps: per-example log-probs of
            the policy model for the chosen / rejected responses.
        reference_chosen_logps / reference_rejected_logps: the same quantities
            for the reference model.
        loss_type: one of `sigmoid`, `hinge`, `ipo`, `kto_pair`.
        beta: scale applied to the log-ratio difference and to the rewards.
        label_smoothing: weight of the flipped-label term in the sigmoid loss.
        reference_free: if True, the reference log-ratio is taken as 0 in the
            loss (the rewards still use the reference log-probs).

    Returns:
        `(losses, chosen_rewards, rejected_rewards)`, all on CPU. `losses` has
        one entry per example, except for `kto_pair` where the chosen and
        rejected halves are concatenated. The rewards are detached.

    Raises:
        ValueError: if `loss_type` is not one of the supported values.
    """
    # Bring every input to CPU first: policy log-probs may sit on the
    # accelerator while cached reference log-probs come from the CPU-side
    # collator, and mixing devices fails on the first subtraction.
    policy_chosen_logps = policy_chosen_logps.cpu()
    policy_rejected_logps = policy_rejected_logps.cpu()
    reference_chosen_logps = reference_chosen_logps.cpu()
    reference_rejected_logps = reference_rejected_logps.cpu()

    # higher the better, diff of chosen vs rejected
    pi_logratios = policy_chosen_logps - policy_rejected_logps

    # If True, ignore the _provided_ reference model
    # implicitly use a reference model that assigns equal probability to all responses
    if reference_free:
        ref_logratios = 0
    else:
        ref_logratios = reference_chosen_logps - reference_rejected_logps

    # (batch_size, ) diff between policy and reference, good if it's positive
    logits = pi_logratios - ref_logratios

    # beta = temperature to regularise DPO loss (normally 0.1 - 0.5)
    # label_smoothing encodes uncertainty about the labels and calculates a conservative loss.
    if loss_type == "sigmoid":
        losses = (-F.logsigmoid(beta * logits) * (1 - label_smoothing)) - \
            (F.logsigmoid(-beta * logits) * label_smoothing)
    elif loss_type == "hinge":
        losses = torch.relu(1 - beta * logits)
    elif loss_type == "ipo":
        losses = (logits - 1 / (2 * beta)) ** 2
    elif loss_type == "kto_pair":
        chosen_KL = (policy_chosen_logps - reference_chosen_logps).mean().clamp(min=0)
        rejected_KL = (policy_rejected_logps - reference_rejected_logps).mean().clamp(min=0)
        chosen_logratios = policy_chosen_logps - reference_chosen_logps
        rejected_logratios = policy_rejected_logps - reference_rejected_logps
        # As described in the KTO report, the KL term for chosen (rejected)
        # is estimated using the rejected (chosen) half.
        losses = torch.cat(
            (
                1 - F.sigmoid(beta * (chosen_logratios - rejected_KL)),
                1 - F.sigmoid(beta * (chosen_KL - rejected_logratios)),
            ),
        )
    else:
        # previously fell through and failed later with an unbound `losses`
        raise ValueError(
            f"Unknown loss type: {loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair']")

    # chosen_rewards targeted to be high and while rejected_reward targeted to be low
    chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach()
    rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach()

    return losses, chosen_rewards, rejected_rewards

In [None]:
def get_batch_loss_metrics(model, ref_model, batch, train_eval="train"):
    """Compute the mean DPO loss and logging metrics for one batch.

    The policy forward pass follows the caller's gradient mode, so the
    returned loss can be back-propagated during training; wrap the call in
    `torch.no_grad()` for evaluation or inspection. Reference log-probs are
    read from `batch["reference_chosen_logps"]` /
    `batch["reference_rejected_logps"]` when both are present, otherwise they
    are computed from `ref_model` without gradients.

    Args:
        model: the policy model being optimised.
        ref_model: reference model, only used when the batch carries no
            cached reference log-probs.
        batch: padded batch as produced by the DPO collator.
        train_eval: `"eval"` prefixes every metric name with `eval_`.

    Returns:
        `(loss, metrics)` where `loss` is the mean of the per-example losses
        and `metrics` maps metric names to scalar CPU tensors.
    """
    metrics = {}

    # the original model to be updated (also called as policy model)
    # (m), log_softmax score for each sentence (summing all index tokens prob in labels)
    # NOTE: no `torch.no_grad()` here; it would cut the loss off from the
    # policy parameters and make training impossible.
    chosen_logps, rejected_logps, chosen_logits, rejected_logits = concatenated_forward(
        model, batch, tokenizer.pad_token_id)

    # use reference_chosen_logps and reference_rejected_logps if exists
    if "reference_chosen_logps" in batch and "reference_rejected_logps" in batch:
        ref_chosen_logps = batch["reference_chosen_logps"]
        ref_rejected_logps = batch["reference_rejected_logps"]
    else:
        with torch.no_grad():
            # `padding_value` is a required argument of concatenated_forward
            ref_chosen_logps, ref_rejected_logps, _, _ = concatenated_forward(
                ref_model, batch, tokenizer.pad_token_id)

    losses, chosen_rewards, rejected_rewards = dpo_loss(
        chosen_logps, rejected_logps, ref_chosen_logps, ref_rejected_logps)
    reward_accuracies = (chosen_rewards > rejected_rewards).float()

    prefix = "eval_" if train_eval == "eval" else ""
    metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().cpu()
    metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean().cpu()
    metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean().cpu()
    metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean().cpu()
    metrics[f"{prefix}logps/rejected"] = rejected_logps.detach().mean().cpu()
    metrics[f"{prefix}logps/chosen"] = chosen_logps.detach().mean().cpu()
    metrics[f"{prefix}logits/rejected"] = rejected_logits.detach().mean().cpu()
    metrics[f"{prefix}logits/chosen"] = chosen_logits.detach().mean().cpu()
    return losses.mean(), metrics

# Smoke test on the first batch. Only the resulting values are inspected
# here, so gradient tracking is switched off for the call.
first_batch = next(iter(dataloader))
with torch.no_grad():
    loss, metrics = get_batch_loss_metrics(model, ref_model, first_batch, train_eval="train")
print("Loss: ", loss)
print(metrics)

tensor([0., 0.]) tensor([0.6931, 0.6931]) tensor([0.6931, 0.6931])
Loss:  tensor(0.6931)
{'rewards/chosen': tensor(0.), 'rewards/rejected': tensor(0.), 'rewards/accuracies': tensor(0.), 'rewards/margins': tensor(0.), 'logps/rejected': tensor(-887.3922), 'logps/chosen': tensor(-972.4705), 'logits/rejected': tensor(0.3401), 'logits/chosen': tensor(0.2800)}


## 3.8 Evaluation

In [None]:
def prediction_step(model, inputs, prediction_loss_only=False, ignore_keys=None, reference_model=None):
    """Evaluate one batch and return `(loss, logits, labels)`.

    Args:
        model: the policy model to evaluate.
        inputs: padded batch passed on to `get_batch_loss_metrics`.
        prediction_loss_only: if True, return `(loss, None, None)`.
        ignore_keys: metric names to leave out of the returned logits;
            defaults to `model.config.keys_to_ignore_at_inference` when the
            model has a config, otherwise to an empty list.
        reference_model: reference model forwarded to
            `get_batch_loss_metrics`; when omitted, a notebook-level
            `ref_model` is used if one is defined.

    Returns:
        `(loss, logits, labels)`: the detached loss, a tensor holding the mean
        chosen / rejected logits reported by the metrics, and a zero tensor
        with one entry per returned logit.
    """
    if ignore_keys is None:
        if hasattr(model, "config"):
            ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
        else:
            ignore_keys = []

    if reference_model is None:
        # fall back to the notebook-level frozen reference model, if any
        reference_model = globals().get("ref_model")

    with torch.no_grad():
        # get_batch_loss_metrics takes (model, ref_model, batch, ...); the
        # reference model used to be left out, which shifted `inputs` into
        # its slot and made the call fail.
        loss, metrics = get_batch_loss_metrics(model, reference_model, inputs, train_eval="eval")

    if prediction_loss_only:
        return (loss.detach(), None, None)

    # logits for the chosen and rejected samples from model
    logits_dict = {
        "eval_logits/chosen": metrics["eval_logits/chosen"],
        "eval_logits/rejected": metrics["eval_logits/rejected"],
    }
    logits = tuple(v.unsqueeze(dim=0) for k, v in logits_dict.items() if k not in ignore_keys)
    logits = torch.stack(logits).mean(axis=1)
    labels = torch.zeros(logits.shape[0])
    return (loss.detach(), logits, labels)