<a href="https://colab.research.google.com/github/GEMCorp/SmolLM2-135M-DPO/blob/main/DPO_demo_m_rewardbench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Direct Preference Optimization

## Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import transformers
transformers.logging.set_verbosity_error()

In [None]:
import torch
import pandas as pd
import tqdm
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, Dataset

In [None]:
# from trl import DPOTrainer, DPOConfig
# later try veRL or Nemo RL or open RLHF

ModuleNotFoundError: No module named 'trl'

In [None]:
%pip install trl

Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.21.0-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.21.0


In [None]:
from trl import DPOTrainer, DPOConfig
# later try veRL or Nemo RL or open RLHF

## Setting up helper functions

In [None]:
def generate_responses(model, tokenizer, user_message, system_message=None,
                       max_new_tokens=100):
    """Generate a single greedy reply to a one-turn chat prompt.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        user_message: Text of the (only) user turn.
        system_message: Optional system turn placed before the user turn;
            skipped when falsy.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    # Single-turn conversation: optional system turn, then the user turn.
    system_turns = (
        [{"role": "system", "content": system_message}] if system_message else []
    )
    conversation = system_turns + [{"role": "user", "content": user_message}]

    # Render the conversation in a format similar to the one the LLM was trained with.
    chat_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # turn off thinking mode
    )

    # Tokenize and move the tensors to wherever the model lives (CPU or GPU).
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # Recommended: later try vllm, sglang or TensorRT - faster inference
    # libraries than hf's model.generate().
    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens of the first (and only) sequence.
    completion_ids = sequences[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
def test_model_with_questions(model, tokenizer, questions,
                              system_message=None, title="Model Output"):
    """Print the model's reply to each question under a titled banner.

    Args:
        model: Model forwarded to ``generate_responses``.
        tokenizer: Tokenizer forwarded to ``generate_responses``.
        questions: Iterable of user messages, answered one at a time.
        system_message: Optional system message used for every question.
        title: Heading printed before the question/answer pairs.
    """
    print(f"\n=== {title} ===")
    # Number the pairs from 1 so the printed output reads naturally.
    for idx, prompt in enumerate(questions, start=1):
        reply = generate_responses(model, tokenizer, prompt, system_message)
        print(f"\nModel Input {idx}:\n{prompt}\nModel Output {idx}:\n{reply}\n")


In [None]:
def load_model_and_tokenizer(model_name, use_gpu = False):
    """Load a causal language model and its tokenizer for chat-style generation.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        use_gpu: When True, move the model to CUDA if a CUDA device is
            available. If none is available the model is left where
            ``from_pretrained`` placed it and a notice is printed instead of
            raising.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token.
    """
    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    if use_gpu:
        if torch.cuda.is_available():
            model.to("cuda")
        else:
            # Degrade gracefully so the notebook still runs on CPU-only machines.
            print("use_gpu=True but no CUDA device is available; keeping the model on CPU.")

    # generate_responses relies on a chat template, so provide a minimal one
    # when the tokenizer ships without it. The trailing block honours
    # `add_generation_prompt` (which generate_responses always sets) by ending
    # the prompt with the assistant cue.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}"""

    # Tokenizer config: fall back to the EOS token when no pad token is defined.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

## Load Instruct Model & Test on Simple Questions

In [None]:
# Use the GPU only when one is actually present, so the notebook also runs
# unchanged on CPU-only machines.
USE_GPU = torch.cuda.is_available()

# Identity probes asked before and after training.
questions = [
    "What is your name?",
    "Are you AyaExpanse?",
    "Tell me about your name and organization."
]

In [None]:
# Load the larger instruct model under its own names: this cell deletes what it
# loads, so binding `model`/`tokenizer` here would remove the training model
# whenever the cell is re-run after the small model has been loaded.
instruct_model, instruct_tokenizer = load_model_and_tokenizer(
    "Qwen/Qwen2.5-0.5B-Instruct", USE_GPU
)

test_model_with_questions(instruct_model, instruct_tokenizer, questions,
                          title="Instruct Model Output")

# Drop the references so the baseline model can be reclaimed before training.
del instruct_model, instruct_tokenizer

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


=== Instruct Model Output ===

Model Input 1:
What is your name?
Model Output 1:
I am Qwen, a large language model created by Alibaba Cloud. My name is simply "Qwen".


Model Input 2:
Are you AyaExpanse?
Model Output 2:
I am not AyaExpanse. I am Qwen, an artificial intelligence developed by Alibaba Cloud. My name is Qwen, and I'm here to assist you with any questions or tasks you might have. How can I help you today?


Model Input 3:
Tell me about your name and organization.
Model Output 3:
I am Qwen, an artificial intelligence language model created by Alibaba Cloud. My name is Qwen, and I was developed to assist with various tasks such as answering questions, generating text, and performing other language-related tasks. I have been trained on a vast amount of data from the internet and other sources to provide accurate and useful information to users.



## Load the small model for training

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Note:</b> This notebook performs DPO on a small model, <code>HuggingFaceTB/SmolLM2-135M-Instruct</code>, with a small training dataset to ensure the full training process can run on limited computational resources. If you're running the notebook on your own machine and have access to a GPU, feel free to switch to a larger model—such as <code>Qwen/Qwen2.5-0.5B-Instruct</code>—to perform full DPO and reproduce the results.</p>
</div>

In [None]:
# Small model that will be fine-tuned with DPO below.
model, tokenizer = load_model_and_tokenizer(
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    use_gpu=USE_GPU,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
# Parameters for changing the model's identity: every LLM already ships with
# its own name, developer and system prompt.
# NOTE(review): POS_NAME and ORG_NAME are not referenced by any cell visible
# in this notebook; only SYSTEM_PROMPT is used (when building the DPO pairs).
POS_NAME = "Aya Expanse"
ORG_NAME = "Cohere"
SYSTEM_PROMPT = "You're a helpful assistant."

## DPO Training

### Task
Load about 200 samples from the M-rewardbench dataset, display how it looks, make the necessary changes to prepare it for DPO training, and then perform DPO training to make a base LLM multilingual.

### Load m-rewardbench dataset

In [None]:
# raw_multilingual_ds = load_dataset("CohereLabsCommunity/multilingual-reward-bench", split="train").select(range(200))

README.md: 0.00B [00:00, ?B/s]

ValueError: Config name is missing.
Please pick one among the available configs: ['arb_Arab', 'ces_Latn', 'deu_Latn', 'ell_Grek', 'fra_Latn', 'heb_Hebr', 'hin_Deva', 'ind_Latn', 'ita_Latn', 'jpn_Jpan', 'kor_Hang', 'nld_Latn', 'pes_Arab', 'pol_Latn', 'por_Latn', 'ron_Latn', 'rus_Cyrl', 'spa_Latn', 'translation', 'tur_Latn', 'ukr_Cyrl', 'vie_Latn', 'zho_Hans', 'zho_Hant']
Example of usage:
	`load_dataset('CohereLabsCommunity/multilingual-reward-bench', 'arb_Arab')`

In [None]:
# raw_multilingual_ds = load_dataset("CohereLabsCommunity/multilingual-reward-bench", name="translation", split="train").select(range(200))

translation/test-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/800 [00:00<?, ? examples/s]

ValueError: Unknown split "train". Should be one of ['test'].

In [None]:
# The "translation" config only provides a "test" split; keep its first 200 rows.
raw_multilingual_ds = load_dataset(
    "CohereLabsCommunity/multilingual-reward-bench",
    name="translation",
    split="test",
).select(range(200))

README.md: 0.00B [00:00, ?B/s]

translation/test-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/800 [00:00<?, ? examples/s]

Display the first few samples of the loaded dataset to understand its structure and content


In [None]:
# Preview the first five raw rows as a DataFrame.
sample_df = (
    raw_multilingual_ds
    .select(range(5))
    .to_pandas()
)
display(sample_df)

Unnamed: 0,id,source,prompt,chosen,rejected,chosen_score,rejected_score,chosen_id,rejected_id,chosen_system,rejected_system,pref_diff,subset
0,0,So what does the future have in store for Thun...,"In terms of German , what do the next sentence...",Was hält die Zukunft also für Thunberg bereit?...,Also was hat die Zukunft für Thunberg in Sache...,5.8,4.3,12260,13360,reference,sample1,1.5,translation-en-de-easy
1,1,"However, as Sky News revealed last month, the ...",Translate the following text from English to G...,Sky News hatte jedoch im vergangenen Monat ang...,"Sky News enthüllte jedoch vor Monaten, dass di...",5.2,4.3,15475,13275,sample3,sample1,0.9,translation-en-de-easy
2,2,The census also showed a stagnating growth of ...,Transpose the next sentence from the English f...,Die Volkszählung zeigte auch einen rückläufige...,Die Statistik zeigte auch ein stagnierendes Wa...,5.1,3.8,11392,12492,beam,reference,1.3,translation-en-de-easy
3,3,"The wealthy north-east region, which is home t...",Turn the following sentence from their English...,"Die wohlhabende nordöstliche Region, Heimat vo...","Der reiche Nordosten Spaniens, in dem Barcelon...",5.3,3.6,12403,14603,reference,sample2,1.7,translation-en-de-easy
4,4,"Throughout the book, Spann Cooper uses anecdot...",Please offer the German rendition for the foll...,Im gesamten Buch verwendet Spann Cooper anekdo...,Im gesamten Buch verwendete Spann Cooper Anekd...,5.3,4.4,12557,13657,reference,sample1,0.9,translation-en-de-easy


Prepare the data for DPO training by formatting each example into a 'chosen' and a 'rejected' conversation, using the same chat-message format (system, user, assistant) that the model's chat template expects.



In [None]:
def build_multilingual_dpo_chatml(example, system_prompt=None):
    """Turn one raw preference row into chat-format 'chosen'/'rejected' conversations.

    Args:
        example: Mapping with the keys ``"prompt"``, ``"chosen"`` and
            ``"rejected"`` (plain-text prompt and the two candidate replies).
        system_prompt: System message placed first in both conversations.
            Defaults to the notebook-level ``SYSTEM_PROMPT`` when None, so the
            function can still be passed directly to ``Dataset.map``.

    Returns:
        ``{"chosen": [...], "rejected": [...]}`` where each value is a
        three-message conversation (system, user, assistant) that differs only
        in the assistant reply.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    def _conversation(reply):
        # Build fresh message dicts per call so the two conversations never
        # share mutable objects.
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["prompt"]},
            {"role": "assistant", "content": reply},
        ]

    return {
        "chosen": _conversation(example["chosen"]),
        "rejected": _conversation(example["rejected"]),
    }

# Convert every raw row into chat-format pairs; the original columns are
# dropped so only the new "chosen" and "rejected" columns remain.
multilingual_dpo_ds = raw_multilingual_ds.map(
    build_multilingual_dpo_chatml,
    remove_columns=raw_multilingual_ds.column_names,
)

# Preview the first five processed rows.
sample_df = (
    multilingual_dpo_ds
    .select(range(5))
    .to_pandas()
)
display(sample_df)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Unnamed: 0,chosen,rejected
0,"[{'content': 'You're a helpful assistant.', 'r...","[{'content': 'You're a helpful assistant.', 'r..."
1,"[{'content': 'You're a helpful assistant.', 'r...","[{'content': 'You're a helpful assistant.', 'r..."
2,"[{'content': 'You're a helpful assistant.', 'r...","[{'content': 'You're a helpful assistant.', 'r..."
3,"[{'content': 'You're a helpful assistant.', 'r...","[{'content': 'You're a helpful assistant.', 'r..."
4,"[{'content': 'You're a helpful assistant.', 'r...","[{'content': 'You're a helpful assistant.', 'r..."


In [None]:
# Hold out 30% of the pairs for evaluation. The split is seeded so it is
# reproducible, and it is stored under its own name so `multilingual_dpo_ds`
# is not overwritten and this cell can safely be re-run.
multilingual_dpo_splits = multilingual_dpo_ds.train_test_split(test_size=0.3, seed=42)
multilingual_dpo_train_ds = multilingual_dpo_splits["train"]
multilingual_dpo_eval_ds = multilingual_dpo_splits["test"]

### Configure and run DPO training

In [None]:
# Hyperparameters for the DPO run
config = DPOConfig(
    # --- optimisation ---
    learning_rate=5e-5,  # later tune this too
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 x 8 = 8 preference pairs per optimizer step (per device)
    # --- DPO-specific ---
    beta=0.1,  # beta parameter, often tuned, tune later
    # --- logging ---
    logging_steps=2,
    # bf16=False, # set to False for no GPU, Comment out when using gpu. Default val in DPOConfig = True
    # fp16=False, # set to False for no GPU
    # output_dir="/tmp/dpo_trainer_output", # Directory to save logs and checkpoints
)

In [None]:
# Build the trainer around the loaded small model and the training split.
# No explicit reference model is passed (ref_model=None); per the original
# note, a frozen-weight copy of `model` then serves as the reference.
dpo_trainer = DPOTrainer(
    model=model,
    args=config,
    train_dataset=multilingual_dpo_train_ds,
    processing_class=tokenizer,
    ref_model=None,
)

# Run training; the returned TrainOutput is shown as the cell result.
dpo_trainer.train()

Extracting prompt in train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgemcorp[0m ([33mdipperlab-knust[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'loss': 0.7047, 'grad_norm': 24.44251251220703, 'learning_rate': 4.722222222222222e-05, 'rewards/chosen': 0.028979159891605377, 'rewards/rejected': 0.047301486134529114, 'rewards/accuracies': 0.25, 'rewards/margins': -0.018322326242923737, 'logps/chosen': -269.25225830078125, 'logps/rejected': -263.1520080566406, 'logits/chosen': 3.1718943119049072, 'logits/rejected': 3.543715000152588, 'epoch': 0.11428571428571428}
{'loss': 0.7183, 'grad_norm': 20.966768264770508, 'learning_rate': 4.166666666666667e-05, 'rewards/chosen': 0.13243508338928223, 'rewards/rejected': 0.17810870707035065, 'rewards/accuracies': 0.25, 'rewards/margins': -0.045673616230487823, 'logps/chosen': -256.8887939453125, 'logps/rejected': -230.95404052734375, 'logits/chosen': 2.9054839611053467, 'logits/rejected': 3.011937141418457, 'epoch': 0.22857142857142856}
{'loss': 0.7073, 'grad_norm': 18.528850555419922, 'learning_rate': 3.611111111111111e-05, 'rewards/chosen': 0.031138228252530098, 'rewards/rejected': 0.0512796

TrainOutput(global_step=18, training_loss=0.7213724454243978, metrics={'train_runtime': 63.1868, 'train_samples_per_second': 2.216, 'train_steps_per_second': 0.285, 'train_loss': 0.7213724454243978, 'epoch': 1.0})

## Evaluation

In [None]:
# Ask the same identity questions as before training, now with the DPO-trained model.
test_model_with_questions(
    dpo_trainer.model,
    tokenizer,
    questions,
    title="Post-trained Model's Output after DPO",
)


=== Post-trained Model's Output after DPO ===

Model Input 1:
What is your name?
Model Output 1:
My name is Kaelin "Kael" Thompson, and I am a skilled and dedicated AI assistant. I am here to help you with any questions or problems you may have regarding your personal or professional life. Whether you are looking for advice on a particular issue or simply seeking assistance with a specific task, I am here to provide guidance and support.


Model Input 2:
Are you AyaExpanse?
Model Output 2:
I'm a helpful AI assistant named SmolLM, trained by Hugging Face. I can assist with a wide range of topics, including travel, language, and more.


Model Input 3:
Tell me about your name and organization.
Model Output 3:
My name is Ethan Thompson, and I am a professional AI assistant specializing in data analysis and interpretation. I have been trained on a vast dataset of data from various sources, including social media, customer interactions, and financial transactions. My goal is to provide insi

In [None]:
# Each "chosen" conversation is (system, user, assistant), so index 1 holds the
# user prompt; collect those prompts from the held-out split.
eval_questions = [
    held_out_row["chosen"][1]["content"]
    for held_out_row in multilingual_dpo_eval_ds
]

test_model_with_questions(
    dpo_trainer.model,
    tokenizer,
    eval_questions,
    title="Post-trained Model's Output on Multilingual Evaluation Set",
)


=== Post-trained Model's Output on Multilingual Evaluation Set ===

Model Input 1:
What is the meaning of these sentence when translated to German ?

Under cross-examination Biggs said the study did not follow the mental health outcomes of women in states with waiting periods who changed their minds about having an abortion, although there were only three women like that identified in the study. She rebutted testimony from earlier in the week by Priscilla Coleman, a psychology professor at Bowling Green State University in Ohio, who said that abortion is associated with mental health problems.
Model Output 1:
Inhalt:

Das ist eine gute Bedeutung, dass die Schritte im Englischen zu verwendet werden.

Inhalt:

Das ist eine gute Bedeutung, dass die Schritte im Englischen zu verwendet werden.

Inhalt:

Das ist eine gute Bedeutung, dass die Schritte


Model Input 2:
Reinterpret the ensuing text from English to German language.

Uganda: Government Launches Oral HIV Self-Test Kit
Model Outpu

## Push to Hugging Face Hub.

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub (renders a login widget in the notebook);
# done here ahead of the push at the end of this section.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Intended Hub repository ("<namespace>/<name>") for the DPO-trained model.
repo_id = "GEMCorp/DPO_demo_m_rewardbench"
print(f"Repository ID set to: {repo_id}")

Repository ID set to: GEMCorp/DPO_demo_m_rewardbench


In [None]:
# Save the trained model and the tokenizer into the same local directory.
output_dir = "/tmp/dpo_trained_model"
for artifact in (dpo_trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to /tmp/dpo_trained_model


In [None]:
# Push the trained weights and the tokenizer to the repository named in
# `repo_id`. A bare `dpo_trainer.push_to_hub()` never reads `repo_id`; the
# recorded run shows it uploading to "GEMCorp/trainer_output" (named after the
# trainer's output directory) rather than the intended repository.
dpo_trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...nt/trainer_output/model.safetensors:   0%|          | 12.0kB /  538MB            

  ...vents.1756457179.71a335c6323a.317.0:  97%|#########7| 12.3kB / 12.6kB            

  ...nt/trainer_output/training_args.bin:  97%|#########7| 6.55kB / 6.74kB            

CommitInfo(commit_url='https://huggingface.co/GEMCorp/trainer_output/commit/b2cc1de44ab06ae2ff8b64ca9803a9f0946dfade', commit_message='End of training', commit_description='', oid='b2cc1de44ab06ae2ff8b64ca9803a9f0946dfade', pr_url=None, repo_url=RepoUrl('https://huggingface.co/GEMCorp/trainer_output', endpoint='https://huggingface.co', repo_type='model', repo_id='GEMCorp/trainer_output'), pr_revision=None, pr_num=None)