# Overview

ORPO is a new exciting fine-tuning technique that combines the traditional supervised fine-tuning and preference alignment stages into a single process. This reduces the computational resources and time required for training. Moreover, empirical results demonstrate that ORPO outperforms other alignment methods on various model sizes and benchmarks.

We will fine-tune the Llama 3 8B model using ORPO with the TRL library.

In [1]:
!pip install -U -q transformers==4.38.2
!pip install -U -q accelerate==0.27.2
!pip install -U -q datasets==2.18.0
!pip install -U -q peft==0.9.0
!pip install -U -q bitsandbytes==0.42.0
!pip install -U -q trl==0.8.6

### Note: If your environment supports Flash Attention, be sure to install it.

In [2]:
import torch

if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [3]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Credentials come from the Kaggle secret store, so no token is hard-coded here.
user_secrets = UserSecretsClient()

# Authenticate against the Hugging Face Hub.
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Run configuration is kept in environment variables and read back with
# os.getenv() by the cells below.
os.environ.update(
    {
        "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
        "WANDB_PROJECT": "Fine-tuning Llama 3 8B",
        "WANDB_NAME": "ft-Llama3-8b-orpo",
        "MODEL_NAME": "meta-llama/Meta-Llama-3-8B",
        "DATASET": "mlabonne/orpo-dpo-mix-40k",
    }
)

# Disable the memory-efficient SDP kernel, see
# https://github.com/huggingface/transformers/issues/28731
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cudnn.deterministic = True

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Quantization with QLoRA

In [4]:
from transformers import BitsAndBytesConfig
from peft import LoraConfig

# 4-bit NF4 quantization with double quantization; matrix multiplications run
# in the dtype selected earlier for this GPU (`torch_dtype`).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
)

# LoRA adapters on the attention and MLP projection layers listed in
# `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'up_proj',
        'down_proj',
        'gate_proj',
        'k_proj',
        'q_proj',
        'v_proj',
        'o_proj',
    ],
)

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer of the base model named by the MODEL_NAME env variable.
tokenizer = AutoTokenizer.from_pretrained(
    os.getenv("MODEL_NAME"),
)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from transformers import AutoModelForCausalLM

# Load the base model quantized to 4 bits with the bitsandbytes config above.
# `torch_dtype` is passed explicitly so the non-quantized weights use the same
# dtype as the bnb compute dtype; FlashAttention-2 only supports fp16/bf16.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map='auto',
    attn_implementation=attn_implementation,
)

# Display where the model was placed.
model.device

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

device(type='cuda', index=0)

# Set the chat format and prepare the model for k-bit training

In [7]:
from trl import setup_chat_format
from peft import prepare_model_for_kbit_training

# Apply TRL's chat format to both the model and the tokenizer.
model, tokenizer = setup_chat_format(model, tokenizer)

# Prepare the quantized model for k-bit (QLoRA) training.
model = prepare_model_for_kbit_training(model)

In [8]:
# Sanity check: display the model's device after the chat-format / k-bit preparation step.
model.device

device(type='cuda', index=0)

# Loading Dataset

In [9]:
from datasets import load_dataset

# Only the first 200 training examples are loaded here.
# Note: if you have enough computing resources, please consider using all of
# the data (or a larger shuffled sample) for your training, e.g.:
# ds = load_dataset(os.getenv('DATASET'), split='all')
# ds = ds.shuffle(seed=42).select(range(1000))
ds = load_dataset(
    os.getenv("DATASET"),
    split="train[:200]",
)
ds

Downloading readme:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 115M/115M [00:00<00:00, 132MB/s]  


Generating train split:   0%|          | 0/44245 [00:00<?, ? examples/s]

Dataset({
    features: ['source', 'chosen', 'rejected', 'prompt'],
    num_rows: 200
})

In [10]:
def format_chat_template(row):
    """Render the `chosen` and `rejected` conversations of one example as text.

    Both fields are passed through the tokenizer's chat template with
    `tokenize=False`, so each field is replaced by the formatted string rather
    than token ids. The (mutated) row is returned for use with `Dataset.map`.
    """
    for field in ("chosen", "rejected"):
        row[field] = tokenizer.apply_chat_template(row[field], tokenize=False)
    return row


# Shuffle with a fixed seed, then format every example using all available CPU cores.
ds = ds.shuffle(seed=42).map(format_chat_template, num_proc=os.cpu_count())

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
# Hold out 1% of the examples for evaluation. The split is seeded so that the
# same train/test partition is produced on every run.
ds = ds.train_test_split(test_size=0.01, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 198
    })
    test: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 2
    })
})

# Fine-tuning

We need to set a few hyperparameters for the ORPO configuration.

### learning_rate

ORPO uses very low learning rates compared to traditional SFT or even DPO. This value of 8e-6 comes from the original paper; it roughly corresponds to an SFT learning rate of 1e-5 and a DPO learning rate of 5e-6.

### beta

It is the $\lambda$ parameter in the paper, with a default value of 0.1.

### max_length, batch_size

Other parameters, like `max_length` and the batch size, are set to use as much VRAM as available (~20 GB).

In [13]:
from trl import ORPOConfig, ORPOTrainer

# Reference for the available options:
# https://github.com/huggingface/trl/blob/v0.8.6/trl/trainer/orpo_config.py
orpo_args = ORPOConfig(
    # ORPO-specific settings
    beta=0.1,
    max_length=1024,
    max_prompt_length=512,
    # Optimisation
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_steps=10,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    # Batching: 2 samples per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Evaluation and logging; a float eval_steps below 1 is a fraction of the
    # total number of training steps
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    report_to="wandb",
    # The W&B run name doubles as the output directory
    run_name=os.getenv('WANDB_NAME'),
    output_dir=os.getenv('WANDB_NAME'),
)

# Build the ORPO trainer from the prepared model, the LoRA config and the
# train/test splits created above.
trainer = ORPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=orpo_args,
    peft_config=peft_config,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
)

# Run the fine-tuning.
trainer.train()

NameError: name 'dataset' is not defined

In [None]:
# Metadata forwarded to `trainer.push_to_hub`.
# Optional keys that are currently left unset: 'tasks', 'dataset_tags'.
kwargs = dict(
    model_name=os.getenv("WANDB_NAME"),
    finetuned_from=os.getenv('MODEL_NAME'),
    dataset=os.getenv("DATASET"),
)

# Upload the tokenizer first, then the trained model, to the Hub.
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(**kwargs)

# Merge and push merged model

In [None]:
import gc

# Drop the references to the trainer and the quantized model, run the garbage
# collector, then release PyTorch's cached CUDA memory before reloading the
# base model.
del trainer
del model
gc.collect()

torch.cuda.empty_cache()

In [None]:
# Reload the original tokenizer of the base model; the chat format is applied
# to it again in the merge cell below.
tokenizer = AutoTokenizer.from_pretrained(
    os.getenv("MODEL_NAME"),
)

In [None]:
# Reload the base model in float16 (no quantization config) so the LoRA adapter
# can be merged into its weights.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# `device` is a property, not a method: calling it raises a TypeError.
model.device

In [None]:
# PeftModel is not imported anywhere else in the notebook, so import it here.
from peft import PeftModel

# Apply the same chat format as during training before loading the adapter.
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(model, os.getenv("WANDB_NAME"))
model = model.merge_and_unload()

In [None]:
# Optional: uncomment the two lines below to upload the merged model and its
# tokenizer to the Hub under the WANDB_NAME repository name.
# model.push_to_hub(os.getenv("WANDB_NAME"), use_temp_dir=False)
# tokenizer.push_to_hub(os.getenv("WANDB_NAME"), use_temp_dir=False)