<a href="https://www.kaggle.com/code/aisuko/fine-tune-llama3-with-orpo?scriptVersionId=185163732" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Overview

ORPO is an exciting new fine-tuning technique that combines the traditional supervised fine-tuning and preference alignment stages into a single process. This reduces the computational resources and time required for training. Moreover, empirical results demonstrate that ORPO outperforms other alignment methods across various model sizes and benchmarks.

We will fine-tune the Llama 3 8B model using ORPO with the TRL library.

In [1]:
!pip install -U -q transformers==4.39.3
!pip install -U -q accelerate==0.28.0
!pip install -U -q datasets==2.18.0
!pip install -U -q peft==0.10.0
!pip install -U -q bitsandbytes==0.43.1
!pip install -U -q trl==0.8.6

### Note: If your environment supports flash attention, be sure to install it.

In [2]:
import torch

if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [3]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Credentials are read from Kaggle secrets instead of being hard-coded here.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Weights & Biases settings plus the model/dataset identifiers that later
# cells read back through os.getenv().
os.environ.update(
    {
        "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
        "WANDB_PROJECT": "Fine-tuning Llama 3 8B",
        "WANDB_NAME": "ft-Llama3-8b-orpo",
        "MODEL_NAME": "meta-llama/Meta-Llama-3-8B",
        "DATASET": "mlabonne/orpo-dpo-mix-40k",
    }
)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True
# Disable the memory-efficient SDP kernel; see
# https://github.com/huggingface/transformers/issues/28731
torch.backends.cuda.enable_mem_efficient_sdp(False)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Print a per-dtype memory estimate for the model named by the MODEL_NAME
# environment variable (set in the configuration cell above).
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `meta-llama/Meta-Llama-3-8B` from `transformers`...
┌──────────────────────────────────────────────────────┐
│Memory Usage for loading `meta-llama/Meta-Llama-3-8B` │
├───────┬─────────────┬──────────┬─────────────────────┤
│ dtype │Largest Layer│Total Size│ Training using Adam │
├───────┼─────────────┼──────────┼─────────────────────┤
│float32│   1.96 GB   │ 28.21 GB │      112.83 GB      │
│float16│  1002.0 MB  │ 14.1 GB  │       56.42 GB      │
│  int8 │   501.0 MB  │ 7.05 GB  │       28.21 GB      │
│  int4 │   250.5 MB  │ 3.53 GB  │       14.1 GB       │
└───────┴─────────────┴──────────┴─────────────────────┘


# Quantization with QLoRA

In [5]:
from transformers import BitsAndBytesConfig
from peft import LoraConfig

# 4-bit NF4 quantization with double quantization enabled. The compute dtype
# follows the `torch_dtype` picked in the hardware-detection cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    # llm_int8_enable_fp32_cpu_offload=True
)

# LoRA adapter settings for a causal LM, applied to the listed projection
# modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'up_proj',
        'down_proj',
        'gate_proj',
        'k_proj',
        'q_proj',
        'v_proj',
        'o_proj',
    ],
)

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the base checkpoint named by the MODEL_NAME environment variable.
base_model_name = os.getenv('MODEL_NAME')
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
from transformers import AutoModelForCausalLM

# Keyword arguments for loading the quantized base model.
# The device_map entry places the whole model on device 0; background:
# https://github.com/huggingface/trl/issues/1571#issuecomment-2075404536
# https://github.com/xfactlab/orpo/issues/18
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": {"": 0},
    "torch_dtype": torch_dtype,
    # "attn_implementation": attn_implementation,
}

model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    **model_load_kwargs,
)

# Display the device the model was loaded on.
model.device

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

device(type='cuda', index=0)

In [8]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` for every
    parameter, and separately for those with ``requires_grad`` set.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``
            and ``requires_grad``.

    Prints:
        One line with the trainable count, the total count, and the trainable
        percentage (two decimals). A model with no parameters reports 0.00
        instead of raising ZeroDivisionError.
    """
    # NOTE(review): counts are whatever numel() reports; for quantized weights
    # this may reflect packed storage rather than logical size -- confirm.
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the empty-model case so the percentage is still well-defined.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {trainable_pct:.2f}")

# Report parameter counts for the freshly loaded quantized model.
print_trainable_parameters(model)

trainable params: 1050939392 || all params: 4540600320 || trainable%: 23.15


# Set chat format and freeze pretrained weights

In [9]:
from trl import setup_chat_format
from peft import prepare_model_for_kbit_training

# Step 1: apply TRL's chat format to both the model and the tokenizer.
model, tokenizer = setup_chat_format(model, tokenizer)

# Step 2: prepare the quantized model for k-bit training. The parameter report
# in the next cell shows zero trainable parameters after this step.
model = prepare_model_for_kbit_training(model)

In [10]:
# Re-check parameter counts after the chat-format setup and k-bit preparation.
print_trainable_parameters(model)

trainable params: 0 || all params: 4540616704 || trainable%: 0.00


# Loading Dataset

In [11]:
from datasets import load_dataset

# Only the first 300 training rows are loaded to keep this run small.
# Note: with enough computing resources, consider training on more data, e.g.:
#   ds = load_dataset(os.getenv('DATASET'), split='all')
#   ds = ds.shuffle(seed=42).select(range(1000))
dataset_name = os.getenv('DATASET')
ds = load_dataset(dataset_name, split='train[:300]')

# Display the dataset summary.
ds

Dataset({
    features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
    num_rows: 300
})

In [12]:
# Shuffle with a fixed seed so the row order is repeatable.
ds = ds.shuffle(seed=42)


def format_chat_template(row):
    """Render the "chosen" and "rejected" conversations as chat-template strings.

    Each of the two columns is passed through ``tokenizer.apply_chat_template``
    with ``tokenize=False`` and written back into the same row, which is then
    returned.
    """
    for column in ("chosen", "rejected"):
        row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
    return row


# Apply the formatting to every row, using one worker per CPU core.
worker_count = os.cpu_count()
ds = ds.map(format_chat_template, num_proc=worker_count)

In [13]:
# Hold out 1% of the rows for evaluation. The split is seeded (same seed as
# the shuffle above) so the train/test partition is reproducible across runs.
ds = ds.train_test_split(test_size=0.01, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
        num_rows: 297
    })
    test: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
        num_rows: 3
    })
})

# Fine-tuning

We need to set a few hyperparameters for the ORPO configuration.

### learning_rate

ORPO uses very low learning rates compared to traditional SFT or even DPO. This value of 8e-6 comes from the original paper. For comparison, SFT typically uses 1e-5 and DPO 5e-6.

### beta

It is the $\lambda$ parameter in the paper, with a default value of 0.1.

### max_length, batch_size

Other parameters, like `max_length` and the batch size, are set to use as much VRAM as available (~20 GB).

In [14]:
from trl import ORPOConfig, ORPOTrainer

# The W&B run name and the output directory both come from WANDB_NAME.
run_label = os.getenv('WANDB_NAME')

# Field reference:
# https://github.com/huggingface/trl/blob/v0.8.6/trl/trainer/orpo_config.py
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on a GPU with 15.89 GiB total; a smaller batch size or max_length may
# be needed on that hardware -- confirm before re-running.
orpo_args = ORPOConfig(
    # Output and logging
    output_dir=run_label,
    run_name=run_label,
    report_to="wandb",
    logging_steps=1,
    # Optimisation
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    warmup_steps=10,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    # Sequence lengths and batching
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=0.2,
)

trainer = ORPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=orpo_args,
    peft_config=peft_config,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
)

trainer.train()

2024-06-24 07:35:35.517796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-24 07:35:35.517921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-24 07:35:35.644188: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Map:   0%|          | 0/297 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33murakiny[0m ([33mcausal_language_trainer[0m). Use [1m`wandb login --relogin`[0m to force relogin




OutOfMemoryError: CUDA out of memory. Tried to allocate 1.93 GiB. GPU 0 has a total capacty of 15.89 GiB of which 644.12 MiB is free. Process 9356 has 15.26 GiB memory in use. Of the allocated memory 14.50 GiB is allocated by PyTorch, and 489.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Metadata forwarded to trainer.push_to_hub().
kwargs = dict(
    model_name=os.getenv("WANDB_NAME"),
    finetuned_from=os.getenv('MODEL_NAME'),
    dataset=os.getenv("DATASET"),
    # tasks='',
    # dataset_tags='',
)

# Push the tokenizer first, then the trained model, to the repo named by WANDB_NAME.
hub_repo_name = os.getenv("WANDB_NAME")
tokenizer.push_to_hub(hub_repo_name)
trainer.push_to_hub(**kwargs)

# Merge and push merged model

In [None]:
import gc

# Drop the references to the trainer and the model, then run a garbage
# collection pass and release PyTorch's cached CUDA memory.
del trainer
del model

gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload the tokenizer from the base checkpoint named by MODEL_NAME.
tokenizer=AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))

In [None]:
# Reload the base model in float16 (no quantization config) so the adapter can
# be merged into it.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)

# `device` is an attribute, not a method (the earlier load cell displays it the
# same way); calling it would raise TypeError.
model.device

In [None]:
# PeftModel is not imported anywhere else in the notebook; import it here so
# this cell does not fail with NameError.
from peft import PeftModel

# Apply the same chat format that was used before training to the reloaded
# model and tokenizer.
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model: load the adapter from the WANDB_NAME path,
# then fold its weights into the base model.
model = PeftModel.from_pretrained(model, os.getenv("WANDB_NAME"))
model = model.merge_and_unload()

In [None]:
# model.push_to_hub(os.getenv("WANDB_NAME"), use_temp_dir=False)
# tokenizer.push_to_hub(os.getenv("WANDB_NAME"), use_temp_dir=False)

# Acknowledgements

* https://www.kaggle.com/code/aisuko/fine-tuning-phi-2-with-qlora
* https://medium.com/towards-data-science/fine-tune-llama-3-with-orpo-56cfab2f9ada
* https://www.kaggle.com/code/aisuko/llm-prompt-recovery-with-gemma