# LLM Fine-tuning



## 1. Setup development environment


In [1]:
# Install PyTorch into the kernel's environment
%pip install torch

# Install or upgrade the Hugging Face libraries (plus bitsandbytes) used in the rest of the notebook
%pip install  --upgrade \
  transformers \
  datasets \
  accelerate \
  evaluate \
  bitsandbytes \
  trl \
  peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!pip install flash-attn --no-build-isolation

Collecting ninja
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/422.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.4
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.7.4.post1-cp311-cp311-linux_x86_64.whl size=187831

In [3]:
import gc

def clear_hardwares():
    """Release unreachable Python objects and cached accelerator memory.

    Intended to be called after `del`-ing large objects (models, trainers)
    between notebook stages. Safe to call on a machine without CUDA: the
    CUDA-specific steps are skipped in that case.

    Returns:
        None
    """
    # Collect first: tensors kept alive only by reference cycles must be
    # destroyed before their memory can be handed back by empty_cache().
    gc.collect()
    torch.clear_autocast_cache()
    if torch.cuda.is_available():
        torch.cuda.ipc_collect()
        torch.cuda.empty_cache()

## 2. Tokenization

In [4]:
from transformers import AutoTokenizer
import numpy as np

# Hub checkpoint whose tokenizer is explored in this section
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting to learn how to fine-tune LLMs my whole life."

# Calling the tokenizer on raw text returns a mapping with `input_ids` and `attention_mask`
model_inputs = tokenizer(sequence)
print(model_inputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

{'input_ids': [40, 3003, 1012, 8580, 311, 3960, 1246, 311, 6915, 2385, 2886, 444, 10994, 82, 847, 4361, 2272, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
sequence = "Using a Transformersss network is simple"
# tokenize() only splits the text into subword strings; no ids are produced yet
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'Ġa', 'ĠTransformers', 'ss', 'Ġnetwork', 'Ġis', 'Ġsimple']


In [None]:
# Map each subword string from the previous cell to its integer id
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[16429, 264, 80532, 778, 3922, 374, 4285]


In [None]:
# Hand-written ids, not the `ids` list from the previous cell: compared with the printed ids,
# the pair 80532, 778 is replaced by the single id 62379, so the decoded text reads
# "Transformer" instead of "Transformersss"
decoded_string = tokenizer.decode([16429, 264, 62379, 3922, 374, 4285])
print(decoded_string)

Using a Transformer network is simple


In [None]:
# Example three-turn conversation (system, user, assistant) used to demonstrate the chat template
messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Answer the most sacred question"},
    {"role": "assistant", "content": "42"}
]

In [None]:
# With default arguments apply_chat_template returns token ids rather than text
print(tokenizer.apply_chat_template(messages))

[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 16141, 279, 1429, 31342, 3405, 151645, 198, 151644, 77091, 198, 19, 17, 151645, 198]


In [None]:
# tokenize=False returns the templated string; add_generation_prompt=True appends an
# opening assistant header at the end of it
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Answer the most sacred question<|im_end|>
<|im_start|>assistant
42<|im_end|>
<|im_start|>assistant



In [None]:
# Same templated string, but without the trailing assistant header
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Answer the most sacred question<|im_end|>
<|im_start|>assistant
42<|im_end|>



## 3. Create and prepare the dataset

In our example we will use an already existing dataset called [sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context), which contains samples of natural language instructions, schema definitions and the corresponding SQL query.

* conversational format
```json
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```
* instruction format

```json
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
```

In [None]:
from datasets import load_dataset

# System-prompt template; the `{schema}` placeholder is filled in per sample
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  """Turn one raw dataset row into the conversational ("messages") format.

  Args:
    sample: mapping with the keys "context" (schema), "question" and "answer".

  Returns:
    A dict with a single "messages" key holding the system, user and
    assistant turns, in that order.
  """
  system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
  user_turn = {"role": "user", "content": sample["question"]}
  assistant_turn = {"role": "assistant", "content": sample["answer"]}
  return {"messages": [system_turn, user_turn, assistant_turn]}

# Fixed seed so the sampled subset and the train/test split are the same on every run
SEED = 42

# Load dataset from the hub and keep a random subset of 1,500 rows
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=SEED).select(range(1500))

# Convert every row to the conversational format, dropping the original columns
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)
# Split into 1,000 training samples and 500 test samples
# (an integer test_size is an absolute sample count, avoiding float rounding)
dataset = dataset.train_test_split(test_size=500, seed=SEED)

print(dataset["train"][400]["messages"])

# Save both splits to disk as JSON records
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_82 (laps VARCHAR, points VARCHAR, car__number VARCHAR, driver VARCHAR)', 'role': 'system'}, {'content': 'How many laps did Jeff Burton have when he drove car with a # over 9 and more than 118 points?', 'role': 'user'}, {'content': 'SELECT COUNT(laps) FROM table_name_82 WHERE car__number > 9 AND driver = "jeff burton" AND points > 118', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

238050

## 4. Fine-tune LLM using `trl` and the `SFTTrainer`


In [None]:
from datasets import load_dataset

# Load the training split written to disk earlier; `dataset` now refers to this single split only
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the dataset's features and row count
dataset

Dataset({
    features: ['messages'],
    num_rows: 1000
})

In [None]:
# Inspect the first training conversation
dataset[0]

{'messages': [{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_18662026_1 (floor VARCHAR, parallel_bars VARCHAR)',
   'role': 'system'},
  {'content': 'If the parallel bars numbers is 61.500, what is the total number for the flood?',
   'role': 'user'},
  {'content': 'SELECT COUNT(floor) FROM table_18662026_1 WHERE parallel_bars = "61.500"',
   'role': 'assistant'}]}

In [None]:
# Token count of every training conversation after applying the chat template
lens = [len(tokenizer.apply_chat_template(el['messages'])) for el in dataset]

In [None]:
# Mean, maximum and 95th-percentile token count across the training set
np.average(lens), np.max(lens), np.percentile(lens, 95)

(np.float64(107.564), np.int64(244), np.float64(136.0))

Loading the model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model with the quantization config above, then its tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2", # relies on the flash-attn install from the setup section (which asserts compute capability >= 8)
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Creating the LoRA adapters

In [None]:
from peft import LoraConfig
# Attach LoRA adapters only to the attention query/key/value projections.
# To add adapters to every linear layer instead, set target_modules = "all-linear".
target_modules = ["q_proj", "v_proj", 'k_proj']

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.00,
        r=16,
        bias="none",
        target_modules=target_modules,
        task_type="CAUSAL_LM",
)

Creating the training arguments

In [None]:
max_seq_length = 256 # maximum tokenized sequence length passed to SFTConfig (longest training sample measured above: 244 tokens)

In [None]:
from trl import SFTConfig

# Directory that receives checkpoints and the saved model
output_dir = "model_output"

args = SFTConfig(
    output_dir=output_dir,                  # directory for checkpoints and the saved model
    num_train_epochs=1,                     # number of training epochs; 1 = a single full pass over the training set
    # max_steps=100,                          # max train steps
    per_device_train_batch_size=8,          # batch size per device during training
    gradient_accumulation_steps=2,          # micro-batches accumulated before each optimizer update (8 * 2 = 16 samples per update per device)
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="steps",                  # save checkpoints on a step interval (see save_steps)
    save_steps=50,                          # save a checkpoint every 50 steps
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    max_seq_length=max_seq_length,          # max length of the input
    bf16=True,                              # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",             # cosine learning rate scheduler
    push_to_hub=False,                       # do not push the model to the hub
    report_to="none",                       # disable reporting to external trackers such as wandb or tensorboard
    packing=False,                          # do not pack several samples into one sequence
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

We are ready to create our trainer

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Response template for completion-only training: per the TRL docs, tokens up to and including
# this marker are excluded from the loss, so only the assistant's answer is learned
collator = DataCollatorForCompletionOnlyLM("<|im_start|>assistant", tokenizer=tokenizer)

# The LoRA config is handed to the trainer together with the quantized base model
trainer = SFTTrainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
)

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training our model

In [None]:
# Start training; checkpoints are written to the output directory
# (push_to_hub is False in the training arguments, so nothing is uploaded to the hub)
trainer.train()

# Save the final model
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
10,0.2576
20,0.116
30,0.0672
40,0.0858
50,0.1122
60,0.0929


In [None]:
# Drop the references to the training model and trainer, then release cached memory
del model
del trainer
clear_hardwares()

### Merge LoRA adapter into the original model


In [None]:
from peft import AutoPeftModelForCausalLM

# Load the trained PEFT model from the output directory
# (no device_map is passed — presumably this loads on CPU; confirm)
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    torch_dtype=torch.float16,  # NOTE(review): training used bfloat16 — confirm float16 is intended for the merge
    low_cpu_mem_usage=True,
)
# Merge the LoRA weights into the base model and save the result as safetensors shards of at most 2GB
merged_model = model.merge_and_unload()
merged_model.save_pretrained(args.output_dir, safe_serialization=True, max_shard_size="2GB")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
# Drop the references to the PEFT model and the merged model, then release cached memory
del model
del merged_model
clear_hardwares()

## 5. Test model and run inference


In [None]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

model_id = "./model_output"

# Load the fine-tuned model from the output directory.
# NOTE(review): this directory holds both the adapter saved by the trainer and the merged
# weights saved afterwards — confirm which of the two from_pretrained actually loads.
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Wrap model and tokenizer in a text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


Let’s load our test dataset and try to generate a SQL query for a random sample.

In [None]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint includes both endpoints, so the upper bound must be the last valid index
rand_idx = randint(0, len(eval_dataset) - 1)
sample_messages = eval_dataset[rand_idx]["messages"]

# Build the prompt from the system and user turns only; the assistant turn is the reference answer
prompt = pipe.tokenizer.apply_chat_template(sample_messages[:2], tokenize=False, add_generation_prompt=True)
# Greedy decoding: temperature/top_k/top_p are omitted because they only apply when do_sample=True
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

print(f"Query:\n{sample_messages[1]['content']}")
print(f"Original Answer:\n{sample_messages[2]['content']}")
# The pipeline returns prompt + completion; slice the prompt off to keep only the generated text
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Generating train split: 0 examples [00:00, ? examples/s]



Query:
Who was the jockey that had post time odds of 34-1?
Original Answer:
SELECT jockey FROM table_name_59 WHERE post_time_odds = "34-1"
Generated Answer:
SELECT jockey FROM table_name_59 WHERE post_time_odds = "34-1"


In [None]:
from tqdm import tqdm


def evaluate(sample):
    """Score one test sample by exact match against the reference answer.

    The system and user turns are rendered with the chat template, the
    pipeline generates greedily, and the stripped completion is compared with
    the reference assistant turn.

    Args:
        sample: mapping whose "messages" entry holds the system, user and
            assistant turns, in that order.

    Returns:
        1 if the generated text equals the reference answer, otherwise 0.
    """
    turns = sample["messages"]
    chat_prompt = pipe.tokenizer.apply_chat_template(turns[:2], tokenize=False, add_generation_prompt=True)
    generation = pipe(
        chat_prompt,
        max_new_tokens=256,
        do_sample=False,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    # The pipeline output starts with the prompt; keep only what follows it
    completion = generation[0]['generated_text'][len(chat_prompt):].strip()
    return 1 if completion == turns[2]["content"] else 0

number_of_eval_samples = 100
# Seeded shuffle so the same subset is scored on every run and the accuracy is reproducible;
# the subset size is capped at the dataset size so select() cannot go out of range
eval_subset = eval_dataset.shuffle(seed=42).select(
    range(min(number_of_eval_samples, len(eval_dataset)))
)

# One 0/1 exact-match score per evaluated sample
success_rate = [evaluate(s) for s in tqdm(eval_subset)]

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")


  8%|▊         | 8/100 [00:12<02:27,  1.60s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [02:09<00:00,  1.30s/it]

Accuracy: 58.00%





In [None]:
# Re-display the accuracy computed by the evaluation loop
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 58.00%


Before fine-tuning, the model's accuracy was 0%.