# Fine-Tuning a model for Function-Calling

In [2]:
!python -m venv .venv
!ls -la
!source .venv/bin/activate
!source .env

total 1800
drwxr-xr-x@ 14 aaronserpilin  staff     448 Oct  8 13:10 [34m.[m[m
drwxr-xr-x  17 aaronserpilin  staff     544 Oct  7 16:18 [34m..[m[m
-rw-r--r--@  1 aaronserpilin  staff     501 Oct  7 14:12 .env
drwxr-xr-x@ 14 aaronserpilin  staff     448 Oct  7 16:10 [34m.git[m[m
-rw-r--r--@  1 aaronserpilin  staff      24 Oct  3 15:45 .gitignore
drwxr-xr-x@  7 aaronserpilin  staff     224 Oct  8 13:10 [34m.venv[m[m
-rw-r--r--@  1 aaronserpilin  staff  843388 Oct  7 15:08 Batman_training_and_meals.png
-rw-r--r--@  1 aaronserpilin  staff     301 Oct  3 14:17 README.md
drwxr-xr-x@  3 aaronserpilin  staff      96 Oct  3 15:44 [34mcertificates[m[m
-rw-r--r--@  1 aaronserpilin  staff   10997 Oct  7 13:48 dummy_agent_library.ipynb
drwxr-xr-x@ 13 aaronserpilin  staff     416 Oct  3 16:58 [34mfirst_agent_template[m[m
-rw-r--r--@  1 aaronserpilin  staff     827 Oct  8 13:11 function_calling.ipynb
-rw-r--r--@  1 aaronserpilin  staff   29301 Oct  7 15:10 langgraph_agent.ipynb
-rw-r

In [3]:
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U trl
!pip install -q -U tensorboardX
!pip install -q wandb
!pip install -q -U torchvision
!pip install -q -U transformers

In [22]:
from enum import Enum
from functools import partial
import pandas as pd
import torch
import json

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, set_seed
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, TaskType

seed = 42
set_seed(seed)

## Data Processing

In [None]:
model_name = "google/gemma-2-2b-it"
dataset_name = "Jofthomas/hermes-function-calling-thinking-V1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.chat_template = "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

def preprocess(sample): # Converts the list of messages into a prompt thee model can understand
      messages = sample["messages"]
      first_message = messages[0]

      # Instead of adding a system message, we merge the content into the first user message
      if first_message["role"] == "system":
          system_message_content = first_message["content"]
          # Merge system content with the first user message
          messages[1]["content"] = system_message_content + "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thoughts}</think>\n\n" + messages[1]["content"]
          # Remove the system message from the conversation
          messages.pop(0)

      return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

dataset = load_dataset(dataset_name)
dataset = dataset.rename_column("conversations", "messages")

Generating train split: 100%|██████████| 3570/3570 [00:00<00:00, 212537.12 examples/s]


The `NouseResearch/hermes-function-calling-v1` dataset is considered as a reference when it comes to function calling datasets. However, while it is great, it does not include a "thinking" step. Within function calling this is optional, but recent models such as DeepSeek have proven that giving an LLM time to think before it answers can significantly improve performance. 

In [6]:
dataset = dataset.map(preprocess, remove_columns="messages")
dataset = dataset["train"].train_test_split(0.1)
print(dataset)

dataset["train"] = dataset["train"].select(range(100))
dataset["test"] = dataset["test"].select(range(100))

Map: 100%|██████████| 3570/3570 [00:00<00:00, 16759.77 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3213
    })
    test: Dataset({
        features: ['text'],
        num_rows: 357
    })
})





## Inputs

In [13]:
print(dataset["train"][8]["text"])

<bos><start_of_turn>human
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'get_news_headlines', 'description': 'Get the latest news headlines', 'parameters': {'type': 'object', 'properties': {'country': {'type': 'string', 'description': 'The country for which headlines are needed'}}, 'required': ['country']}}}, {'type': 'function', 'function': {'name': 'search_recipes', 'description': 'Search for recipes based on ingredients', 'parameters': {'type': 'object', 'properties': {'ingredients': {'type': 'array', 'items': {'type': 'string'}, 'description': 'The list of ingredients'}}, 'required': ['ingredients']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall

In [15]:
print(tokenizer.pad_token)
print(tokenizer.eos_token)

<pad>
<eos>


So far, `chat_template` preprocesses tha chats to format conversations as messages within a prompt. This segmented the conversations using `<think>`, `<tool_call>`, and `<tool_response>`. However, the tokenizer still does not treat them as whole tokens, trying to break them down into smaller pieces. To ensure the model correctly interprets the new format, we must add these tokens to our tokenizer. 

In [None]:
class ChatmlSpecialTokens (str, Enum):
    tools = "<tools>"
    eotools = "</tools>"
    think = "<think>"
    eothink = "</think>"
    tool_call="<tool_call>"
    eotool_call="</tool_call>"
    tool_response="<tool_response>"
    eotool_response="</tool_response>"
    pad_token = "<pad>"
    eos_token = "<eos>"

    @classmethod
    def list(cls):
        return [c.value for c in cls]

tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    pad_token=ChatmlSpecialTokens.pad_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list()
)

tokenizer.chat_template = "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             attn_implementation='eager',
                                             device_map="cuda")
model.resize_token_embeddings(len(tokenizer))
model.to(torch.bfloat16) # when converting precision, important to make sure all w&bs are offloaded to the same device

Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.41s/it]


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256008, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): GELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNo

## LoRA Configuration

In [19]:
from peft import LoraConfig

rank_dimension = 16 # rank dimension for LoRA update matrices (smaller = more compression)
lora_alpha = 64 # scaling factor for LoRA layers (higher = stronger adaptation)
lora_dropout = 0.05 # dropout probability for LoRA layers to help prevent overfitting

peft_config = LoraConfig(r=rank_dimension,
                        lora_alpha=lora_alpha,
                        lora_dropout=lora_dropout,
                        target_modules=["gate_proj","q_proj","lm_head","o_proj","k_proj","embed_tokens","down_proj","up_proj","v_proj"],
                        task_type=TaskType.CAUSAL_LM
)

peft_config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules={'gate_proj', 'lm_head', 'k_proj', 'q_proj', 'v_proj', 'up_proj', 'embed_tokens', 'o_proj', 'down_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)

## Trainer and Fine-Tuning Hyperparameters

In [24]:
username="aaron-ser"
output_dir = "gemma-2-2B-it-thinking-function_calling-V0" # default model name, and where the checkpoints, logs, and other artifacts will be saved
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4
logging_steps = 5
learning_rate = 1e-4 # The initial learning rate for the optimizer.

max_grad_norm = 1.0
num_train_epochs=1
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_length = 1500

training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    eval_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    bf16=True,
    hub_private_repo=False,
    push_to_hub=False,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    packing=True,
    max_length=max_length
)

In [25]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    peft_config=peft_config
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

In [None]:
trainer.train()
trainer.save_model()

## Pushing the Model and Tokenizer to the hub

In [None]:
trainer.push_to_hub(f"{username}/{output_dir}")

In [None]:
tokenizer.eos_token = "<eos>"
tokenizer.push_to_hub(f"{username}/{output_dir}", token=True)

## Model Testing

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import torch

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

peft_model_id = f"{username}/{output_dir}" 
device = "cuda"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                             device_map=device,
                                             )
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, peft_model_id)
model.to(torch.bfloat16)
model.eval()

In [None]:
print(dataset["test"][8]["text"])

In [None]:
prompt="""<bos><start_of_turn>human
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert from one currency to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to convert'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'calculate_distance', 'description': 'Calculate the distance between two locations', 'parameters': {'type': 'object', 'properties': {'start_location': {'type': 'string', 'description': 'The starting location'}, 'end_location': {'type': 'string', 'description': 'The ending location'}}, 'required': ['start_location', 'end_location']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call>Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thoughts}</think>

Hi, I need to convert 500 USD to Euros. Can you help me with that?<end_of_turn><eos>
<start_of_turn>model
<think>"""

inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
inputs = {k: v.to("cuda") for k,v in inputs.items()}
outputs = model.generate(**inputs,
                         max_new_tokens=300,# Adapt as necessary
                         do_sample=True,
                         top_p=0.95,
                         temperature=0.01,
                         repetition_penalty=1.0,
                         eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))