modified from https://huggingface.co/agents-course/notebooks/blob/main/bonus-unit1/bonus-unit1.ipynb

In [1]:
from enum import Enum
from functools import partial
import pandas as pd
import torch
import json

from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, TaskType

seed = 42
set_seed(seed)

import os

In [2]:
model_name = "google/gemma-2-2b-it"
dataset_name = "Jofthomas/hermes-function-calling-thinking-V1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.chat_template = "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"


def preprocess(sample):
    messages = sample["messages"]
    first_message = messages[0]

    # Instead of adding a system message, we merge the content into the first user message
    if first_message["role"] == "system":
        system_message_content = first_message["content"]
        # Merge system content with the first user message
        messages[1]["content"] = (
            system_message_content
            + "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thoughts}</think>\n\n"
            + messages[1]["content"]
        )
        # Remove the system message from the conversation
        messages.pop(0)

    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}


dataset = load_dataset(dataset_name)
dataset = dataset.rename_column("conversations", "messages")

In [3]:
# train test split
dataset = dataset.map(preprocess, remove_columns="messages")
dataset = dataset["train"].train_test_split(0.1)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3213
    })
    test: Dataset({
        features: ['text'],
        num_rows: 357
    })
})


Eyeballing

- A User message containing the necessary information with the list of available tools inbetween <tools></tools>
- An Assistant message here called "model" to fit the criterias from gemma models containing two new phases, a "thinking" phase contained in <think></think> and an "Act" phase contained in <tool_call></<tool_call>.
- If the model contains a <tools_call>, we will append the result of this action in a new "Tool" message containing a <tool_response></tool_response> with the answer from the tool.

In [4]:
# Let's look at how we formatted the dataset
print(dataset["train"][8]["text"])

<bos><start_of_turn>human
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'get_news_headlines', 'description': 'Get the latest news headlines', 'parameters': {'type': 'object', 'properties': {'country': {'type': 'string', 'description': 'The country for which headlines are needed'}}, 'required': ['country']}}}, {'type': 'function', 'function': {'name': 'search_recipes', 'description': 'Search for recipes based on ingredients', 'parameters': {'type': 'object', 'properties': {'ingredients': {'type': 'array', 'items': {'type': 'string'}, 'description': 'The list of ingredients'}}, 'required': ['ingredients']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall

In [5]:
# Sanity check
print(tokenizer.pad_token)
print(tokenizer.eos_token)

<pad>
<eos>


Modify the Tokenizer

In [6]:
class ChatmlSpecialTokens(str, Enum):
    tools = "<tools>"
    eotools = "</tools>"
    think = "<think>"
    eothink = "</think>"
    tool_call = "<tool_call>"
    eotool_call = "</tool_call>"
    tool_response = "<tool_reponse>"
    eotool_response = "</tool_reponse>"
    pad_token = "<pad>"
    eos_token = "<eos>"

    @classmethod
    def list(cls):
        return [c.value for c in cls]


tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=ChatmlSpecialTokens.pad_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list(),
)
tokenizer.chat_template = "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

model = AutoModelForCausalLM.from_pretrained(
    model_name, attn_implementation="eager", device_map="auto"
)
model.resize_token_embeddings(len(tokenizer))
model.to(torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256008, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemm

## LoRA configuration

In [7]:
from peft import LoraConfig

# TODO: Configure LoRA parameters
# r: rank dimension for LoRA update matrices (smaller = more compression)
rank_dimension = 16
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)
lora_alpha = 64
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05

peft_config = LoraConfig(
    r=rank_dimension,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        "gate_proj",
        "q_proj",
        "lm_head",
        "o_proj",
        "k_proj",
        "embed_tokens",
        "down_proj",
        "up_proj",
        "v_proj",
    ],  # wich layer in the transformers do we target ?
    task_type=TaskType.CAUSAL_LM,
)

## define the Trainer and the Fine-Tuning hyperparameters

In [8]:
username = "CoralLeiCN"  # REPLCAE with your Hugging Face username
# The directory where the trained model checkpoints, logs, and other artifacts will be saved.
# It will also be the default name of the model when pushed to the hub if not redefined later.
output_dir = "gemma-2-2B-it-thinking-function_calling-V0"
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4
logging_steps = 5
learning_rate = 1e-4  # The initial learning rate for the optimizer.

max_grad_norm = 1.0
num_train_epochs = 1
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 1500

training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    eval_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    bf16=True,
    hub_private_repo=False,
    push_to_hub=False,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    packing=True,
    max_seq_length=max_seq_length,
)

In [11]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    peft_config=peft_config,
)

In [12]:
trainer.train()
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
0,0.2797,0.29318


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


push to hub

In [17]:
trainer.push_to_hub(f"{username}/{output_dir}")



adapter_model.safetensors:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

events.out.tfevents.1740495779.Xiaoyu:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0/commit/2046dd8fc4ed6ff19c49c2a0ac6c4106a70c5537', commit_message='CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0', commit_description='', oid='2046dd8fc4ed6ff19c49c2a0ac6c4106a70c5537', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0', endpoint='https://huggingface.co', repo_type='model', repo_id='CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0'), pr_revision=None, pr_num=None)

In [18]:
tokenizer.eos_token = "<eos>"
# push the tokenizer to hub ( replace with your username and your previously specified
tokenizer.push_to_hub(f"{username}/{output_dir}", token=True)

README.md:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0/commit/3f5a2f75b507688134cdc738293622b725f927fa', commit_message='Upload tokenizer', commit_description='', oid='3f5a2f75b507688134cdc738293622b725f927fa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0', endpoint='https://huggingface.co', repo_type='model', repo_id='CoralLeiCN/gemma-2-2B-it-thinking-function_calling-V0'), pr_revision=None, pr_num=None)