In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable, falling back to a hidden interactive prompt.
# NOTE(review): the token that used to be hardcoded here has been exposed and
# must be revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset

In [50]:
# Fetch the xLAM function-calling dataset from the Hugging Face Hub.
# No split is requested here; the 'train' split is selected in the next cell.
dataset = load_dataset("Salesforce/xlam-function-calling-60k")

In [51]:
# Keep only the 'train' split; `dataset` is rebound from the loaded object to that single split.
dataset = dataset['train']

We have two subtasks
1. Function Name Detection
2. Parameter-Value Pair Detection

The model should also be able to:
1. Construct full function call
2. Predict the number of required function calls


The dataset will be split as follows:
- 20% Function Name Detection
- 35% Parameter-Value Pair Detection
- 45% Full Function Call

In [52]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['query', 'id', 'answers', 'tools'],
    num_rows: 60000
})

In [53]:
# Shuffle the rows before splitting; the fixed seed keeps the shuffled order
# identical across runs.
dataset = dataset.shuffle(seed=42)

In [54]:
# Fraction of all rows assigned to each task, in this order:
# function name detection, parameter-value pair detection, full function call.
split_ratios = [0.2, 0.35, 0.45]

In [55]:
from datasets import DatasetDict

In [56]:
# Carve the shuffled rows into three disjoint, task-specific subsets.
# Both train_test_split calls get an explicit seed: without one, each call
# draws a fresh random partition, so re-running the notebook would assign rows
# to different tasks even though the earlier shuffle is seeded.

# First, hold out the Function Name Detection share (20% of the original)
split_1 = dataset.train_test_split(test_size=split_ratios[0], seed=42)
function_name_detection = split_1['test']
remaining_data = split_1['train']

# Then, split the remaining data for Parameter-Value Pair Detection (35% of the original).
# test_size is relative to the rows left after the first split, hence the rescaling.
split_2 = remaining_data.train_test_split(
    test_size=split_ratios[1] / (1 - split_ratios[0]),
    seed=42,
)
parameter_value_pair_detection = split_2['test']
full_function_call = split_2['train']  # Remaining 45% goes to Full Function Call

# Combine into a DatasetDict for easy access
dataset = DatasetDict({
    'function_name_detection': function_name_detection,
    'parameter_value_pair_detection': parameter_value_pair_detection,
    'full_function_call': full_function_call
})

In [57]:
# Display the per-task splits and their row counts.
dataset

DatasetDict({
    function_name_detection: Dataset({
        features: ['query', 'id', 'answers', 'tools'],
        num_rows: 12000
    })
    parameter_value_pair_detection: Dataset({
        features: ['query', 'id', 'answers', 'tools'],
        num_rows: 21000
    })
    full_function_call: Dataset({
        features: ['query', 'id', 'answers', 'tools'],
        num_rows: 27000
    })
})

In [58]:
# Peek at one raw 'answers' value: it is stored as a JSON-encoded string,
# which is why the prompt builders decode it with json.loads.
dataset['function_name_detection'][0]['answers']

'[{"name": "numerical_derivative", "arguments": {"function": "lambda x: x ** 2 + 3 * x + 2", "x": 2}}]'

In [59]:
import json

In [60]:
def generate_function_name_detection_prompt(example):
  """Build the ChatML training prompt for the function-name-detection task.

  Args:
    example: Mapping with 'tools' and 'query' values (interpolated verbatim)
      and 'answers', a JSON-encoded list of calls that each have a 'name' key.

  Returns:
    The prompt string: a system turn (instructions plus the tools), a user
    turn (the query) and an assistant turn listing the expected calls with
    their arguments left empty.
  """
  tools = example['tools']
  query = example['query']
  answers = json.loads(example['answers'])

  # Keep only the function names; the arguments are deliberately blanked out.
  name_only_calls = [{'name': ans['name'], 'arguments': {}} for ans in answers]

  # Serialize with json.dumps: formatting the list directly would emit a
  # Python repr (single quotes), which is not the JSON the prompt asks for
  # and does not match the format used by the full-function-call task.
  answer = json.dumps(name_only_calls, ensure_ascii=False)

  prompt = f"""<|im_start|>system\nYou are a helpful assistant with tool calling capabilities.
Given the following functions, Your task is to generate the sequence of function calls necessary to generate response to the user query.
Your output should contain only the function names, and should be formatted as a JSON list as follows:
[
    {{"name": "function_name_1", "arguments": {{}}}},
    {{"name": "function_name_2", "arguments": {{}}}},
    ...
]


Available Tools:
{tools}
<|im_end|>
<|im_start|>user
{query}
<|im_end|>
<|im_start|>assistant
{answer}
<|im_end|>
"""
  return prompt

In [62]:
def generate_parameter_value_pair_detection_prompt(example):
    """Build the ChatML training prompt for the parameter-value-pair task.

    The user turn carries the query together with the names of the functions
    to call; the assistant turn carries the complete calls with arguments.

    Args:
        example: Mapping with 'tools' and 'query' values (interpolated
            verbatim) and 'answers', a JSON-encoded list of calls that each
            have a 'name' key.

    Returns:
        The prompt string with system, user and assistant turns.
    """
    tools = example['tools']
    query = example['query']
    answers = json.loads(example['answers'])

    # extract the function names from the answers
    function_names = [ans['name'] for ans in answers]

    # Re-serialize the target as JSON. Formatting the parsed list directly
    # would emit a Python repr: single quotes, and True/False/None instead of
    # true/false/null, which cannot be parsed back as JSON.
    answers_json = json.dumps(answers, ensure_ascii=False)

    prompt = f"""<|im_start|>system\nYou are a helpful assistant with tool calling capabilities.
Given the following functions, You will be given a user query and a list of functions names, Your task is to find all the necessary arguments and their values for each function name from the user query and put them in their correct format.
Your output should be in the following format:
[
    {{"name": "function_name_1", "arguments": {{"argument_name_1": "argument_value_1", "argument_name_2": "argument_value_2"}}}},
    {{"name": "function_name_2", "arguments": {{"argument_name_1": "argument_value_1", "argument_name_2": "argument_value_2"}}}},
    ...
]

Available Tools:
{tools}
<|im_end|>
<|im_start|>user
Query: {query}
Functions names: {function_names}
<|im_end|>
<|im_start|>assistant
{answers_json}
<|im_end|>
"""
    return prompt

In [63]:
def generate_full_function_call_prompt(examples):
    """Build the ChatML training prompt for the full-function-call task.

    Args:
        examples: Mapping with 'query', 'tools' and 'answers' values; all
            three are interpolated into the prompt verbatim ('answers' is
            used as the raw stored string, not re-encoded).

    Returns:
        The prompt string with system, user and assistant turns, stripped of
        leading and trailing whitespace.
    """
    query = examples['query']
    tools = examples['tools']
    answer = examples['answers']

    # The role header must read exactly "system": the earlier "systemn" typo
    # produced a role name that does not match the other two templates.
    prompt = f"""<|im_start|>system\nYou are a helpful assistant with tool calling capabilities.
Given the following functions, please respond with a JSON for a function call (or multiple function calls) with proper arguments that best answers the given prompt.
The function call should be formatted as: {{"name": function name, "arguments": dictionary of argument name and its value}}. Do not use variables.
Available Tools:
{tools}
<|im_end|>\n<|im_start|>user\n{query}<|im_end|>
<|im_start|>assistant\n{answer}<|im_end|>
"""

    prompt = prompt.strip()

    return prompt

In [64]:
# Bind a second name to the three-way DatasetDict (same object, not a copy);
# `dataset` itself is rebound to the concatenated dataset further down.
dataset_dict = dataset

In [65]:
from datasets import concatenate_datasets

In [66]:
# Add the 'task' column to each dataset split
dataset_dict['function_name_detection'] = dataset_dict['function_name_detection'].map(lambda x: {'task': 'function_name_detection'})
dataset_dict['parameter_value_pair_detection'] = dataset_dict['parameter_value_pair_detection'].map(lambda x: {'task': 'parameter_value_pair_detection'})
dataset_dict['full_function_call'] = dataset_dict['full_function_call'].map(lambda x: {'task': 'full_function_call'})

# Concatenate all splits into one dataset
dataset = concatenate_datasets([
    dataset_dict['function_name_detection'],
    dataset_dict['parameter_value_pair_detection'],
    dataset_dict['full_function_call']
])

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

Map:   0%|          | 0/27000 [00:00<?, ? examples/s]

In [67]:
# Display the merged dataset summary; it now carries the extra 'task' column.
dataset

Dataset({
    features: ['query', 'id', 'answers', 'tools', 'task'],
    num_rows: 60000
})

In [68]:
def make_prompt(example):
    """Render the training prompt that matches the row's 'task' label.

    Args:
        example: A dataset row. Its 'task' value selects the prompt builder,
            and the row is passed to that builder unchanged.

    Returns:
        A dict with a single 'prompt' key holding the rendered prompt.

    Raises:
        ValueError: If 'task' is not one of the three known task names.
    """
    task = example['task']
    if task == 'function_name_detection':
        prompt = generate_function_name_detection_prompt(example)
    elif task == 'parameter_value_pair_detection':
        prompt = generate_parameter_value_pair_detection_prompt(example)
    elif task == 'full_function_call':
        prompt = generate_full_function_call_prompt(example)
    else:
        # Fail loudly with the offending label; without this branch an unknown
        # task fell through and surfaced as an UnboundLocalError on `prompt`.
        raise ValueError(f"Unknown task: {task!r}")

    return {
        'prompt': prompt
    }

In [69]:
# Add a 'prompt' column by rendering every row with its task-specific template.
dataset = dataset.map(make_prompt)

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [70]:
# Display the final dataset summary; it now carries the extra 'prompt' column.
dataset

Dataset({
    features: ['query', 'id', 'answers', 'tools', 'task', 'prompt'],
    num_rows: 60000
})

In [71]:
# Publish the merged, prompt-augmented dataset to the Hugging Face Hub under this repo id.
dataset.push_to_hub("0xayman/function-calling-multi-task-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/449 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/0xayman/function-calling-multi-task-dataset/commit/488050cd05639209ea6de03b8fdc7559b30c15d0', commit_message='Upload dataset', commit_description='', oid='488050cd05639209ea6de03b8fdc7559b30c15d0', pr_url=None, pr_revision=None, pr_num=None)