In [1]:
from datasets import load_dataset
ds = load_dataset("Salesforce/xlam-function-calling-60k")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds

DatasetDict({
    train: Dataset({
        features: ['query', 'id', 'answers', 'tools'],
        num_rows: 60000
    })
})

In [3]:
import ast
tools = ast.literal_eval(ds['train'][10]['tools'])

In [4]:
ans = ds['train'][10]['answers']

In [5]:
ast.literal_eval(ans)

[{'name': 'poor_backlinks', 'arguments': {'domain': 'example.com'}},
 {'name': 'qrcode', 'arguments': {'data': 'Visit our website at example.com'}}]

In [6]:
ds['train'][0]

{'query': 'Where can I find live giveaways for beta access and games?',
 'id': 0,
 'answers': '[{"name": "live_giveaways_by_type", "arguments": {"type": "beta"}}, {"name": "live_giveaways_by_type", "arguments": {"type": "game"}}]',
 'tools': '[{"name": "live_giveaways_by_type", "description": "Retrieve live giveaways from the GamerPower API based on the specified type.", "parameters": {"type": {"description": "The type of giveaways to retrieve (e.g., game, loot, beta).", "type": "str", "default": "game"}}}]'}

In [7]:
ds['train'][0]['tools'][1:-1]

'{"name": "live_giveaways_by_type", "description": "Retrieve live giveaways from the GamerPower API based on the specified type.", "parameters": {"type": {"description": "The type of giveaways to retrieve (e.g., game, loot, beta).", "type": "str", "default": "game"}}}'

In [8]:
answer = ast.literal_eval(ds['train'][0]['answers'])
def _convert_answer(answer):
    python_output = answer['name'] +"("
    for k,v in answer['arguments'].items():
        python_output += f"{k}={v},"
    python_output = python_output[:-1]
    python_output += ")"
    return python_output
_convert_answer(answer[0])

'live_giveaways_by_type(type=beta)'

In [9]:
answer

[{'name': 'live_giveaways_by_type', 'arguments': {'type': 'beta'}},
 {'name': 'live_giveaways_by_type', 'arguments': {'type': 'game'}}]

In [10]:
from unsloth import FastLanguageModel, get_chat_template
import torch

dtype = torch.bfloat16 
load_in_4bit = False 
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Llama-3.2-1B-Instruct",
        # max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        trust_remote_code=True
    )
tokenizer = get_chat_template(
        tokenizer,
        chat_template = "llama-3.1",
    )



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.325 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [11]:
ds['train'][0]

{'query': 'Where can I find live giveaways for beta access and games?',
 'id': 0,
 'answers': '[{"name": "live_giveaways_by_type", "arguments": {"type": "beta"}}, {"name": "live_giveaways_by_type", "arguments": {"type": "game"}}]',
 'tools': '[{"name": "live_giveaways_by_type", "description": "Retrieve live giveaways from the GamerPower API based on the specified type.", "parameters": {"type": {"description": "The type of giveaways to retrieve (e.g., game, loot, beta).", "type": "str", "default": "game"}}}]'}

In [14]:
from typing import Literal
import yaml
import sys
import json
def process_xlam_data(example_list, json_or_yaml:Literal['json','yaml']):
    prompts = []
    queries = example_list['query']
    answers = example_list['answers']
    # print(answers)
    tools = example_list['tools']
    for i in range(len(queries)):
        try:
            try:
                answers[i] = answers[i].replace("true","True")
                answers[i] = answers[i].replace("false","False")
                answers[i] = answers[i].replace("null","None")
                # answers[i] = answers[i].decode('utf-8',errors="replace")
                answer = _convert_answer(ast.literal_eval(answers[i])[0])
                # print("Correct: ",answers[i])
            except Exception as e:
                print(e)
                print("Error: ", answers[i])
                # sys.exit(0)
            if json_or_yaml == 'json':
                functions = tools[i][1:-1]
            elif json_or_yaml == 'yaml':
                functions = json_to_yaml(tools[i])
            prompts.append(
                tokenizer.apply_chat_template(_create_messages(queries[i],functions,answer),tokenize=False)
            )
        except UnicodeDecodeError as e:
            continue
    return {"prompt":prompts,}
    
    
def _convert_answer(answer):
    # print(answer)
    python_output = answer['name'] +"("
    for k,v in answer['arguments'].items():
        python_output += f"{k}={v},"
    python_output = python_output[:-1]
    python_output += ")"
    return python_output 

def json_to_yaml(data):
    curr_func_yaml = ""
    json_func = ast.literal_eval(data)
    for func in json_func:
        curr_func_yaml+=yaml.dump(ast.literal_eval(func)) + "\n\n"
    return curr_func_yaml

def _create_messages(user_prompt:str,functions:str, output:str):
    messages = [
            {
                "role": "system",
                "content": "You are an expert in composing functions. You are given a question and a set of possible functions."
                            " Based on the question, you will need to make one or more function/tool calls to achieve the purpose."
                            " If none of the function can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.\n",
            },
            {"role": "user", "content": f"#### Question: {user_prompt}Here is a list of functions in JSON or YAML format that you can invoke:\n{functions}. Should you decide to return the function call(s), NO other text MUST be included.\n#### Response:"},
            {"role":"assistant","content":output}
        ]
    return messages

In [15]:
tokenized_ds = ds.map(process_xlam_data,batched=True,fn_kwargs={'json_or_yaml':'json'})

Map:  77%|███████▋  | 46000/60000 [00:05<00:01, 8381.39 examples/s] 


UnicodeEncodeError: 'utf-8' codec can't encode characters in position 1386-1389: surrogates not allowed

In [9]:
import ast
import json
ast.literal_eval(ds['train'][10]['tools'])[0]

{'name': 'poor_backlinks',
 'description': 'Fetch poor quality backlinks for a given domain using the Best Backlink Checker API.',
 'parameters': {'domain': {'description': 'The domain for which to fetch the poor quality backlinks.',
   'type': 'str',
   'default': 'getecz.com'}}}

In [8]:
ds['train'][0]['answers']

'[{"name": "live_giveaways_by_type", "arguments": {"type": "beta"}}, {"name": "live_giveaways_by_type", "arguments": {"type": "game"}}]'

In [8]:
import ast

def process_ast_node(node):
    # Check if the node is a function call
    if isinstance(node, ast.Call):
        # Return a string representation of the function call
        return ast.unparse(node) 
    else:
        # Convert the node to source code and evaluate to get the value
        node_str = ast.unparse(node)
        return eval(node_str)

        
def parse_python_function_call(call_str):
    tree = ast.parse(call_str)
    expr = tree.body[0]

    call_node = expr.value
    function_name = (
        call_node.func.id
        if isinstance(call_node.func, ast.Name)
        else str(call_node.func)
    )

    parameters = {}
    noNameParam = []

    # Process positional arguments
    for arg in call_node.args:
        noNameParam.append(process_ast_node(arg))

    # Process keyword arguments
    for kw in call_node.keywords:
        parameters[kw.arg] = process_ast_node(kw.value)

    if noNameParam:
        parameters["None"] = noNameParam
        
    function_dict = {"name": function_name, "arguments": parameters}
    return function_dict

if __name__ == "__main__":
    call_str = "func(1, [1, 2], 3, a=4, b=5)"
    print(parse_python_function_call(call_str))

    call_str = "func('cde', x=1, b='2', c=[1, 2, {'a': 1, 'b': 2}])"
    print(parse_python_function_call(call_str))

    call_str = "get_current_weather(location='Boston, MA', api_key=123456789, unit='fahrenheit')"
    print(parse_python_function_call(call_str))

{'name': 'func', 'arguments': {'a': 4, 'b': 5, 'None': [1, [1, 2], 3]}}
{'name': 'func', 'arguments': {'x': 1, 'b': '2', 'c': [1, 2, {'a': 1, 'b': 2}], 'None': ['cde']}}
{'name': 'get_current_weather', 'arguments': {'location': 'Boston, MA', 'api_key': 123456789, 'unit': 'fahrenheit'}}


In [1]:
from datasets import load_dataset
 
# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""
 
def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }
 
# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))
 
# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)
 
print(dataset["train"][345]["messages"])


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 78577/78577 [00:00<00:00, 244032.27 examples/s]
Map: 100%|██████████| 12500/12500 [00:01<00:00, 9794.84 examples/s] 

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_1637981_7 (detroit__dtw_ VARCHAR, grand_rapids__grr_ VARCHAR)', 'role': 'system'}, {'content': "When Grand Rapids's fare was $377.29, what is the fare to Detroit?", 'role': 'user'}, {'content': 'SELECT detroit__dtw_ FROM table_1637981_7 WHERE grand_rapids__grr_ = "$377.29"', 'role': 'assistant'}]





In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format
 
# Hugging Face model id
model_id = "meta-llama/Llama-3.2-1B-Instruct" # or `mistralai/Mistral-7B-v0.1`
 
# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)
 
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings
 
# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer, format="chatml")

In [13]:
print(tokenizer.apply_chat_template(
    dataset["train"][345]["messages"],
    tokenize=False,
    # chat_template="chatml"
    add_generation_prompt=True
))

<|im_start|>system
You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
CREATE TABLE table_1637981_7 (detroit__dtw_ VARCHAR, grand_rapids__grr_ VARCHAR)<|im_end|>
<|im_start|>user
When Grand Rapids's fare was $377.29, what is the fare to Detroit?<|im_end|>
<|im_start|>assistant
SELECT detroit__dtw_ FROM table_1637981_7 WHERE grand_rapids__grr_ = "$377.29"<|im_end|>
<|im_start|>assistant



In [7]:
dataset['train'][345]['messages']

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_1637981_7 (detroit__dtw_ VARCHAR, grand_rapids__grr_ VARCHAR)',
  'role': 'system'},
 {'content': "When Grand Rapids's fare was $377.29, what is the fare to Detroit?",
  'role': 'user'},
 {'content': 'SELECT detroit__dtw_ FROM table_1637981_7 WHERE grand_rapids__grr_ = "$377.29"',
  'role': 'assistant'}]