In [2]:
# Make the directory two levels above the working directory importable.
# The notebook runs from <root>/gaitor_function_calling/data, so the grandparent
# is the folder that contains the `gaitor_function_calling` package.
import sys
import os

project_dir = os.getcwd()
parent_dir = os.path.split(project_dir)[0]
grandparent_dir = os.path.split(parent_dir)[0]
# Position 0 so this checkout takes precedence over other entries on sys.path
sys.path.insert(0, grandparent_dir)

In [2]:
import json
from datasets import load_dataset
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, function_calling_tokens


# Load the Llama-formatted Glaive function-calling corpus through `datasets`.
# The cells below read the records from its "train" split.
dataset = load_dataset(path="rizerphe/glaive-function-calling-v2-llama")

# Define a function to transform the data
def transform_data(entry):
    """Rewrite one Llama-formatted conversation as a single-turn function-calling prompt.

    The entry is expected to contain a ``<<SYS>> ... <</SYS>>`` block whose
    third ``<function>``-delimited chunk is a function specification, followed
    by the first user message, ``[/INST]`` and one or more assistant replies.
    Only the first user message and the first reply containing a
    ``<function>`` call are kept.

    Args:
        entry: Raw conversation text (the ``text`` field of a dataset record).

    Returns:
        The re-formatted prompt string, or ``None`` when the entry has no
        system block, lacks the expected markers, contains no function call,
        has a call without arguments, or needs the function name from a
        specification that cannot be parsed.
    """
    # Entries without a system block carry no function specification.
    if "<<SYS>>" not in entry:
        return None

    # parts[0] holds the system block plus the first user message; every
    # later part starts with an assistant reply.
    parts = entry.split("[/INST]")

    # The opening-tag check above does not guarantee a closing tag, so verify
    # the split produced both halves before indexing.
    system_split = parts[0].split("<</SYS>>")
    if len(system_split) < 2:
        return None
    input_message = system_split[1].strip()

    # Chunk 0 is the text before the first <function>, chunk 1 the text after
    # it, chunk 2 the first specification.
    # NOTE(review): only this first specification is kept; if an entry lists
    # several functions the others are dropped - confirm this is intended.
    function_chunks = system_split[0].split("<function>")
    if len(function_chunks) < 3:
        return None
    input_function = function_chunks[2].strip()

    # Find the first assistant reply that contains a <function> call.
    for part in parts[1:]:
        if "<function>" in part:
            output_message = part.split("<function>")[1].split("</s>")[0]
            break
    else:
        return None

    # The call looks like `name{...arguments...}`; when the name is missing,
    # fall back to the name declared in the function specification.
    output_message_name = output_message.split("{")[0].strip()
    if not output_message_name:
        try:
            output_message_name = json.loads(input_function)["name"].strip()
        except (ValueError, KeyError, TypeError, AttributeError):
            # Malformed specification: skip the entry instead of aborting the run.
            return None
        if not output_message_name:
            return None

    # Split on the FIRST occurrence of the name only, so a name that also
    # appears inside the arguments does not truncate them.
    _, name_found, after_name = output_message.partition(output_message_name)
    output_message_arguments = (after_name if name_found else output_message).strip()
    if not output_message_arguments:
        return None

    # The specification is wrapped in brackets to form a one-element list.
    input_functions = "[" + input_function + "]"

    # Assemble the new prompt from its fixed markers and extracted pieces.
    transformed_entry = "".join([
        f"<s>[INST] <<SYS>>\n{INSTRUCTION}",
        function_calling_tokens["FUNCTIONS"]["start"],
        input_functions,
        function_calling_tokens["FUNCTIONS"]["end"],
        f"\n<</SYS>>\n\n{input_message} [/INST] ",
        function_calling_tokens["FUNCTION_CALL_NAME"]["start"],
        output_message_name,
        function_calling_tokens["FUNCTION_CALL_NAME"]["end"],
        function_calling_tokens["FUNCTION_CALL_ARGUMENTS"]["start"],
        output_message_arguments,
        function_calling_tokens["FUNCTION_CALL_ARGUMENTS"]["end"],
        " </s>",
    ])
    return transformed_entry

# Run every training record's text through transform_data; rejected records come back as None
transformed_dataset = list(map(transform_data, (record['text'] for record in dataset['train'])))
print(f"total entries: {len(transformed_dataset)}")
# Drop the None placeholders so only successfully converted prompts remain
transformed_dataset = list(filter(lambda item: item is not None, transformed_dataset))
print(f"total non-null entries: {len(transformed_dataset)}")


Found cached dataset parquet (C:/Users/1seba/.cache/huggingface/datasets/rizerphe___parquet/rizerphe--glaive-function-calling-v2-llama-04a01feb10d7a4f7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

total entries: 103091
total non-null entries: 51885


In [3]:
# Spot-check one converted prompt; the index is arbitrary and only needs to be in range
sample_index = 45000
print(transformed_dataset[sample_index])

<s>[INST] <<SYS>>
Your job is to identify weather or not the user input is related to the function specification delimited by <FUNCTIONS> and </FUNCTIONS>. If it is related then your response should be in the function_calling format: <FUNCTION_CALL_NAME>NAME_ASSOCIATED_WITH_THE_FUNCTION</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT</FUNCTION_CALL_ARGUMENTS>. Otherwise simply return a normal response. <FUNCTIONS>[{
    "name": "translate_text",
    "description": "Translate text from one language to another",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to be translated"
            },
            "source_language": {
                "type": "string",
                "description": "The source language of the text"
            },
            "target_language": {
                "type": "string",
                "description": "The 

In [4]:
from gaitor_function_calling.data.prompting_utils import parse_prompt_back_to_data, INSTRUCTION

# Convert each prompt string with parse_prompt_back_to_data so the list can be dumped as JSON.
# NOTE(review): the shape of each parsed record is defined by that helper - confirm there.
jsonified_dataset = list(
    map(lambda prompt: parse_prompt_back_to_data(prompt, INSTRUCTION), transformed_dataset)
)

In [14]:
# Save the parsed records to <cwd>/base/glaive_dataset-only_fc.json
current_dir = os.getcwd()
base_dir = os.path.join(current_dir, "base")
# Create the output folder if it is missing so a fresh checkout does not fail in open()
os.makedirs(base_dir, exist_ok=True)
# Explicit encoding keeps the file independent of the platform's default
with open(os.path.join(base_dir, "glaive_dataset-only_fc.json"), "w", encoding="utf-8") as f:
    json.dump(jsonified_dataset, f, indent=4)

In [3]:
from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION

# Point a DataAbstractor at the JSON file under <cwd>/base, build the train/test
# split and persist it together with the raw data.
# In the recorded run these arguments put all 51885 entries into train and none into test.
current_dir = os.getcwd()
base_json_path = current_dir + "/base/glaive_dataset-only_fc.json"
data_abstractor = DataAbstractor(base_json_path, "glaive-full_train")
# NOTE(review): the meaning of the positional `False, 1` arguments is defined by build_data - confirm there
train, test = data_abstractor.build_data(INSTRUCTION, False, 1)
data_abstractor.save(data_abstractor.raw_data, train, test)

No train data found
No test data found
Train data size: 51885
Test data size: 0
Raw data saved to c:\Projects\function_calling\gaitor_function_calling\data/base\glaive_dataset-only_fc-train-glaive-full_train\raw.json
Train data saved to c:\Projects\function_calling\gaitor_function_calling\data/base\glaive_dataset-only_fc-train-glaive-full_train\train.json


1

In [16]:
from gaitor_function_calling.data.data_utils import DataAbstractor

# Re-open the previously saved raw.json through a new DataAbstractor.
# NOTE(review): this reads from <cwd>/train_test/..., while the save output recorded
# earlier in this notebook reports a path under <cwd>/base/... - confirm which location is current.
current_dir = os.getcwd()
raw_json_path = current_dir + "/train_test/glaive_dataset-only_fc-train-glaive-full_train/raw.json"
data_abstractor = DataAbstractor(raw_json_path, "glaive-full_train")


No test data found


51885

In [17]:
# Display the first training record to eyeball the final prompt format
data_abstractor.train_data[0]

{'text': '<s>[INST] <<SYS>>\nYour job is to identify weather or not the user input is related to the function specification delimited by <FUNCTIONS> and </FUNCTIONS>. If it is related then your response should be in the function_calling format: <FUNCTION_CALL_NAME>NAME_ASSOCIATED_WITH_THE_FUNCTION</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT</FUNCTION_CALL_ARGUMENTS>. Otherwise simply return a normal response. \n<FUNCTIONS>[{"name": "get_news_headlines", "description": "Get the latest news headlines", "parameters": {"type": "object", "properties": {"country": {"type": "string", "description": "The country for which to fetch news"}}, "required": ["country"]}}]</FUNCTIONS>\n<</SYS>>\n\nCan you tell me the latest news headlines for the United States?  [/INST] <FUNCTION_CALL_NAME>get_news_headlines</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{\n    "country": "United States"\n}</FUNCTION_CALL_ARGUMENTS> </s>'}

In [10]:
# Show the working directory that the cwd-based paths in this notebook are built from
os.getcwd()

'c:\\Projects\\function_calling\\gaitor_function_calling\\data'