In [1]:
import json

# Load the raw production samples. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open('../production_data_without_summary.json', encoding='utf-8') as f:
    data = json.load(f)

# Special marker tokens used to delimit function-calling sections in a prompt.
# Each tag maps to its opening/closing marker; "all" is the flat list of every
# marker in (start, end) order per tag.
function_calling_tokens = {
    tag: {"start": f"<{tag}>", "end": f"</{tag}>"}
    for tag in ("FUNCTIONS", "FUNCTION_CALL_NAME", "FUNCTION_CALL_ARGUMENTS")
}
function_calling_tokens["all"] = [
    marker
    for pair in list(function_calling_tokens.values())
    for marker in (pair["start"], pair["end"])
]

# Keep two reference samples around for the cells below and pretty-print them.
# Judging by the names, index 0 is a plain reply and index 30 a function call
# -- TODO confirm against the dataset.
data_no_fc = data[0]
print(f"{json.dumps(data_no_fc, indent=4)}\n\n\n", end="")

data_fc = data[30]
print(f"{json.dumps(data_fc, indent=4)}\n\n\n", end="")

{
    "input": [
        {
            "chatgptMessage": {
                "role": "user",
                "content": "hi"
            },
            "functions": [
                {
                    "name": "transcodeWebPage",
                    "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "json": {
                                "properties": {
                                    "link": {
                                        "type": "string",
                                        "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-tim

In [2]:
def build_prompt(instance, tokens=None):
    """
    Build a single training prompt string from one data instance.

    The prompt has the layout
    ``<s>[INST] <<SYS>>\\n<FUNCTIONS>...</FUNCTIONS>\\n<</SYS>>\\n\\n{user} [/INST] {target}</s>``
    where ``{target}`` is either the assistant's text content or a function
    call wrapped in the FUNCTION_CALL_NAME / FUNCTION_CALL_ARGUMENTS tokens.
    Only the first entry of ``instance['input']`` is used.

    :param instance: A dictionary representing a single instance of the data. It must provide
        ``instance['input'][0]['chatgptMessage']['content']``, ``instance['input'][0]['functions']``
        and ``instance['target']['chatgptMessage']``.
    :param tokens: Optional dictionary with ``start``/``end`` markers under the keys ``FUNCTIONS``,
        ``FUNCTION_CALL_NAME`` and ``FUNCTION_CALL_ARGUMENTS``. Defaults to the notebook-level
        ``function_calling_tokens``.
    :return: A string representing the constructed prompt.
    :raises KeyError: If a required key is missing from ``instance`` or ``tokens``.
    :raises IndexError: If ``instance['input']`` is empty.
    """
    if tokens is None:
        # Fall back to the token dictionary defined at notebook level.
        tokens = function_calling_tokens

    input_message = instance['input'][0]['chatgptMessage']['content']
    target_message = instance['target']['chatgptMessage']
    functions = instance['input'][0]['functions']

    # The function definitions are embedded verbatim as a JSON string.
    functions_str = json.dumps(functions)

    # Everything up to and including "[/INST] " is identical for both target kinds.
    header = (
        f"<s>[INST] <<SYS>>\n"
        f"{tokens['FUNCTIONS']['start']}{functions_str}{tokens['FUNCTIONS']['end']}"
        f"\n<</SYS>>\n\n{input_message} [/INST] "
    )

    if 'function_call' in target_message:
        function_call = target_message['function_call']
        name_tokens = tokens['FUNCTION_CALL_NAME']
        arguments_tokens = tokens['FUNCTION_CALL_ARGUMENTS']
        # NOTE: this branch emits a space before </s> while the plain-content
        # branch does not; the existing output format is kept as is.
        return (
            f"{header}"
            f"{name_tokens['start']}{function_call['name']}{name_tokens['end']}"
            f"{arguments_tokens['start']}{function_call['arguments']}{arguments_tokens['end']} </s>"
        )

    # A missing or null content must become an empty target, not the text "None".
    target_content = target_message.get('content') or ''
    return f"{header}{target_content}</s>"

# Render both reference samples as prompts and show them, each followed by
# three newlines.
test_prompt_no_fc = build_prompt(data_no_fc)
print(f"{test_prompt_no_fc}\n\n\n", end="")

test_prompt_fc = build_prompt(data_fc)
print(f"{test_prompt_fc}\n\n\n", end="")


<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

hi [/INST] Hello! How can I assist you today?</s>


<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"

In [3]:
import re
def parse_prompt_back_to_data(prompt, tokens=None):
    """
    Parse a prompt string back into the dictionary layout of a data instance.

    The function list, the user message and the target (either plain content
    or a function call) are extracted with regular expressions. All patterns
    are matched with ``re.DOTALL`` so that user messages, assistant replies and
    function-call arguments containing newlines are recovered in full.

    :param prompt: A string representing the constructed prompt.
    :param tokens: Optional dictionary with ``start``/``end`` markers under the keys ``FUNCTIONS``,
        ``FUNCTION_CALL_NAME`` and ``FUNCTION_CALL_ARGUMENTS``. Defaults to the notebook-level
        ``function_calling_tokens``.
    :return: A dictionary with an ``input`` list (one user message plus the functions) and a
        ``target`` holding the assistant message and the same functions list.
    :raises AttributeError: If the functions section, the user message or (for a function-call
        prompt) the name/arguments section cannot be found in ``prompt``.
    :raises json.JSONDecodeError: If the text between the FUNCTIONS tokens is not valid JSON.
    """
    if tokens is None:
        # Fall back to the token dictionary defined at notebook level.
        tokens = function_calling_tokens

    def _between(pair):
        # Non-greedy capture between a start/end marker; markers are escaped so
        # they are always matched literally, whatever characters they contain.
        return rf"{re.escape(pair['start'])}(.*?){re.escape(pair['end'])}"

    functions_pattern = _between(tokens['FUNCTIONS'])
    input_pattern = r"<</SYS>>\n\n(.*?) \[/INST\]"  # fixed prompt scaffolding, not token-dependent
    target_content_pattern = r"\[/INST\] (.*)</s>"  # greedy: runs up to the last </s>
    function_call_name_pattern = _between(tokens['FUNCTION_CALL_NAME'])
    function_call_arguments_pattern = _between(tokens['FUNCTION_CALL_ARGUMENTS'])

    # DOTALL lets "." span newlines, which occur in real messages and arguments.
    functions_str = re.search(functions_pattern, prompt, re.DOTALL).group(1)
    input_content = re.search(input_pattern, prompt, re.DOTALL).group(1)
    target_content_match = re.search(target_content_pattern, prompt, re.DOTALL)

    # The functions were embedded as a JSON string; decode them back.
    functions = json.loads(functions_str)

    instance = {
        "input": [{
            "chatgptMessage": {"role": "user", "content": input_content},
            "functions": functions
        }],
        "target": {
            "chatgptMessage": {"role": "assistant"},
            "functions": functions  # Including functions in the target as well
        }
    }

    # A function-call target is recognised by the presence of the name start token.
    if tokens['FUNCTION_CALL_NAME']['start'] in prompt:
        function_call_name = re.search(function_call_name_pattern, prompt, re.DOTALL).group(1)
        function_call_arguments = re.search(function_call_arguments_pattern, prompt, re.DOTALL).group(1)
        instance["target"]["chatgptMessage"]["function_call"] = {
            "name": function_call_name,
            "arguments": function_call_arguments
        }
    elif target_content_match:
        # Without a match the assistant message is left without a "content" key.
        instance["target"]["chatgptMessage"]["content"] = target_content_match.group(1)

    return instance

# Round-trip check: parse the two prompts built earlier back into dictionaries
# and pretty-print the results.
parsed_data_example_no_fc = parse_prompt_back_to_data(test_prompt_no_fc)
print(f"{json.dumps(parsed_data_example_no_fc, indent=4)}\n\n\n", end="")

parsed_data_example_fc = parse_prompt_back_to_data(test_prompt_fc)
print(f"{json.dumps(parsed_data_example_fc, indent=4)}\n\n\n", end="")


{
    "input": [
        {
            "chatgptMessage": {
                "role": "user",
                "content": "hi"
            },
            "functions": [
                {
                    "name": "transcodeWebPage",
                    "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "json": {
                                "properties": {
                                    "link": {
                                        "type": "string",
                                        "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-tim

In [5]:
# save production_data_train_chat
# Convert every instance into a {"text": prompt} record. Instances that
# build_prompt cannot handle are still skipped (best effort), but they are now
# recorded and reported instead of disappearing silently.
print("data: ", len(data))
modified_data = []
skipped_instances = []  # (index into data, repr of the error) for every dropped instance
for instance_index, instance in enumerate(data):
    try:
        modified_data.append({"text": build_prompt(instance)})
    except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
        skipped_instances.append((instance_index, repr(exc)))
print("modified data: ", len(modified_data))
for instance_index, error in skipped_instances:
    print(f"skipped instance {instance_index}: {error}")

# NOTE: once an earlier instance has been skipped, modified_data[i] no longer
# corresponds to data[i].
print(modified_data[0]["text"], end='\n\n\n')
print(modified_data[30]["text"], end='\n\n\n')

with open('production_train_chat.json', 'w', encoding='utf-8') as f:
    json.dump(modified_data, f)

data:  1369
modified data:  1367
<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

hi [/INST] Hello! How can I assist you today?</s>


<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"