## Pre-Processing

In [1]:
# Location of the base model; the tokenizer is loaded from this path.
base_model = "./mistral-7b-instruct-v02"

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch
from datasets import load_dataset, load_from_disk
from trl import SFTTrainer
import pandas as pd
import json, re
# Tokenizer for `base_model`; its chat template is applied to every transformed sample.
# NOTE(review): trust_remote_code=True presumably lets custom code shipped with the
# model files run at load time — confirm the checkpoint is trusted.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Directory containing the JSON files

In [3]:
# Folder holding the raw training JSON files.
directory = 'training_data-18'
# Every run of digits in the folder name; numbers[0] ("18" here) is used as the
# suffix of the output folder and of the CSV file name.
numbers = re.findall(r'\d+', directory)

In [4]:
# Instruction prompt placed at the start of every sample by transform_json and
# transform_json_v2. The text is part of the training data: do not reformat it.
# The %%%%%%% delimiter occurs four times inside this prompt.
sys_prompt = """

You are a smart home IOT agent.
You will be given a user command enclosed within @@@ Task @@@.

You have to generate step by step execution plan for the given user command in json format enclosed between !!!:
!!!
%%%%%%%
{

    "tool_calls": [
        {
            "function_name": "function 1",
            "arguments": 
                {
                    "argument_name 1": "argument value"
                }
         },
         {
            "function_name": "function 2",
            "arguments": 
                {
                    "argument_name 1": "argument value",
                    "argument_name 2": "argument value"
                }
        }
    ]
}
%%%%%%%
!!!
Keywords are defined as below:

tool_calls: List of tool call objects, where each tool call object contains:  
    - function_name: It is name of single execution step for the complete plan. (e.g., identify_device, powerSwitch_getStatus, etc.) Plan can have 1 or more functions in response.
    - arguments: Dictionary of arguments for the function_name, which are required for execution of the function. (e.g., device, location, application, title, etc.)

Do not add any superfluous steps. Make sure that each step has all the information needed - do not skip steps.
If it is impossible to generate step by step plan, reply empty list of function. Ensure to enclose the output with "%%%%%%%" after generating with the json format. Do not generate anything after the second %%%%%%%.



"""

In [5]:
# Variant of sys_prompt with one extra instruction telling the model to restrict
# itself to devices listed in the homecontext data.
# NOTE(review): no code in the visible part of this notebook references this
# variable — confirm whether it is still needed.
sys_prompt_implicit = """

You are a smart home IOT agent.
You will be given a user command enclosed within @@@ Task @@@.

You have to generate step by step execution plan for the given user command in json format enclosed between !!!:
!!!
%%%%%%%
{

    "tool_calls": [
        {
            "function_name": "function 1",
            "arguments": 
                {
                    "argument_name 1": "argument value"
                }
         },
         {
            "function_name": "function 2",
            "arguments": 
                {
                    "argument_name 1": "argument value",
                    "argument_name 2": "argument value"
                }
        }
    ]
}
%%%%%%%
!!!
Keywords are defined as below:

tool_calls: List of tool call objects, where each tool call object contains:  
    - function_name: It is name of single execution step for the complete plan. (e.g., identify_device, powerSwitch_getStatus, etc.) Plan can have 1 or more functions in response.
    - arguments: Dictionary of arguments for the function_name, which are required for execution of the function. (e.g., device, location, application, title, etc.)

Use homecontext to understand the devices, and only consider devices mentioned in the homecontext data. Do not generate prediction for irrelavant devices.

Do not add any superfluous steps. Make sure that each step has all the information needed - do not skip steps.
If it is impossible to generate step by step plan, reply empty list of function. Ensure to enclose the output with "%%%%%%%" after generating with the json format. Do not generate anything after the second %%%%%%%.



"""

Function to transform a single JSON file

In [6]:
def transform_json(input_json):
    """Convert one raw training sample into a list of chat messages.

    The returned list is built in this order:

    1. One 'user' message combining the system prompt (plus the JSON-encoded
       content of a 'context' message, if any), the user command wrapped in
       ``@@@``, and a ``### TOOLS ###`` section holding the JSON-encoded
       ``input_json['tools']`` when that key is present.
    2. One 'assistant' message per entry of every assistant ``tool_calls`` list.
    3. One 'user' message per 'tool' message, prefixed ``### TOOLS RESPONSE ###``.
    4. The first assistant message without ``tool_calls``, if it has content.

    The first assistant message of the result is wrapped in ``%%%%%%%``
    delimiters. If the sample contains no assistant turn, nothing is wrapped
    and only the messages that exist are returned.

    Args:
        input_json: Parsed sample dict with a 'messages' list and, optionally,
            a 'tools' entry.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts.

    Raises:
        KeyError: If 'messages', or a message's 'role'/'content', is missing.
    """
    messages = input_json['messages']

    system_prompt = sys_prompt
    user_content = ""
    for message in messages:
        if message['role'] == 'context':
            # Home context is appended to the fixed prompt as JSON text.
            system_prompt = sys_prompt + " " + json.dumps(message['content'])
        elif message['role'] == 'user':
            user_content = '@@@' + message['content'] + "@@@"

    tools_content = "### TOOLS ### "
    if 'tools' in input_json:
        tools_content += " " + json.dumps(input_json['tools'])

    chat = [{'role': 'user', 'content': f"{system_prompt} {user_content} {tools_content}"}]

    # Tool calls first, then tool responses, then the closing assistant reply.
    for message in messages:
        if message['role'] == 'assistant' and 'tool_calls' in message:
            for call in message['tool_calls']:
                chat.append({'role': 'assistant', 'content': json.dumps(call)})

    for message in messages:
        if message['role'] == 'tool':
            chat.append({'role': 'user', 'content': '### TOOLS RESPONSE ### ' + json.dumps(message['content'])})

    final_assistant_message = next(
        (message['content'] for message in messages
         if message['role'] == 'assistant' and 'tool_calls' not in message),
        None
    )
    if final_assistant_message:
        chat.append({'role': 'assistant', 'content': final_assistant_message})

    # Wrap the first assistant reply in the delimiters. Indexing chat[1] blindly
    # raised IndexError for samples without any reply and could wrap a tool
    # response instead of the assistant turn.
    first_reply = next((entry for entry in chat if entry['role'] == 'assistant'), None)
    if first_reply is not None:
        first_reply['content'] = " %%%%%%% " + first_reply['content'] + " %%%%%%% "

    return chat

#### Transform 2.0 code

This converts the input JSON into the required format using the transform v1.0 approach.
Additionally, it adds the tool definitions, which are generated from the files in the actions directory.

In [7]:
def transform_json_v2(input_json, actions_dir="griham_actions"):
    """Convert one raw training sample into chat messages, rebuilding the tool list.

    Works like the v1 transform, except that the ``### TOOLS ###`` section is
    not copied from the sample. Instead, for every function name found in the
    sample's ``tool_calls``, ``<actions_dir>/<name>.json`` is loaded and passed
    through ``convert_tools_to_required_format``. This only happens when the
    sample has a 'tools' key.

    The returned list is built in this order:

    1. One 'user' message: system prompt (plus JSON-encoded 'context' content,
       if any), the user command wrapped in ``@@@``, and the tools section.
    2. One 'assistant' message per entry of every assistant ``tool_calls`` list.
    3. One 'user' message per 'tool' message, prefixed ``### TOOLS RESPONSE ###``.
    4. The first assistant message without ``tool_calls``, if it has content.

    The first assistant message of the result is wrapped in ``%%%%%%%``
    delimiters. If the sample contains no assistant turn, nothing is wrapped.

    Args:
        input_json: Parsed sample dict with a 'messages' list and, optionally,
            a 'tools' entry.
        actions_dir: Directory holding one ``<function name>.json`` description
            per action. Defaults to the previously hard-coded "griham_actions".

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts.

    Raises:
        KeyError: If 'messages', or a message's 'role'/'content', is missing.
    """
    messages = input_json['messages']

    system_prompt = sys_prompt
    user_content = ""
    function_list = []

    for message in messages:
        if message['role'] == 'context':
            # Home context is appended to the fixed prompt as JSON text.
            system_prompt = sys_prompt + " " + json.dumps(message['content'])
        elif message['role'] == 'user':
            user_content = '@@@' + message['content'] + "@@@"
        elif "tool_calls" in message:
            # Each tool call carries a list of functions; collect their names.
            for tool in message["tool_calls"]:
                for function in tool['function']:
                    function_list.append(function['name'])

    tools_content = "### TOOLS ### "
    if 'tools' in input_json:
        transformed_tools = []
        for functionName in function_list:
            file_path = f"{actions_dir}/{functionName}.json"
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    function_data = json.load(file)
                transformed_tools.append(convert_tools_to_required_format(function_data))
            except Exception as exc:
                # Best effort: skip this tool but report why. Unlike a bare
                # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
                print("Data could not be transformed for " + functionName + f" ({exc!r})")
        tools_content += " " + json.dumps(transformed_tools)

    chat = [{'role': 'user', 'content': f"{system_prompt} {user_content} {tools_content}"}]

    # Tool calls first, then tool responses, then the closing assistant reply.
    for message in messages:
        if message['role'] == 'assistant' and 'tool_calls' in message:
            for call in message['tool_calls']:
                chat.append({'role': 'assistant', 'content': json.dumps(call)})

    for message in messages:
        if message['role'] == 'tool':
            chat.append({'role': 'user', 'content': '### TOOLS RESPONSE ### ' + json.dumps(message['content'])})

    final_assistant_message = next(
        (message['content'] for message in messages
         if message['role'] == 'assistant' and 'tool_calls' not in message),
        None
    )
    if final_assistant_message:
        chat.append({'role': 'assistant', 'content': final_assistant_message})

    # Wrap the first assistant reply in the delimiters. Indexing chat[1] blindly
    # raised IndexError for samples without any reply and could wrap a tool
    # response instead of the assistant turn.
    first_reply = next((entry for entry in chat if entry['role'] == 'assistant'), None)
    if first_reply is not None:
        first_reply['content'] = " %%%%%%% " + first_reply['content'] + " %%%%%%% "

    return chat


def convert_tools_to_required_format(input_json):
    """Reshape one action description into the tool-schema dict used in the prompt.

    Args:
        input_json: Already-parsed dict (used as-is, nothing is decoded here)
            with "name", "description" and a "parameters" list. Every parameter
            entry must provide "name", "type", "description" and "optional".

    Returns:
        ``{"type": "function", "function": [ {...} ]}`` where the single
        function entry carries the name, the description and an object-style
        "parameters" schema. Parameters whose "optional" value is falsy are
        listed under "required".

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    tool_name = input_json["name"]
    tool_description = input_json["description"]

    properties = {}
    required = []
    for spec in input_json["parameters"]:
        entry = {"type": spec["type"], "description": spec["description"]}
        properties[spec["name"]] = entry
        if not spec["optional"]:
            required.append(spec["name"])

    schema = {"type": "object", "properties": properties, "required": required}
    function_entry = {
        "name": tool_name,
        "description": tool_description,
        "parameters": schema,
    }
    return {"type": "function", "function": [function_entry]}

In [10]:
import json

# Run transform_json_v2 on a single sample file to eyeball the result.
# The `with` block closes the file handle even if parsing fails.
with open(directory + '/brightness_getValue_0.json', encoding='utf-8') as f:
    # json.load returns the parsed sample as a dictionary
    x = json.load(f)
y = transform_json_v2(x)
print(y)

[{'role': 'user', 'content': '\n\nYou are a smart home IOT agent.\nYou will be given a user command enclosed within @@@ Task @@@.\n\nYou have to generate step by step execution plan for the given user command in json format enclosed between !!!:\n!!!\n%%%%%%%\n{\n\n    "tool_calls": [\n        {\n            "function_name": "function 1",\n            "arguments": \n                {\n                    "argument_name 1": "argument value"\n                }\n         },\n         {\n            "function_name": "function 2",\n            "arguments": \n                {\n                    "argument_name 1": "argument value",\n                    "argument_name 2": "argument value"\n                }\n        }\n    ]\n}\n%%%%%%%\n!!!\nKeywords are defined as below:\n\ntool_calls: List of tool call objects, where each tool call object contains:  \n    - function_name: It is name of single execution step for the complete plan. (e.g., identify_device, powerSwitch_getStatus, etc.) Plan 

In [None]:
# Run the v1 transform on a single sample file to eyeball the result.
# The `with` block closes the file handle even if parsing fails.
with open(directory+'/Template1_TC_01.json', encoding='utf-8') as f:
    # json.load returns the parsed sample as a dictionary
    x = json.load(f)
y = transform_json(x)
print(y)

In [45]:
# Second chat message, i.e. the first entry appended after the combined prompt.
temp = y[1]
# Splitting on the delimiter shows what sits between the two %%%%%%% markers.
temp['content'].split("%%%%%%%")

[' ',
 ' {"type": "function", "function": [{"name": "identify_device", "arguments": {"device": "light", "location": "kitchen"}}, {"name": "brightness_getValue", "arguments": {"device": "#E1"}}]} ',
 ' ']

Initialize list to store encoded strings

In [12]:
# Collects one chat-templated string per training file.
encoded_strings = []

Iterate over all JSON files in the directory

In [13]:
# Transform every sample in `directory`, save the transformed chat as JSON and
# collect the chat-templated text for the training CSV.

# Start from an empty list so re-running this cell does not append duplicates
# of a previous run.
encoded_strings = []

folder_path = 'edited_transformed_json-tr' + numbers[0]
# Create the output folder once, up front; exist_ok makes re-runs harmless.
os.makedirs(folder_path, exist_ok=True)

# os.listdir returns names in arbitrary order; sort for a reproducible row order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(directory, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        input_json = json.load(file)

    transformed_json = transform_json_v2(input_json)

    output_filepath = os.path.join(folder_path, 'transformed_' + filename)
    with open(output_filepath, 'w') as output_file:
        json.dump(transformed_json, output_file, indent=2)

    # The in-memory chat is exactly what was just written, so it is templated
    # directly instead of re-reading the file from disk.
    encoded_string = tokenizer.apply_chat_template(transformed_json, tokenize=False, add_generation_prompt=True)
    encoded_strings.append(encoded_string)


Convert the encoded strings to a CSV file and save it

In [14]:
# Single-column frame ('train') with one chat-templated string per sample file.
df = pd.DataFrame({'train': encoded_strings})
# index=False keeps the row index out of the CSV.
df.to_csv('training_data_encoded-' + numbers[0] +'.csv', index=False)

In [15]:
# Preview the encoded training frame.
df

Unnamed: 0,train
0,<s>[INST] \n\nYou are a smart home IOT agent.\...
1,<s>[INST] \n\nYou are a smart home IOT agent.\...
2,<s>[INST] \n\nYou are a smart home IOT agent.\...
3,<s>[INST] \n\nYou are a smart home IOT agent.\...
4,<s>[INST] \n\nYou are a smart home IOT agent.\...
...,...
112,<s>[INST] \n\nYou are a smart home IOT agent.\...
113,<s>[INST] \n\nYou are a smart home IOT agent.\...
114,<s>[INST] \n\nYou are a smart home IOT agent.\...
115,<s>[INST] \n\nYou are a smart home IOT agent.\...


In [16]:
# Take one encoded sample and cut it at every %%%%%%% delimiter.
st = df['train'][1]
temp = st.split("%%%%%%%")
# sys_prompt itself contains the delimiter four times (pieces 0-4), so piece 5 is
# the text between the pair of delimiters wrapped around the first assistant reply.
# NOTE(review): holds only if the context/user/tools text contains no %%%%%%% — confirm.
print(temp[5])

 {"type": "function", "function": [{"name": "identify_device", "arguments": {"device": "AC"}}, {"name": "airConditioningMode_getValue", "arguments": {"device": "#E1"}}]} 


In [17]:
import pandas as pd 
# Read back the CSV written earlier to check what was stored on disk.
print("Loading dataset training_data_encoded-" + numbers[0] +'.csv' + ' ....')
df2 = pd.read_csv('training_data_encoded-' + numbers[0] +'.csv')
df2

Loading dataset training_data_encoded-18.csv ....


Unnamed: 0,train
0,<s>[INST] \n\nYou are a smart home IOT agent.\...
1,<s>[INST] \n\nYou are a smart home IOT agent.\...
2,<s>[INST] \n\nYou are a smart home IOT agent.\...
3,<s>[INST] \n\nYou are a smart home IOT agent.\...
4,<s>[INST] \n\nYou are a smart home IOT agent.\...
...,...
112,<s>[INST] \n\nYou are a smart home IOT agent.\...
113,<s>[INST] \n\nYou are a smart home IOT agent.\...
114,<s>[INST] \n\nYou are a smart home IOT agent.\...
115,<s>[INST] \n\nYou are a smart home IOT agent.\...


In [18]:
# Show one reloaded row in full.
df2['train'][1]

'<s>[INST] \n\nYou are a smart home IOT agent.\nYou will be given a user command enclosed within @@@ Task @@@.\n\nYou have to generate step by step execution plan for the given user command in json format enclosed between !!!:\n!!!\n%%%%%%%\n{\n\n    "tool_calls": [\n        {\n            "function_name": "function 1",\n            "arguments": \n                {\n                    "argument_name 1": "argument value"\n                }\n         },\n         {\n            "function_name": "function 2",\n            "arguments": \n                {\n                    "argument_name 1": "argument value",\n                    "argument_name 2": "argument value"\n                }\n        }\n    ]\n}\n%%%%%%%\n!!!\nKeywords are defined as below:\n\ntool_calls: List of tool call objects, where each tool call object contains:  \n    - function_name: It is name of single execution step for the complete plan. (e.g., identify_device, powerSwitch_getStatus, etc.) Plan can have 1 or more 