In [29]:
import json
# Load the raw message export (production_data.json) and the
# namespace -> functions lookup (ns_to_function_map.json).
# NOTE: the variable keeps the name `development_data` even though it holds the
# production export, because later cells refer to it by that name.
# Both files are opened explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('production_data.json', encoding='utf-8') as f:
    development_data = json.load(f)
with open('ns_to_function_map.json', encoding='utf-8') as f:
    ns_to_function_map = json.load(f)
# Every later cell that saves results writes to this path.
save_to_file = "production_conversations.json"

In [30]:
from collections import defaultdict
from datetime import datetime

# Bucket the raw messages under the conversation they belong to.
grouped_data_temp = defaultdict(list)
for entry in development_data:
    conversation_key = entry["conversationId"]
    grouped_data_temp[conversation_key].append(entry)

# Put each conversation's messages into chronological order, parsing the
# "timeSent" string (e.g. '2023-09-14T23:51:12.048Z') into a datetime.
for conversation in grouped_data_temp.values():
    conversation.sort(
        key=lambda message: datetime.strptime(message["timeSent"], "%Y-%m-%dT%H:%M:%S.%fZ")
    )

# Convert to a plain dict so that looking up an unknown id raises KeyError
# instead of silently creating an empty conversation.
grouped_data = dict(grouped_data_temp)

# Display the first conversation id together with its sorted messages.
first_conversation_id = list(grouped_data)[0]
first_conversation_id, grouped_data[first_conversation_id]


('c9175fb9-1f09-49c5-bed2-593921504b5a',
 [{'_id': '65039c6f4210447e1d2f1946',
   'id': '81c67ab9-fe85-426f-8d82-5a775ed0edee',
   'role': 'user',
   'chatgptMessage': {'role': 'user', 'content': 'To build a new app '},
   'openpluginNamespace': None,
   'error': None,
   'timeSent': '2023-09-14T23:51:12.048Z',
   'timeComplete': '2023-09-14T23:51:12.048Z',
   'conversationId': 'c9175fb9-1f09-49c5-bed2-593921504b5a',
   'user_id': '64bf281578f994e79e237dcb'},
  {'_id': '65039c7ff360b3d5defdef9e',
   'id': '0785053e-7b17-4694-90c9-9d527713ab14',
   'role': 'assistant',
   'chatgptMessage': {'role': 'assistant',
    'content': "To build a new app, you will need to follow these steps:\n\n1. Define the purpose and target audience of your app: Determine what problem your app will solve and who will benefit from using it. This will help you create a clear vision for your app.\n\n2. Conduct market research: Analyze the existing apps in the market that are similar to your idea. Identify their 

In [31]:
conversations = []

for conversation_id, conversation in grouped_data.items():
    # Reduce every raw message to the three fields kept downstream: the chat
    # payload, the ns_to_function_map entry for its plugin namespace (None when
    # the namespace is missing or unknown), and the send time.
    structured_messages = []
    for entry in conversation:
        namespace = entry["openpluginNamespace"]
        known_namespace = namespace and namespace in ns_to_function_map
        structured_messages.append({
            "chatgptMessage": entry["chatgptMessage"],
            "functions": ns_to_function_map[namespace] if known_namespace else None,
            "timeSent": entry["timeSent"],
        })

    conversations.append({
        "conversationId": conversation_id,
        "conversation": structured_messages,
    })

conversations[:1]  # Show only the first structured conversation


[{'conversationId': 'c9175fb9-1f09-49c5-bed2-593921504b5a',
  'conversation': [{'chatgptMessage': {'role': 'user',
     'content': 'To build a new app '},
    'functions': None,
    'timeSent': '2023-09-14T23:51:12.048Z'},
   {'chatgptMessage': {'role': 'assistant',
     'content': "To build a new app, you will need to follow these steps:\n\n1. Define the purpose and target audience of your app: Determine what problem your app will solve and who will benefit from using it. This will help you create a clear vision for your app.\n\n2. Conduct market research: Analyze the existing apps in the market that are similar to your idea. Identify their strengths and weaknesses to understand how you can differentiate your app and provide a better user experience.\n\n3. Create a wireframe or prototype: Sketch out the basic layout and functionality of your app. This will help you visualize the user interface and user experience before moving on to development.\n\n4. Design the user interface (UI): C

In [32]:

# Remove every message whose "chatgptMessage" is falsy (None / empty); the
# conversation itself is kept even if it ends up with no messages.
for conversation in conversations:
    kept_messages = []
    for message in conversation["conversation"]:
        if message["chatgptMessage"]:
            kept_messages.append(message)
    conversation["conversation"] = kept_messages

# Write the current state of the conversations to disk.
with open(save_to_file, "w") as output_file:
    json.dump(conversations, output_file)

# test with conversationId = hX1Dzy


In [33]:
len(conversations)  # Conversation count after the empty-message filter (that step drops messages, not conversations)

557

In [38]:
# if there are any conversations that contain a message.chatgptMessage.role "function" whose previous message is not a message.chatgptMessage.role "assistant" with message.chatgptMessage.function_call, then delete the conversation
def delete_conversations_without_required_messages(conversations):
    """
    Drop conversations whose function-call exchanges are malformed.

    A conversation is removed when any of its messages either
      * has no "role" key in its "chatgptMessage", or
      * has role "function" but is not immediately preceded by a message with
        role "assistant" carrying a truthy "function_call".

    A "function" message at the very start of a conversation has no predecessor
    and therefore also disqualifies the conversation.

    Args:
        conversations: list of dicts, each with a "conversationId" and a
            "conversation" list; every message holds a "chatgptMessage" dict.

    Returns:
        A new list with only the conversations that passed the check, in their
        original order. The input list itself is not modified. Removal is by
        "conversationId", so every conversation sharing a rejected id is dropped.
    """
    # A set gives O(1) membership tests in the final filter.
    conversations_to_delete = set()
    for conversation in conversations:
        messages = conversation["conversation"]
        for i, message in enumerate(messages):
            chat_message = message["chatgptMessage"]
            if "role" not in chat_message:
                conversations_to_delete.add(conversation["conversationId"])
                break
            if chat_message["role"] != "function":
                continue
            # Delete if a function message is not preceded by an assistant function_call.
            # i == 0 is handled explicitly: messages[-1] would silently wrap
            # around to the LAST message instead of meaning "no predecessor".
            previous = messages[i - 1]["chatgptMessage"] if i > 0 else None
            if (previous is None
                    or previous.get("role") != "assistant"
                    or not previous.get("function_call")):
                conversations_to_delete.add(conversation["conversationId"])
                break
    return [
        conversation
        for conversation in conversations
        if conversation["conversationId"] not in conversations_to_delete
    ]

# Keep only the conversations with well-formed function-call exchanges.
conversations = delete_conversations_without_required_messages(conversations)

# Overwrite the saved file with the filtered conversations.
with open(save_to_file, "w") as output_file:
    json.dump(conversations, output_file)
# test with conversationId = ea463f0e-9cd1-4af6-87d1-8e722334b97b

In [39]:
len(conversations)  # Conversations remaining after delete_conversations_without_required_messages

514

In [40]:
def _requests_function_call(chat_message):
    """Return True if chat_message is an assistant message with a truthy "function_call"."""
    return chat_message.get("role") == "assistant" and bool(chat_message.get("function_call"))


def reorder_function_call_exchange(conversations):
    """
    Repair misordered function-call exchanges.

    Whenever a "function" message is immediately followed by an "assistant"
    message carrying a "function_call", the two are swapped so that the call
    precedes its result.

    The swap is skipped when the "function" message is already directly preceded
    by an assistant function_call: in a chain such as
    [call_a, result_a, call_b, result_b] the pair (result_a, call_b) is in the
    right order, and swapping it would separate both results from their calls.

    A "function_call" key whose value is None/empty does not count as a call,
    matching the truthiness check used when filtering conversations.

    Args:
        conversations: list of dicts, each with a "conversation" list whose
            messages hold a "chatgptMessage" dict.

    Returns:
        The same `conversations` list; the message lists are reordered in place.
    """
    for conversation in conversations:
        messages = conversation["conversation"]
        i = 0
        while i < len(messages) - 1:
            current = messages[i]["chatgptMessage"]
            following = messages[i + 1]["chatgptMessage"]
            already_paired = i > 0 and _requests_function_call(messages[i - 1]["chatgptMessage"])

            if (current.get("role") == "function"
                    and _requests_function_call(following)
                    and not already_paired):
                # Move the assistant's call in front of the function result.
                messages[i], messages[i + 1] = messages[i + 1], messages[i]
                i += 2  # Both swapped messages are settled; continue after them
            else:
                i += 1
    return conversations

# Fix the ordering of function-call exchanges (message lists are edited in place).
conversations = reorder_function_call_exchange(conversations)

# Overwrite the saved file with the reordered conversations.
with open(save_to_file, "w") as output_file:
    json.dump(conversations, output_file)

# test with conversationId = 453e1f54-b604-4e0e-aa85-21febce5d6a4


In [41]:
len(conversations)  # Sanity check: reordering only swaps messages, so the conversation count should be unchanged

514

In [47]:
conversations[310]  # Spot-check one conversation by position; NOTE(review): 310 is a hard-coded index, presumably chosen because it contains a function call - confirm

{'conversationId': 'bd689eb1-8fe2-4bc5-9086-822fd7ac0364',
 'conversation': [{'chatgptMessage': {'role': 'user',
    'content': 'Show me 3 scholarly articles on Large Language Models'},
   'functions': [{'name': 'searchArxiv',
     'description': 'Search Arxiv',
     'parameters': {'type': 'object',
      'properties': {'path_params': {'type': 'object',
        'properties': {'content': {'type': 'string',
          'description': 'The search query is generally in English. If it is not, please translate it into English first.'}},
        'required': ['content']}}}}],
   'timeSent': '2023-10-17T14:21:10.515Z'},
  {'chatgptMessage': {'content': None,
    'function_call': {'arguments': '{}', 'name': 'searchArxiv'},
    'role': 'assistant'},
   'functions': [{'name': 'searchArxiv',
     'description': 'Search Arxiv',
     'parameters': {'type': 'object',
      'properties': {'path_params': {'type': 'object',
        'properties': {'content': {'type': 'string',
          'description': 'The 