In [18]:
import json
# Load development_data.json and ns_to_function_map.json.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's locale default (which is not UTF-8 everywhere).
with open('development_data.json', encoding='utf-8') as f:
    development_data = json.load(f)
with open('ns_to_function_map.json', encoding='utf-8') as f:
    ns_to_function_map = json.load(f)

In [24]:
from collections import defaultdict
from datetime import datetime

# Bucket every entry under the conversationId it carries
grouped_data_temp = defaultdict(list)
for entry in development_data:
    grouped_data_temp[entry["conversationId"]].append(entry)


def _time_sent(entry):
    """Parse an entry's timeSent string (e.g. '2023-09-14T17:54:40.342Z') into a datetime."""
    return datetime.strptime(entry["timeSent"], "%Y-%m-%dT%H:%M:%S.%fZ")


# Order the entries of each conversation chronologically (sorted in place)
for conversation_id, conversation in grouped_data_temp.items():
    conversation.sort(key=_time_sent)

# Freeze the defaultdict into a plain dict so missing keys raise instead of being created
grouped_data = dict(grouped_data_temp)

# Display the first conversation id together with its sorted entries for reference
first_conversation_id = list(grouped_data.keys())[0]
first_conversation_id, grouped_data[first_conversation_id]


('ea463f0e-9cd1-4af6-87d1-8e722334b97b',
 [{'_id': '650348e0390748f6b0654082',
   'id': '9b205fdf-4eb9-48ca-9eb5-a1ebefb91ebb',
   'role': 'user',
   'chatgptMessage': {'role': 'user',
    'content': '1 sentence summary of https://www.openplugin.io/'},
   'openpluginNamespace': 'BrowserOp',
   'error': None,
   'timeSent': '2023-09-14T17:54:40.342Z',
   'timeComplete': '2023-09-14T17:54:40.342Z',
   'conversationId': 'ea463f0e-9cd1-4af6-87d1-8e722334b97b',
   'user_id': '64c4739bd594fb10ff6edbdd',
   'functions': [{'name': 'transcodeWebPage',
     'description': 'Acquire precise webpage details or real-time search engine responses based on user-input content.',
     'parameters': {'type': 'object',
      'properties': {'json': {'properties': {'link': {'type': 'string',
          'description': 'This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, th

In [25]:
def _structure_entry(entry):
    """Reduce a raw entry to its message, the plugin's function specs (if known) and its send time."""
    structured_entry = {
        "chatgptMessage": entry["chatgptMessage"],
        "functions": None,
        "timeSent": entry["timeSent"],
    }
    namespace = entry["openpluginNamespace"]
    # Only attach function specs when the entry names a namespace present in the map
    if namespace and namespace in ns_to_function_map:
        structured_entry["functions"] = ns_to_function_map[namespace]
    return structured_entry


# One record per conversation: its id plus the structured messages, in the grouped order
conversations = []
for conversation_id, conversation in grouped_data.items():
    conversations.append({
        "conversationId": conversation_id,
        "conversation": [_structure_entry(entry) for entry in conversation],
    })

conversations[:1]  # Show the first structured conversation for reference


[{'conversationId': 'ea463f0e-9cd1-4af6-87d1-8e722334b97b',
  'conversation': [{'chatgptMessage': {'role': 'user',
     'content': '1 sentence summary of https://www.openplugin.io/'},
    'functions': [{'name': 'transcodeWebPage',
      'description': 'Acquire precise webpage details or real-time search engine responses based on user-input content.',
      'parameters': {'type': 'object',
       'properties': {'json': {'properties': {'link': {'type': 'string',
           'description': 'This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query.'}},
         'type': 'object'}}}}],
    'timeSent': '2023-09-14T17:54:40.342Z'},
   {'chatgptMessage': {'content': '{"resu

In [29]:
# Within every conversation, keep only the messages whose chatgptMessage is truthy
# (entries with a missing/empty chatgptMessage are dropped)
for conversation in conversations:
    conversation["conversation"] = list(
        filter(lambda message: message["chatgptMessage"], conversation["conversation"])
    )

# Persist the filtered conversations
with open("conversations.json", mode="w") as file:
    json.dump(conversations, fp=file)

# test with conversationId = hX1Dzy


In [30]:
def reorder_function_call_exchange(conversations):
    """
    Fix the ordering of function-call exchanges inside each conversation.

    Whenever a message with role "function" is immediately followed by an
    "assistant" message that contains a "function_call" key, the two are swapped
    so the assistant's call comes first. The message lists are modified in place
    and the same `conversations` object is returned.
    """
    for convo in conversations:
        msgs = convo["conversation"]
        pos = 0
        while pos < len(msgs) - 1:
            # Only a "function" message can start an out-of-order pair
            if msgs[pos]["chatgptMessage"]["role"] != "function":
                pos += 1
                continue

            follower = msgs[pos + 1]["chatgptMessage"]
            if follower["role"] == "assistant" and "function_call" in follower:
                # Put the assistant's call ahead of the function result
                msgs[pos:pos + 2] = [msgs[pos + 1], msgs[pos]]
                pos += 2  # both members of the pair are settled; move past them
            else:
                pos += 1
    return conversations

# Apply the reordering (the lists are fixed in place; the same object comes back)
conversations = reorder_function_call_exchange(conversations)

# Overwrite the saved file with the reordered conversations
with open("conversations.json", mode="w") as file:
    json.dump(conversations, fp=file)

# test with conversationId = 453e1f54-b604-4e0e-aa85-21febce5d6a4


In [37]:
# if there are any conversations that contain a message.chatgptMessage.role "function" whose previous message is not a message.chatgptMessage.role "assistant" with message.chatgptMessage.function_call, then delete the conversation
def delete_conversations_without_required_messages(conversations):
    """
    Drop every conversation in which a "function" message is not directly
    preceded by an "assistant" message carrying a non-empty "function_call".

    Args:
        conversations: list of dicts, each with a "conversationId" and a
            "conversation" list of messages; every message holds a
            "chatgptMessage" dict that has a "role".

    Returns:
        A new list containing the surviving conversations in their original
        order. The input list and its conversations are not modified.
    """
    conversations_to_delete = set()
    for conversation in conversations:
        messages = conversation["conversation"]
        for i, message in enumerate(messages):
            if message["chatgptMessage"]["role"] != "function":
                continue
            # A "function" message in first position has no predecessor. Without
            # this guard, messages[i - 1] would wrap around to the LAST message
            # and could wrongly validate the conversation.
            previous = messages[i - 1]["chatgptMessage"] if i > 0 else None
            # .get(): an assistant message lacking "function_call" is a violation
            # to filter out, not a KeyError.
            if (previous is None
                    or previous["role"] != "assistant"
                    or not previous.get("function_call")):
                conversations_to_delete.add(conversation["conversationId"])
                break
    return [
        conversation
        for conversation in conversations
        if conversation["conversationId"] not in conversations_to_delete
    ]

# Keep only the conversations that pass the function-call ordering check
conversations = delete_conversations_without_required_messages(conversations)

# Overwrite the saved file with the remaining conversations
with open("conversations.json", mode="w") as file:
    json.dump(conversations, fp=file)
# test with conversationId = ea463f0e-9cd1-4af6-87d1-8e722334b97b

In [38]:
len(conversations)  # Number of conversations currently held in `conversations`

89