In [28]:
# Python lib
import asyncio
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from typing import Dict, Any, Tuple, List
import venv
import os

# Autogen-0.4
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.agents._code_executor_agent import CodeExecutorAgent
from autogen_agentchat.conditions import TextMentionTermination, MaxMessageTermination
from autogen_agentchat.messages import TextMessage
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.model_context import BufferedChatCompletionContext

# Local
from utils import GetDatasetProfile

In [35]:
root = '../sheets/mysql/'
files: list[str] = os.listdir(path=root)
tasks = [GetDatasetProfile(root, file_name = file, output_format='json') for file in files]
results = await asyncio.gather(*tasks)

Processing 13 columns (customer.csv)                                   [32m██████████████████████████████[0m|  13/13  
Processing 8 columns (employee.csv)                                    [32m██████████████████████████████[0m|   8/8   
Processing 9 columns (office.csv)                                      [32m██████████████████████████████[0m|   9/9   
Processing 5 columns (orderdetails.csv)                                [32m██████████████████████████████[0m|   5/5   
Processing 7 columns (orders.csv)                                      [32m██████████████████████████████[0m|   7/7   
Processing 4 columns (payments.csv)                                    [32m██████████████████████████████[0m|   4/4   
Processing 4 columns (productlines.csv)                                [32m██████████████████████████████[0m|   4/4   
Processing 9 columns (products.csv)                                    [32m██████████████████████████████[0m|   9/9   


In [52]:
# Pair every file name with the profile produced for it, so `results` becomes a
# list of (file_name, profile) tuples.
# NOTE: `results` is rebuilt from its own previous value, so running this cell a
# second time pairs the file names with already-paired tuples. Re-run the
# profiling cell first if this cell has to be executed again.
result_dict = zip(files, results)
results = [*result_dict]

In [27]:
# Only edit here AND filepath under if __name__ == "__main__":
reasoning_model = "qwen2.5:32b-instruct-q8_0"
coding_model = "qwen2.5-coder:32b-instruct-q8_0"

# Common config
# The endpoint and key can be overridden with the LLM_BASE_URL / LLM_API_KEY
# environment variables so the notebook does not have to be edited (or a real
# key committed) when the server changes. The defaults are the original values.
llm_base_url = os.environ.get("LLM_BASE_URL", "http://34.204.63.234:11434/v1")
# "none" is a placeholder; presumably the endpoint does not validate keys - TODO confirm.
api_key = os.environ.get("LLM_API_KEY", "none")
# Capability flags passed to both clients: all optional features are disabled.
capabilities =  {
        "vision": False,
        "function_calling": False,
        "json_output": False
    }

#######################################################################
#   !!! DON'T EDIT BELOW EXCEPT FOR if __name__ == "__main__":   !!!   #
#######################################################################

# Connection settings shared by both clients; only the model name differs.
_common_client_kwargs = {
    "base_url": llm_base_url,
    "api_key": api_key,
    "model_capabilities": capabilities,
}

# Reasoning Model Configuration
instruct_client_config = OpenAIChatCompletionClient(
    model=reasoning_model,
    **_common_client_kwargs
)

# Coding Model Configuration
code_client_config = OpenAIChatCompletionClient(
    model=coding_model,
    **_common_client_kwargs
)


In [63]:
# Spot-check: the file name stored in the second (file_name, profile) pair.
results[1][0]

'employee.csv'

In [97]:
async def fix_file_name(filename):
    """Turn a file name into text that is safe to embed in a Python identifier.

    Every character that is not an ASCII letter, an ASCII digit or an
    underscore is replaced with ``_``. This covers dots anywhere in the name
    (multi-dot names, extensions longer than four characters) as well as
    spaces, hyphens and similar characters, none of which are allowed in an
    identifier.

    Args:
        filename: The file name to convert, e.g. ``"employee.csv"``.

    Returns:
        The sanitised name, e.g. ``"employee_csv"``. The result may still
        start with a digit; the visible caller prepends a fixed prefix, so the
        combined name is a valid identifier.
    """
    return ''.join(
        char if (char.isascii() and char.isalnum()) or char == '_' else '_'
        for char in filename
    )

In [139]:

async def initialize_individual_chat(filename: str, metadata: Any):
    """Ask a dedicated agent to draft a data dictionary for one file.

    An ``AssistantAgent`` is created for the file, named after the sanitised
    file name, and sent a single user message containing the file's metadata.

    Args:
        filename: Name of the file the agent is responsible for.
        metadata: Profile/metadata of the file; it is embedded in the prompt
            via ``str(metadata)``.

    Returns:
        The response object returned by the agent's ``on_messages`` call.
        The response is also printed as a side effect.
    """
    # Agent names must not contain characters such as '.', so sanitise first.
    file_name_fixed = await fix_file_name(filename)

    # Initialize agents dynamically
    file_handler = AssistantAgent(
        name=f"File_handler_{file_name_fixed}",
        description=f"A file handling agent specific for the file {filename}.",
        model_client=instruct_client_config,
        system_message=f"""<purpose>You are a file handling agent for the file {filename}. You will populate a data dictionary for this particular dataset.</purpose>
        
<instructions>Given the metadata of the file, suggest possible meanings and explain abbreviations of columns in the file. Suggest common column names that would be analyzed together. MUST follow output_format.</instructions>

<output_format>
Data Dictionary for {filename}:

---

Column: <column_name>
Possible meaning: <meaning>
Description: <description>
Possible relationships with other columns: <relationships>

---
</output_format>

<rules>
OMIT performing any other actions other than those specified.
OMIT speaking more than necessary.
Must follow output_format.
</rules>""",
    )

    # Single-turn request; None is passed where a cancellation token is expected.
    response = await file_handler.on_messages(
        [TextMessage(content=f"You are a file handling agent for the file {filename}. The file metadata is as follow: {str(metadata)}. Suggest possible meanings and abbreviations of columns in the file.", source="user")], None
    )
    print(response)
    return response



In [140]:
# Create a list of tasks for all agents
tasks = [initialize_individual_chat(filename=results[index][0], metadata=results[index][1]) for index in range(len(results))]
    
# Execute all agent chat tasks concurrently
responses = await asyncio.gather(*tasks)

# Example usage
print(responses)
print(len(responses))

Response(chat_message=TextMessage(source='File_handler_payments_csv', models_usage=RequestUsage(prompt_tokens=563, completion_tokens=401), content="Data Dictionary for payments.csv:\n\n---\n\nColumn: customerNumber\nPossible meaning: Unique identifier for a customer.\nDescription: A unique integer that represents each individual or entity making a payment. The range of values indicates there are up to 374 distinct customers based on the minimum and maximum values, despite only having observations for 273 payments.\nPossible relationships with other columns: This could be joined with another table containing customer details; it likely correlates with 'checkNumber' and 'paymentDate', as multiple payments over time can be attributed to a specific customer.\n\n---\n\nColumn: checkNumber\nPossible meaning: The identifier number of the bank or payment check.\nDescription: An alphanumeric field that uniquely identifies each payment instrument. Each entry appears unique (273 total counts, 273

In [144]:
# Inspect the chat message carried by the second agent response.
responses[1].chat_message

TextMessage(source='File_handler_employee_csv', models_usage=RequestUsage(prompt_tokens=783, completion_tokens=626), content="Data Dictionary for employee.csv:\n\n---\n\nColumn: employeeNumber\nPossible meaning: A unique identifier for each employee.\nDescription: This integer-based column likely serves as a primary key within the dataset, distinguishing one record from another with no missing values.\nPossible relationships with other columns: Commonly linked to reportsTo to trace hierarchical reporting relationships.\n\n---\n\nColumn: lastName\nPossible meaning: The family name or surname of an employee.\nDescription: Containing textual data that uniquely identifies each individual's last name in most cases alongside firstName. There are 19 unique last names among the 23 records, indicating some may share surnames.\nPossible relationships with other columns: Often checked together with firstName to provide a full identity.\n\n---\n\nColumn: firstName\nPossible meaning: The personal o

In [None]:
final_agent = AssistantAgent(name="assistant",
                             model_client=instruct_client_config,
                             system_message="""<purpose>Review the data dictionary generated by the file handling agents and suggest possible relationships between tables.</purpose>

<instruction>
Carefully process the data dictionary generated by the all file handling agents step-by-step.
Suggest possible relationships between tables.
Point out the reasons for suggesting the relationships.
Point out which columns can be used to join the tables.
You must follow the output_format.
</instruction>

<output_format>
Possible relationships between tables: <relationship>
Reason: <reason>
Columns to join: <columns>
</output_format>

<rules>
OMIT performing any other actions other than those specified.
OMIT speaking more than necessary.
Must follow output_format.
</rules>"""
)

response = await final_agent.on_messages(
        [TextMessage(content=f"You are a data catalog agent. You are now reviewing the data dictionary generated by the file handling agents for the following files {files}. Suggest possible relationships between tables.", source="user")], None
    )
print(response)
return response

In [147]:
# Preview how the list of file names renders when interpolated into a string.
f'all your files: {files}'

"all your files: ['customer.csv', 'employee.csv', 'office.csv', 'orderdetails.csv', 'orders.csv', 'payments.csv', 'productlines.csv', 'products.csv']"