In [37]:
# On macOS the notebook kernel sometimes cannot find the project's local packages.
# Run this cell first to fix "no module found" errors for local modules.
import os
import sys

# Treat the parent of the current working directory as the project root
# (the notebook is assumed to live one level below it) and make it
# importable, without adding a duplicate entry on re-runs.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)


In [75]:
# Python lib
import asyncio
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from typing import Dict, Any, Tuple, List
import venv
import os

# Autogen-0.4
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.agents._code_executor_agent import CodeExecutorAgent
from autogen_agentchat.conditions import TextMentionTermination, MaxMessageTermination
from autogen_agentchat.messages import TextMessage
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_ext.models.openai.config import OpenAIClientConfigurationConfigModel, ResponseFormat
from autogen_core.model_context import BufferedChatCompletionContext

# Local
from utils import GetDatasetProfile

In [39]:
root = '../sheets/mysql/'
files: list[str] = os.listdir(path=root)
tasks = [GetDatasetProfile(root, file_name = file, output_format='json') for file in files]
results = await asyncio.gather(*tasks)

Processing 9 columns (products.csv)                                    [32m██████████████████████████████[0m|   9/9   
Processing 7 columns (orders.csv)                                      [32m██████████████████████████████[0m|   7/7   
Processing 8 columns (employee.csv)                                    [32m██████████████████████████████[0m|   8/8   
Processing 9 columns (office.csv)                                      [32m██████████████████████████████[0m|   9/9   
Processing 13 columns (customer.csv)                                   [32m██████████████████████████████[0m|  13/13  
Processing 4 columns (productlines.csv)                                [32m██████████████████████████████[0m|   4/4   
Processing 4 columns (payments.csv)                                    [32m██████████████████████████████[0m|   4/4   
Processing 5 columns (orderdetails.csv)                                [32m██████████████████████████████[0m|   5/5   


In [40]:
# Pair every file name with its profile, giving a list of
# (file_name, profile) tuples. `result_dict` is a one-shot zip iterator, not
# a dict. NOTE: `results` is rebound here, so re-running this cell without
# re-running the profiling cell nests the tuples one level deeper.
result_dict = zip(files, results)
results = [*result_dict]

In [109]:
# Only edit here AND filepath under if __name__ == "__main__":
reasoning_model = "qwen2.5:32b-instruct-q8_0"
coding_model = "qwen2.5-coder:32b-instruct-q8_0"

# Common config
llm_base_url = "http://34.204.63.234:11434/v1"
api_key = "none"
capabilities =  {
        "vision": False,
        "function_calling": False,
        "json_output": True,
        "family": "Qwen2.5"
    }

#######################################################################
#   !!! DONT EDIT BELOW EXCEPT FOR if __name__ == "__main__":   !!!   #
#######################################################################

# Reasoning Model Configuration
instruct_client_config = OpenAIChatCompletionClient(
    model=reasoning_model,
    base_url=llm_base_url,
    api_key=api_key,
    model_capabilities=capabilities
)

# Coding Model Configuration
code_client_config = OpenAIChatCompletionClient(
    model=coding_model,
    base_url=llm_base_url,
    api_key=api_key,
    model_capabilities=capabilities
)

# Create reasoning client with config
# The model name, API key and base URL are taken from the variables defined
# at the top of this cell, so editing that section is enough to retarget
# every client (they were previously repeated here as literals).
reasoning_model_client = instruct_client_config._from_config(OpenAIClientConfigurationConfigModel(
    frequency_penalty=0.2, 
    logit_bias=None, 
    max_tokens=128000, 
    n=None, 
    presence_penalty=0.5, 
    response_format=None, 
    seed=42, 
    stop=None, 
    temperature=0.7, 
    top_p=0.95, 
    user=None,
    model=reasoning_model,
    api_key=api_key, 
    timeout=None, 
    max_retries=None, 
    model_capabilities=None, 
    model_info=capabilities, 
    organization=None, 
    base_url=llm_base_url
    )
)

# Same reasoning model, but requesting JSON-object responses and using a
# lower temperature / top_p than the free-text client above.
json_reasoning_model_client = reasoning_model_client._from_config(OpenAIClientConfigurationConfigModel(
    frequency_penalty=None, 
    logit_bias=None, 
    max_tokens=None, 
    n=None, 
    presence_penalty=None, 
    response_format={"type": "json_object"},
    seed=42, 
    stop=None, 
    temperature=0.2, 
    top_p=0.9, 
    user=None,
    model=reasoning_model,
    api_key=api_key, 
    timeout=None, 
    max_retries=None, 
    model_capabilities=None, 
    model_info=capabilities, 
    organization=None, 
    base_url=llm_base_url
    )
)


In [42]:
# Peek at the file name stored in the first (file_name, profile) pair.
results[0][0]

'products.csv'

In [43]:
async def fix_file_name(filename):
    """Turn a file name into text that is safe to embed in a Python identifier.

    Every character that cannot appear inside a Python identifier (for
    example '.', '-', or a space) is replaced with '_'. Characters that are
    already valid are kept unchanged, so 'products.csv' becomes
    'products_csv' and 'data.parquet' becomes 'data_parquet'.

    The result may still start with a digit; callers are expected to prepend
    their own prefix before using it as a complete identifier.

    Args:
        filename: File name to sanitise.

    Returns:
        The sanitised name, with the same length as ``filename``.
    """
    # '_' + char is a valid identifier exactly when char is allowed in a
    # non-leading identifier position, so this covers digits and non-ASCII
    # letters without hand-maintaining a character whitelist.
    return ''.join(
        char if ('_' + char).isidentifier() else '_'
        for char in filename
    )

In [44]:
# Sanity check: sanitise the first file name.
await fix_file_name(results[0][0])

'products_csv'

In [84]:

async def initialize_individual_chat(filename: str, metadata):
    """Ask a dedicated agent to draft a data dictionary for one file.

    A fresh ``AssistantAgent`` named after the sanitised file name is created
    with the JSON-mode reasoning client, then sent a single user message that
    contains the file's metadata.

    Args:
        filename: Name of the dataset file; used in the agent name,
            description and prompts.
        metadata: Profile of the file; it is embedded into the user message
            via ``str(metadata)``.

    Returns:
        The value returned by ``AssistantAgent.on_messages`` for that single
        message (callers read ``.chat_message.content`` from it).
    """
    # The file name is sanitised first because it becomes part of the agent name.
    file_name_fixed = await fix_file_name(filename)
    
    # Initialize agents dynamically
    file_handler = AssistantAgent(
        name=f"File_handler_{file_name_fixed}",
        description=f"A file handling agent specific for the file {filename}.",
        model_client=json_reasoning_model_client,
        system_message=f"""<purpose>You are a file handling agent for the file {filename}. You will populate a data dictionary for this particular dataset.</purpose>
        
<instructions>Given the metadata of the file, suggest possible meanings and explain abbreviations of columns in the file. Suggest common column names that would be analyzed together. MUST follow output_format.</instructions>

<output_format>
Data Dictionary for {filename}:

---

Column: <column_name>
Possible meaning: <meaning>
Description: <description>
Relationships with other columns: <relationships>
Sample value: <1 sample value>

---
</output_format>

<rules>
OMIT performing any other actions other than those specified.
OMIT speaking more than necessary.
Must follow output_format.
</rules>""",
    )
    
    # Second positional argument is the cancellation token; None means the
    # request is not cancellable from here.
    response = await file_handler.on_messages(
        [TextMessage(content=f"You are a file handling agent for the file {filename}. The file metadata is as follow: {str(metadata)}. Suggest possible meanings and abbreviations of columns in the file.", source="user")], None
    )
    return response



In [85]:
# Create a list of tasks for all agents
tasks = [initialize_individual_chat(filename=results[index][0], metadata=results[index][1]) for index in range(len(results))]
    
# Execute all agent chat tasks concurrently
responses = await asyncio.gather(*tasks)

# Example usage
print(responses)
print(f"\nResponses received: {len(responses)}")

[Response(chat_message=TextMessage(source='File_handler_products_csv', models_usage=RequestUsage(prompt_tokens=928, completion_tokens=889), content='{\n"Data Dictionary for products.csv": [\n    {\n        "Column": "productCode",\n        "Possible meaning": "Unique identifier for each product.",\n        "Description": "A code assigned to uniquely identify a specific product within the inventory.",\n        "Relationships with other columns": "This column is likely used as a primary key and could be linked to sales or order data in other tables.",\n        "Sample value": "S10_1678"\n    },\n    {\n        "Column": "productName",\n        "Possible meaning": "Name of the product.",\n        "Description": "The name given to the specific item for sale, which can be used for identification and marketing purposes.",\n        "Relationships with other columns": "This column is related to \'productCode\' as each code corresponds to a unique product name.",\n        "Sample value": "1969 

In [113]:
# Inspect the raw text of one reply (position 5 in `responses`).
responses[5].chat_message.content

'{\n"Data Dictionary for productlines.csv": [\n    {\n        "Column": "productLine",\n        "Possible meaning": "Category or type of products available.",\n        "Description": "This column lists different categories of products such as Classic Cars, Motorcycles, etc., which helps in identifying the broad range of items offered under each category.",\n        "Relationships with other columns": "It is likely related to \'textDescription\' and potentially to sales data or product details not shown here. The descriptions provide more information about what kind of products are included in each line.",\n        "Sample value": "Classic Cars"\n    },\n    {\n        "Column": "textDescription",\n        "Possible meaning": "Detailed textual description of the product line.",\n        "Description": "This column contains detailed text that describes the features, benefits, and characteristics of the products within a particular product line. It is used to provide customers with compre

In [116]:
import json
# Decode the JSON text of one reply into a Python dict and display it.
pydict = json.loads(responses[5].chat_message.content)
pydict

{'Data Dictionary for productlines.csv': [{'Column': 'productLine',
   'Possible meaning': 'Category or type of products available.',
   'Description': 'This column lists different categories of products such as Classic Cars, Motorcycles, etc., which helps in identifying the broad range of items offered under each category.',
   'Relationships with other columns': "It is likely related to 'textDescription' and potentially to sales data or product details not shown here. The descriptions provide more information about what kind of products are included in each line.",
   'Sample value': 'Classic Cars'},
  {'Column': 'textDescription',
   'Possible meaning': 'Detailed textual description of the product line.',
   'Description': 'This column contains detailed text that describes the features, benefits, and characteristics of the products within a particular product line. It is used to provide customers with comprehensive information about what they can expect from each category.',
   'Rel

In [129]:
# Decode every agent reply (JSON text held in chat_message.content) into a
# Python dict. Despite the variable name, the list holds plain dicts parsed
# from the replies, not TextMessage objects.
responses_TextMessage = [
    json.loads(reply.chat_message.content)
    for reply in responses
]
responses_TextMessage

[{'Data Dictionary for products.csv': [{'Column': 'productCode',
    'Possible meaning': 'Unique identifier for each product.',
    'Description': 'A code assigned to uniquely identify a specific product within the inventory.',
    'Relationships with other columns': 'This column is likely used as a primary key and could be linked to sales or order data in other tables.',
    'Sample value': 'S10_1678'},
   {'Column': 'productName',
    'Possible meaning': 'Name of the product.',
    'Description': 'The name given to the specific item for sale, which can be used for identification and marketing purposes.',
    'Relationships with other columns': "This column is related to 'productCode' as each code corresponds to a unique product name.",
    'Sample value': '1969 Harley Davidson Ultimate Chopper'},
   {'Column': 'productLine',
    'Possible meaning': 'Category or type of the product.',
    'Description': 'A broad category that groups similar products together, such as Motorcycles, Clas

In [97]:
# System prompt for the "Data_Dictionary_Analytics_Suggester" agent: it asks
# the model to review the data dictionaries and propose analytics use cases
# in the fixed Markdown layout described under <output_format>.
prompt = """<Data_Dictionary_Analytics_Suggester>
    
    <purpose>
        Analyze provided data dictionaries comprehensively to identify column relationships and suggest a wide range of potential analytics use cases that leverage these relationships to generate meaningful and actionable business insights.
    </purpose>
    
    <instructions>
        1. **Review** all provided data dictionaries thoroughly, ensuring no dataset or column is overlooked.
        2. **Examine** each dataset's columns, their meanings, descriptions, and existing relationships in detail.
        3. **Identify** all meaningful relationships and combinations of columns within the same dataset and across different datasets that can be leveraged for insightful analysis.
        4. **Ensure** that every dataset and its relevant columns are considered to maximize the utilization of available data.
        5. **Brainstorm** a diverse range of potential analytics use cases that utilize these column relationships to address various business objectives such as improving sales, optimizing inventory, understanding customer behavior, enhancing operational efficiency, etc.
        6. **For each suggested use case**, provide a detailed explanation of how specific column combinations and their relationships contribute to the insights.
        7. **Ensure** that all suggested use cases are actionable, relevant, and based solely on the information provided in the data dictionaries without incorporating any external knowledge or assumptions.
        8. **Summarize** key relationships between datasets that underpin the proposed analytics use cases, ensuring that cross-dataset relationships are highlighted and utilized.
    </instructions>
    
    <output_format>
        For each identified analytics use case, provide the following structured information:
        
        ```
        ### Use Case <Number>
        - **Use Case ID:** UC<UniqueNumber>
        - **Title:** <Descriptive Title of the Use Case>
        - **Description:** <Detailed explanation of the analytics use case, including the business objective it addresses and the insights it aims to generate.>
        - **Data Sources:**
            - <Dataset1.csv>
            - <Dataset2.csv>
            - <!-- Add additional data sources as needed -->
        - **Columns Utilized:**
            - <Dataset1.csv>.<ColumnName>
            - <Dataset2.csv>.<ColumnName>
            - <!-- List all relevant columns from the respective datasets -->
        - **Relationships Leveraged:**
            - <Description of how the columns are related (e.g., primary key, foreign key, common attributes) and how these relationships are utilized in the analysis.>
        
        ---
        ```
        
        **Example:**
        
        ```
        ### Use Case 1
        - **Use Case ID:** UC001
        - **Title:** Inventory Optimization Based on Product Categories
        - **Description:** Analyze the `quantityInStock` in relation to `productLine` and `buyPrice` to identify overstocked or understocked categories, enabling better inventory management and cost optimization.
        - **Data Sources:**
            - products.csv
            - productlines.csv
        - **Columns Utilized:**
            - products.csv.quantityInStock
            - products.csv.buyPrice
            - products.csv.productLine
            - productlines.csv.productLine
        - **Relationships Leveraged:**
            - The `productLine` column in `products.csv` is related to the `productLine` column in `productlines.csv`, allowing categorization of inventory levels by product category.
        
        ---
        ```
    </output_format>
    
    <rules>
        1. **Exact Naming:** Use the exact table and column names as specified in the provided data dictionaries.
        2. **Relevance:** Only suggest analytics use cases that are directly supported by the relationships and data available in the data dictionaries.
        3. **Actionable Use Cases:** Ensure that each use case is actionable and aligned with common business objectives such as improving sales, optimizing inventory, understanding customer behavior, enhancing operational efficiency, etc.
        4. **Comprehensive Utilization:** Actively utilize as many datasets and columns as possible, ensuring that no relevant data is left unexplored or unused in the suggested use cases.
        5. **Clarity:** Provide clear and concise descriptions for each use case, avoiding ambiguity and ensuring that the purpose and methodology are easily understandable.
        6. **Structured Output:** Adhere strictly to the specified output format to maintain consistency and ease of interpretation across all suggested use cases.
        7. **No Assumptions:** Base all suggestions solely on the provided data dictionaries without introducing external information or making assumptions beyond the given data.
        8. **Unique Identification:** Assign a unique ID to each use case to facilitate easy reference and tracking.
        9. **Avoid Redundancy:** Ensure that each use case is unique and does not duplicate the purpose or methodology of another use case.
        10. **Cross-Dataset Relationships:** Actively seek and leverage relationships across different datasets to create comprehensive and insightful use cases that span multiple areas of the business.
    </rules>
    
</Data_Dictionary_Analytics_Suggester>"""



In [126]:
# Wrap the review instruction in a one-element list of TextMessage and append
# it to the parsed data dictionaries.
# NOTE: responses_TextMessage is mutated in place, so every re-run of this
# cell appends another copy of the instruction.
initial_task = [
    TextMessage(
        content="Review the data dictionary generated by the file handling agents.",
        source="user",
    )
]
responses_TextMessage += initial_task

In [130]:
# Display the current contents of responses_TextMessage.
responses_TextMessage

[{'Data Dictionary for products.csv': [{'Column': 'productCode',
    'Possible meaning': 'Unique identifier for each product.',
    'Description': 'A code assigned to uniquely identify a specific product within the inventory.',
    'Relationships with other columns': 'This column is likely used as a primary key and could be linked to sales or order data in other tables.',
    'Sample value': 'S10_1678'},
   {'Column': 'productName',
    'Possible meaning': 'Name of the product.',
    'Description': 'The name given to the specific item for sale, which can be used for identification and marketing purposes.',
    'Relationships with other columns': "This column is related to 'productCode' as each code corresponds to a unique product name.",
    'Sample value': '1969 Harley Davidson Ultimate Chopper'},
   {'Column': 'productLine',
    'Possible meaning': 'Category or type of the product.',
    'Description': 'A broad category that groups similar products together, such as Motorcycles, Clas

In [133]:
# Initialize data dict ingestion agent
final_agent = AssistantAgent(name="Data_Dictionary_Analytics_Suggester",
                             model_client=reasoning_model_client,
                             system_message=prompt
)

response = await final_agent.on_messages(
        [TextMessage(content=f"{responses_TextMessage}\n\nReview the above data dictionary.\n\n{responses_TextMessage}",source="user")], None
    )
response

Response(chat_message=TextMessage(source='Data_Dictionary_Analytics_Suggester', models_usage=RequestUsage(prompt_tokens=2048, completion_tokens=653), content='Based on your provided data, I have summarized the key details for you:\n\n1. **Data Dictionary for customers.csv**\n   - No specific columns from this file were listed, so no information is available here.\n\n2. **Data Dictionary for payments.csv**\n   - `customerNumber`: A unique identifier for each customer.\n     - Example: 103\n     - Relationships: Can be used to join with other tables containing more detailed customer info like names and addresses.\n   - `checkNumber`: A unique identifier for each payment check.\n     - Example: "HQ336336"\n     - Relationships: Used in conjunction with the \'paymentDate\' and \'amount\' columns to track individual payments over time.\n   - `paymentDate`: The date when a payment was made.\n     - Example: 2004-10-19\n     - Relationships: Can be used alongside the amount column for financi

In [134]:
print(response.chat_message.content)

Based on your provided data, I have summarized the key details for you:

1. **Data Dictionary for customers.csv**
   - No specific columns from this file were listed, so no information is available here.

2. **Data Dictionary for payments.csv**
   - `customerNumber`: A unique identifier for each customer.
     - Example: 103
     - Relationships: Can be used to join with other tables containing more detailed customer info like names and addresses.
   - `checkNumber`: A unique identifier for each payment check.
     - Example: "HQ336336"
     - Relationships: Used in conjunction with the 'paymentDate' and 'amount' columns to track individual payments over time.
   - `paymentDate`: The date when a payment was made.
     - Example: 2004-10-19
     - Relationships: Can be used alongside the amount column for financial analysis.
   - `amount`: The monetary value of each payment.
     - Example: 6066.78
     - Relationships: Useful in conjunction with 'paymentDate' to track revenue over time