In [3]:
# On macOS the notebook kernel sometimes cannot find local packages.
# Run this cell to fix "no local module found" import errors on macOS.
import os
import sys

# Resolve the parent of the current working directory (the notebook is assumed
# to sit one level below the project root) and make it importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)


In [4]:
# Python lib
import asyncio
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from typing import Dict, Any, Tuple, List
import venv
import os

# Autogen-0.4
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.agents._code_executor_agent import CodeExecutorAgent
from autogen_agentchat.conditions import TextMentionTermination, MaxMessageTermination
from autogen_agentchat.messages import TextMessage
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_ext.models.openai.config import OpenAIClientConfigurationConfigModel
from autogen_core.model_context import BufferedChatCompletionContext

# Local
from utils import get_dataset_profile, get_columns_sample, initialize_individual_chat, _fix_file_name, jsonify_prompt
from prompts import data_dict_summarizer_prompt, data_dict_generator_prompt

In [5]:
# Uses utils/get_columns_sample
root = '../sheets/mysql/'
files: list[str] = os.listdir(path=root)
tasks = [get_columns_sample(root, file_name = file) for file in files]
results = await asyncio.gather(*tasks)
result_dict = zip(files, results)
results = list(result_dict)

In [23]:
# Editable settings: change values here (and the filepath under `if __name__ == "__main__":`)
# Model names used for the generator and the summarizer
data_dict_generator = "qwen2.5:32b-instruct-q8_0"
data_dict_summarizer_ds = "deepseek-r1:32b-qwen-distill-q8_0-131072"

# Connection settings common to both models
llm_base_url = "http://34.204.63.234:11434/v1"
api_key = "none"

# Capability flags passed as `model_info` for each model
model_info_generator = dict(
    vision=False,
    function_calling=False,
    json_output=True,
    family="Qwen2.5",
)
model_info_summarizer = dict(
    vision=False,
    function_calling=False,
    json_output=True,
    family="DeepSeek-r1",
)

# Create reasoning config
# Generator: just generate the data dict based on observed facts
# (low temperature / top_p, fixed seed, JSON-object responses).
data_dict_generator_config = OpenAIClientConfigurationConfigModel(
    frequency_penalty=0.4,
    logit_bias=None,
    max_tokens=2048,
    n=None,
    presence_penalty=0.3,
    response_format={"type": "json_object"},
    seed=42,
    stop=None,
    temperature=0.2,
    top_p=0.7,
    user=None,
    model=data_dict_generator,
    # Reuse the common-config values instead of repeating the literals, so the
    # endpoint and key only have to be changed in one place.
    api_key=api_key,
    timeout=None,
    max_retries=None,
    model_info=model_info_generator,
    organization=None,
    base_url=llm_base_url,
)

# Summarizer: needs to be more creative
# (higher temperature / top_p than the generator, same seed, JSON-object responses).
data_dict_summarizer_config = OpenAIClientConfigurationConfigModel(
    frequency_penalty=0.2,
    logit_bias=None,
    max_tokens=128000,
    n=None,
    presence_penalty=0.5,
    response_format={"type": "json_object"},
    seed=42,
    stop=None,
    temperature=0.8,
    top_p=0.95,
    user=None,
    model=data_dict_summarizer_ds,
    # Reuse the common-config values instead of repeating the literals, so the
    # endpoint and key only have to be changed in one place.
    api_key=api_key,
    timeout=None,
    max_retries=None,
    model_info=model_info_summarizer,
    organization=None,
    base_url=llm_base_url,
)



In [24]:
# Build one chat-completion client per role straight from its config object.
# `_from_config` is called on the class, so no throwaway client has to be
# constructed first just to reach the method.
# NOTE(review): `_from_config` is a private classmethod of the autogen component
# API — confirm it is still available when upgrading autogen-ext.
data_dict_generator_client = OpenAIChatCompletionClient._from_config(data_dict_generator_config)
data_dict_summarizer_client = OpenAIChatCompletionClient._from_config(data_dict_summarizer_config)

In [11]:
# Show how one entry of `results` is laid out: index 0 is the file name, index 1 the metadata
sample_entry = results[0]
print(f"results[0][0] is {sample_entry[0]} (file name)\nresults[0][1] is metadata {type(sample_entry[1])}")

results[0][0] is products.csv (file name)
results[0][1] is metadata <class 'dict'>


In [12]:
# Create a list of tasks for all agents
tasks = [initialize_individual_chat(filename=results[index][0], metadata=results[index][1], data_dict_generator_client=data_dict_generator_client) for index in range(len(results))]

# Execute all agent chat tasks concurrently
responses = await asyncio.gather(*tasks)
print(responses)


[Response(chat_message=TextMessage(source='File_handler_products_csv', models_usage=RequestUsage(prompt_tokens=828, completion_tokens=862), content='{\n  "Data Dictionary": {\n    "filename": "products.csv",\n    "columns": [\n      {\n        "Column": "productCode",\n        "Description": "Unique code assigned to each product.",\n        "Format": "Identifier: Starts with \'S\' followed by numbers like S10_1678.",\n        "Nullable": false,\n        "Sample Values": ["S10_1678", "S10_1949", "S10_2016"]\n      },\n      {\n        "Column": "productName",\n        "Description": "Name of the product.",\n        "Format": "Text: Contains free-form text strings like \'1969 Harley Davidson Ultimate Chopper\'.",\n        "Nullable": false,\n        "Sample Values": ["1969 Harley Davidson Ultimate Chopper", "1952 Alpine Renault 1300", "1996 Moto Guzzi 1100i"]\n      },\n      {\n        "Column": "productLine",\n        "Description": "Category or line to which the product belongs.",\n  

In [15]:
# Save log
print(f"\nResponses received: {len(responses)}\n'responses' is a list[Response()]\nAccess the content with responses[index].chat_message.content\n")

if 'qwen' in data_dict_summarizer_config.model and '32768' in data_dict_summarizer_config.model:
    # Save to generation_log/qwen
    responses_json, filepath = await jsonify_prompt(responses, directory='generation_log/qwen_32768')
if 'deepseek' in data_dict_summarizer_config.model and 'qwen' in data_dict_summarizer_config.model:
    # Save to generation_log/deepseek_qwen
    responses_json, filepath = await jsonify_prompt(responses, directory='generation_log/deepseek_qwen')
else:
    print(f'No folder for {data_dict_summarizer_config.model}')



Responses received: 8
'responses' is a list[Response()]
Access the content with responses[index].chat_message.content
Data saved to: generation_log/deepseek_qwen/Log_202502031525.log


In [45]:
# Inspect the last entry of `responses_json`
# Disabled: initial_task = [TextMessage(content=f"Review all of the data dictionaries.",source="user")]

last_data_dict = responses_json[-1]
last_data_dict


{'Data Dictionary': {'filename': 'orderdetails.csv',
  'columns': [{'Column': 'orderNumber',
    'Description': 'The unique identifier for each order.',
    'Format': 'Identifier: Starts with a number, no specific pattern beyond that.',
    'Nullable': False,
    'Sample Values': ['10100', '10100', '10100']},
   {'Column': 'productCode',
    'Description': 'The unique identifier for each product in the order.',
    'Format': "Identifier: Starts with 'S', followed by two digits, underscore and four more digits.",
    'Nullable': False,
    'Sample Values': ['S18_1749', 'S18_2248', 'S18_4409']},
   {'Column': 'quantityOrdered',
    'Description': 'The quantity of the product ordered.',
    'Format': 'Numeric: Integer values representing quantities.',
    'Nullable': False,
    'Sample Values': ['30', '50', '22']},
   {'Column': 'priceEach',
    'Description': 'The price for each unit of the product in the order.',
    'Format': 'Numeric: Decimal numbers indicating prices.',
    'Nullable

In [46]:
# Initialize data dict ingestion agent
final_agent = AssistantAgent(name="Data_Dictionary_Analytics_Suggester",
                             model_client=data_dict_summarizer_client,
                             system_message=data_dict_summarizer_prompt()
)

response = await final_agent.on_messages(
        [TextMessage(content=f"{responses_json}\n\nReview the above data dictionary. First by stating the list of data dictionaries received.\n\n",source="user")], None
    )
response

APITimeoutError: Request timed out.

In [22]:
# Print the text content of the summarizer agent's reply
summary_text = response.chat_message.content
print(summary_text)

<think>
Alright, so I need to analyze these data dictionaries and come up with some analytics use cases. Let me start by reviewing each dataset provided.

First, there's products.csv which includes details about each product like productCode, productName, productLine, and more financial metrics like buyPrice and MSRP. This seems useful for understanding product performance and profitability.

Next is orders.csv, containing order information such as orderNumber, dates (orderDate, requiredDate, shippedDate), status, comments, and customerNumber. This can help in analyzing the order fulfillment process and customer behavior related to orders.

Employee.csv has data about employees including their contact info, office details, job titles, and who they report to. This could be used for workforce analytics or understanding organizational structure.

Office.csv provides information about different offices like city, phone, address lines, state, country, postal code, and territory. Useful for 

In [85]:
from utils import jsonify_prompt
if 'qwen' in data_dict_summarizer_config.model and '32768' in data_dict_summarizer_config.model:
    await jsonify_prompt(responses, directory='generation_log/qwen_32768')
if 'deepseek' in data_dict_summarizer_config.model and 'qwen' in data_dict_summarizer_config.model:
    await jsonify_prompt(responses, directory='generation_log/deepseek_qwen')
else:
    print(f'No folder for {data_dict_summarizer_config.model}')


TypeError: jsonify_prompt() got an unexpected keyword argument 'directory'

Data saved to: generation_log/Log_202502031444.log


([{'Data Dictionary': {'filename': 'products.csv',
    'columns': [{'Column': 'productCode',
      'Description': 'Unique code assigned to each product.',
      'Format': "Identifier: Starts with 'S' followed by numbers like S10_1678.",
      'Nullable': False,
      'Sample Values': ['S10_1678', 'S10_1949', 'S10_2016']},
     {'Column': 'productName',
      'Description': 'Name of the product.',
      'Format': "Text: Contains free-form text strings like '1969 Harley Davidson Ultimate Chopper'.",
      'Nullable': False,
      'Sample Values': ['1969 Harley Davidson Ultimate Chopper',
       '1952 Alpine Renault 1300',
       '1996 Moto Guzzi 1100i']},
     {'Column': 'productLine',
      'Description': 'Category or line to which the product belongs.',
      'Format': "Categorical: Represents categories such as 'Motorcycles' and 'Classic Cars'.",
      'Nullable': False,
      'Sample Values': ['Motorcycles', 'Classic Cars', 'Motorcycles']},
     {'Column': 'productScale',
      'Desc