In [16]:
import os
import sys

# Treat the parent of the current working directory as the project root
# (this assumes the notebook is launched from a subdirectory of the project)
# and make it importable, without adding a duplicate entry on re-runs.
project_root = os.path.dirname(os.getcwd())
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Python lib
import asyncio
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from typing import Dict, Any, Tuple
import venv

# Autogen-0.4
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.agents._code_executor_agent import CodeExecutorAgent
from autogen_agentchat.conditions import TextMentionTermination, MaxMessageTermination
from autogen_agentchat.messages import TextMessage
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.model_context import BufferedChatCompletionContext
from autogen_agentchat.ui import Console

# Local
from utils import CopyFile, LoadDataset, GetDatasetProfile, Spinner
from prompts import cleaning_reasoning_prompt, cleaning_checking_prompt


# User-editable settings: the model used for reasoning and the model used
# for coding.
reasoning_model = "qwen2.5:32b-instruct-q8_0"
coding_model = "qwen2.5-coder:32b-instruct-q8_0"

# Settings shared by both model clients.
llm_base_url = "http://34.204.63.234:11434/v1"
api_key = "none"
# Every capability flag is switched off.
capabilities = dict.fromkeys(
    ("vision", "function_calling", "json_output"),
    False,
)

#######################################################################
#   !!! DON'T EDIT BELOW EXCEPT FOR if __name__ == "__main__":   !!!  #
#######################################################################

# Build one chat-completion client per model. Both clients share the same
# endpoint, API key and capability flags; only the model name differs.
# Order matters: the reasoning client is created first, then the coding one.
instruct_client_config, code_client_config = (
    OpenAIChatCompletionClient(
        model=model_name,
        base_url=llm_base_url,
        api_key=api_key,
        model_capabilities=capabilities,
    )
    for model_name in (reasoning_model, coding_model)
)

async def create_cleaning_agents(filepath: Path) -> Tuple[AssistantAgent, AssistantAgent]:
    """
    Build the two assistant agents that make up the cleaning reasoning team.

    A three-step progress bar is shown: one tick per agent created and a
    final tick once both are available.

    Args:
        filepath: Dataset path handed to both system-prompt builders.

    Returns:
        The "Data_Cleaning_Planner" agent followed by the
        "Validation_Assistant" agent. Note that the value returned at runtime
        is the list produced by ``asyncio.gather``, not a tuple as the
        annotation suggests.
    """
    progress_options = dict(
        total=3,
        desc="Creating cleaning reasoning team agents",
        bar_format='{desc:>30}{postfix: <1} {bar}|{n_fmt:>4}/{total_fmt:<4}',
        colour='green',
    )

    with tqdm(**progress_options) as pbar:

        async def _build_planner():
            """Create the planning agent and advance the progress bar."""
            planner = AssistantAgent(
                name="Data_Cleaning_Planner",
                model_client=instruct_client_config,
                system_message=cleaning_reasoning_prompt(filepath),
            )
            pbar.update(1)
            return planner

        async def _build_validator():
            """Create the validation agent and advance the progress bar."""
            validator = AssistantAgent(
                name="Validation_Assistant",
                model_client=instruct_client_config,
                system_message=cleaning_checking_prompt(filepath),
            )
            pbar.update(1)
            return validator

        # gather() returns results in argument order: planner, then validator.
        agents = await asyncio.gather(_build_planner(), _build_validator())

        # Final tick: both agents are ready.
        pbar.update(1)
        return agents

In [17]:
async def cleaning_reasoning_pipeline(data_dict: Dict[str, Any], filepath: Path):
    """
    Run the reasoning phase of the data cleaning pipeline.

    Creates the cleaning agents, runs them as a round-robin group chat on the
    task built by ``cleaning_reasoning_prompt(filepath, data_dict)``, prints
    the streamed conversation through ``Console``, and stores the value
    returned by ``Console`` in a buffered chat context.

    The chat stops when either termination condition fires:
    the text "Overall Status: Pass" is mentioned, or the
    ``MaxMessageTermination(5)`` limit is reached.

    Args:
        data_dict: Dataset profile passed to the prompt builder for the task.
        filepath: Dataset path passed to the agent factory and prompt builder.

    Returns:
        The ``BufferedChatCompletionContext`` (``buffer_size=1``) to which the
        result of ``Console(stream)`` has been added.

    Raises:
        Exception: If any step fails. The message is prefixed with
            "An error occurred: " and the original exception is attached as
            ``__cause__`` so its type and traceback remain available.
    """
    try:
        # Participants for the group chat.
        cleaning_reasoning_team = await create_cleaning_agents(filepath)

        # Termination: pass marker mentioned OR message limit reached.
        status_pass = TextMentionTermination("Overall Status: Pass")
        max_round = MaxMessageTermination(5)
        termination = status_pass | max_round

        # First phase: Data Cleaning (Reasoning)
        cleaning_team_chat = RoundRobinGroupChat(
            cleaning_reasoning_team,
            termination_condition=termination,
        )

        # Context configured with buffer_size=1.
        chat_history = BufferedChatCompletionContext(buffer_size=1)

        # Optional alternative: run the chat behind a loading spinner to tell
        # whether the code is frozen. Disabled; kept for reference.
        #run_chat =  Spinner.async_with_spinner(
        #    message="Loading: ",
        #    style="braille",
        #    console_class=Console,
        #    coroutine=cleaning_team_chat.run_stream(task=cleaning_reasoning_prompt(filepath, data_dict), cancellation_token=None)
        #)

        # Active path: stream the chat without the spinner.
        stream = cleaning_team_chat.run_stream(
            task=cleaning_reasoning_prompt(filepath, data_dict),
            cancellation_token=None,
        )

        # Console consumes the stream; whatever it returns is stored in the
        # context so the caller can inspect it afterwards.
        await chat_history.add_message(await Console(stream))
        return chat_history
    except Exception as e:
        # Chain explicitly so the root cause is reported as the direct cause
        # of this error instead of an incidental "during handling" context.
        raise Exception(f"An error occurred: {str(e)}") from e

In [18]:
# Prepare the dataset in three tracked steps: choose the file, load it,
# and build its profile. The bar advances once after each step.
with tqdm(
    total=3,
    desc="Preparing dataset",
    bar_format='{desc:>30}{postfix: <1} {bar}|{n_fmt:>4}/{total_fmt:<4}',
    colour='green',
) as pbar:
    # Step 1: dataset location. Edit this path to point at your own file.
    # NOTE(review): absolute, machine-specific path; consider deriving it
    # from project_root so the notebook runs on other machines.
    test_file = Path("/Users/jamesbond/Desktop/VSCode/AutoInsight/autogen_v2/sheets/credit_card_transactions.csv")
    pbar.update(1)

    # Step 2: load the dataset with the project helper from utils.
    df = LoadDataset(test_file)
    pbar.update(1)

    # Step 3: build the data dictionary with the project helper from utils;
    # it can produce markdown, natural-language or json output.
    initial_profile = GetDatasetProfile(df, output_format="json")
    pbar.update(1)

         Processing 24 columns  [32m██████████[0m|  24/24  
             Preparing dataset  [32m██████████[0m|   3/3   


In [19]:
# Run the reasoning phase on the profiled dataset. `chat` receives the
# buffered chat context returned by cleaning_reasoning_pipeline.
chat = await cleaning_reasoning_pipeline(initial_profile, test_file)


Creating cleaning reasoning team agents  [32m██████████[0m|   3/3   

---------- user ----------
<Data_Cleaning_Planner>

<purpose>
    Generate comprehensive and actionable data cleaning recommendations tailored to each column, incorporating domain-specific insights.
</purpose>

<dataset_location>
    Filepath: /Users/jamesbond/Desktop/VSCode/AutoInsight/autogen_v2/sheets/credit_card_transactions.csv
</dataset_location>

<data_dictionary>
    {'Unnamed: 0': {'dtype': 'int64', 'sample': 0, 'total_count': 1296675, 'null_count': 0, 'null_percentage': '0.0%', 'unique_count': 1296675, 'mean': np.float64(648337.0), 'median': np.float64(648337.0), 'std': np.float64(374317.97), 'min': np.int64(0), 'max': np.int64(1296674)}, 'trans_date_trans_time': {'dtype': 'object', 'sample': '2019-01-01 00:00:18', 'total_count': 1296675, 'null_count': 0, 'null_percentage': '0.0%', 'unique_count': 1274791}, 'cc_num': {'dtype': 'int64', 'sample': 2703186189652095, 'total_count': 1296675, 'null_count': 0, 'null_percentage': '0.0%', 'unique_count': 983, 'mean': np.float64(4.1719




---------- Data_Cleaning_Planner ----------
### Column_Name: merch_zipcode
- **Issue Type:** Missing Values
- **Recommended Action:** Impute missing values with the median zipcode of the column.
- **Brief Reason:** Using the median helps maintain a central value for zipcodes and minimizes distortion caused by outliers or gaps in data.
- **Dependencies/Constraints:** Ensure that median imputation aligns with regional distribution to avoid introducing significant bias.

### Column_Name: lat, lon (considering inter-column relationship)
- **Issue Type:** Outliers
- **Recommended Action:** Remove or correct geographic outliers based on standard deviation from the mean latitude and longitude coordinates.
- **Brief Reason:** This action helps ensure that the location data accurately reflects real-world positions by removing extreme values that could skew analysis results.
- **Dependencies/Constraints:** Ensure removal of outliers does not significantly reduce sample size.

### Column_Name: me

In [20]:
# Read back what the pipeline stored in the buffered context.
chat_history = await chat.get_messages()
# The last stored entry is the value returned by Console(stream) in the
# pipeline (presumably a task result exposing `.messages` - verify against
# the installed AutoGen version); show the final message of that run.
chat_history[-1].messages[-1]

