In [1]:
%%capture
# Install the notebook's dependencies into the kernel's environment (%pip, not !pip).
%pip install llama-index llama-index-embeddings-openai qdrant-client llama-index-vector-stores-qdrant llama-index-llms-openai llama-index-vector-stores-faiss faiss-cpu llama-index-llms-anthropic tavily-python llama-index-experimental

In [1]:
import ast
import functools
import os
import re
import traceback
import warnings
from getpass import getpass
from typing import Optional

import nest_asyncio
import pandas as pd
from dotenv import load_dotenv

# Import LlamaIndex components
from llama_index.core.agent import FunctionCallingAgent
from llama_index.core.tools import FunctionTool
from llama_index.core.workflow import Context
from llama_index.core.workflow import Event, Workflow, Context, StopEvent, step
from llama_index.core.workflow import StartEvent
from llama_index.experimental.query_engine import PandasQueryEngine
from llama_index.llms.openai import OpenAI

# Read variables from a local .env file (if any) and apply nest_asyncio for async operations
load_dotenv()
nest_asyncio.apply()

# Resolve the OpenAI key: prefer the environment, otherwise prompt for it interactively
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass("Enter OPENAI_API_KEY: ")

# Shared LLM instance used by the agents and the pandas query engine
llm = OpenAI(
    model="gpt-4o-2024-11-20",
    api_key=OPENAI_API_KEY,
    temperature=0.5,
    max_tokens=512,
)

In [2]:
async def execute_pandas_query_tool(ctx: Context, query_str: str) -> str:
    """
    Executes a pandas query string against the DataFrame in the context.
    Handles both queries returning results and assignments/inplace modifications.

    Code that contains an assignment, a 'del' statement or an 'inplace=True'
    argument is executed as a modification and the resulting 'df' replaces the
    DataFrame stored in the context. Anything else (including comparisons such
    as 'df[df["x"] == 1]') is treated as a read-only query and its result is
    returned as text.

    Args:
        ctx (Context): The workflow context containing 'dataframe' and 'query_engine'.
        query_str (str): The pandas query/command to execute. Must use 'df'.

    Returns:
        str: The query result, a success message (with any warnings raised by
        the code), or a message starting with "Error" describing the failure.
    """
    try:
        query_engine: PandasQueryEngine = await ctx.get("query_engine")
        df: pd.DataFrame = await ctx.get("dataframe")  # Get current DataFrame

        if df is None:
            return "Error: DataFrame not found in context."
        if query_engine is None:
            # Modifications might still work without engine, but queries won't.
            # Require the engine for consistency.
            return "Error: PandasQueryEngine not found in context."

        print(f"Tool executing query: {query_str}")

        if _is_modification_query(query_str):
            return await _run_modification(ctx, query_engine, df, query_str)
        return _run_read_query(query_engine, query_str)

    except Exception as e:
        # Last-resort guard: the tool must always hand a string back to the agent.
        print(f"Tool general error processing query '{query_str}': {e}\n{traceback.format_exc()}")
        return f"Error processing query '{query_str}': {e}"


def _is_modification_query(query_str: str) -> bool:
    """Return True when ``query_str`` is Python code that rebinds or mutates state.

    A plain substring test for '=' would also match '==', '>=', '!=' and keyword
    arguments, so the code is parsed and inspected for assignment statements,
    ``del`` statements or an ``inplace=True`` keyword argument instead.
    """
    try:
        tree = ast.parse(query_str)
    except (SyntaxError, ValueError):
        # Not valid Python (e.g. a natural-language question): let the query engine handle it.
        return False

    for node in ast.walk(tree):
        if isinstance(node, (ast.Assign, ast.AugAssign, ast.AnnAssign, ast.Delete)):
            return True
        if (
            isinstance(node, ast.keyword)
            and node.arg == "inplace"
            and isinstance(node.value, ast.Constant)
            and node.value.value is True
        ):
            return True
    return False


def _sync_query_engine(query_engine, frame) -> None:
    """Point the query engine at ``frame`` so later queries see the modified data."""
    # This assumes the experimental engine uses _df internally.
    query_engine._df = frame
    # NOTE(review): PandasQueryEngine appears to evaluate generated code through its
    # instruction parser, which keeps its own DataFrame reference in `.df` -- confirm
    # against the installed llama-index-experimental version.
    parser = getattr(query_engine, "_instruction_parser", None)
    if parser is not None and hasattr(parser, "df"):
        parser.df = frame


async def _run_modification(ctx, query_engine, df, query_str: str) -> str:
    """Execute modifying code on a copy of ``df`` and publish the result to the context."""
    # A single namespace is used for both globals and locals: with separate dicts,
    # lambdas/comprehensions inside the executed code cannot see names (including
    # 'df') that the snippet itself defines.
    namespace = {"pd": pd, "df": df.copy()}  # copy: a failed run must not corrupt the original

    try:
        # SECURITY: this runs arbitrary LLM-generated code in the kernel process.
        # Only use it with trusted data/prompts or inside a sandbox.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            exec(query_str, namespace)
    except Exception as e:
        # Log the full traceback for debugging exec errors
        print(f"Tool exec error for query '{query_str}': {e}\n{traceback.format_exc()}")
        return f"Error executing modification '{query_str}': {e}"

    modified_df = namespace.get("df")
    if not isinstance(modified_df, pd.DataFrame):
        # e.g. "df = df['Time'].mean()" -- refuse to replace the dataset with a scalar.
        return (
            f"Error executing modification '{query_str}': 'df' must remain a pandas DataFrame "
            f"(got {type(modified_df).__name__}). No changes were applied."
        )

    # Update the DataFrame in the workflow context
    await ctx.set("dataframe", modified_df)

    try:
        _sync_query_engine(query_engine, modified_df)
    except AttributeError:
        print("Warning: Could not directly update query_engine._df. Engine might use stale data for subsequent queries.")

    result = "Executed modification successfully."
    if caught:
        # Surface warnings (e.g. pandas FutureWarning for chained inplace calls) so the
        # agent can retry with an explicit assignment if the change did not stick.
        notes = list(dict.fromkeys(f"{w.category.__name__}: {w.message}" for w in caught))
        result += " Warnings raised (the change may not have taken effect): " + "; ".join(notes)
    print(f"Tool modification result: {result}")
    return result


def _run_read_query(query_engine, query_str: str) -> str:
    """Send a read-only query to the pandas query engine and return its answer as text."""
    try:
        response = query_engine.query(query_str)
        result = str(response)
        lowered = result.lower()
        # Check for known error patterns from the engine's response string
        if "error" in lowered and ("syntax" in lowered or "invalid" in lowered or "Traceback" in result):
            error_msg = f"Query engine failed for '{query_str}': {result}"
            print(error_msg)
            return error_msg
        print(f"Tool query result: {result[:500]}...")
        return result
    except Exception as e:
        print(f"Error during query_engine.query('{query_str}'): {e}\n{traceback.format_exc()}")
        return f"Error during query_engine.query('{query_str}'): {e}"


In [9]:
# Add this code in a new cell after the 'execute_pandas_query_tool' definition cell

async def save_dataframe_tool(ctx: Context, file_path: str) -> str:
    """
    Saves the current DataFrame in the context to a CSV file.

    Args:
        file_path (str): The full path (including filename) where the CSV should be saved.
                         Example: 'C:/path/to/modified_data.csv'
    """
    # The docstring above is passed to the agent as the tool description, so its
    # wording is kept as-is.
    try:
        frame: pd.DataFrame = await ctx.get("dataframe")
        if frame is None:
            return "Error: DataFrame not found in context."

        # Create the parent folder first; a bare filename has no folder to create.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        print(f"Tool attempting to save DataFrame to: {file_path}")
        frame.to_csv(file_path, index=False)  # index is not written to the file

        outcome = f"DataFrame successfully saved to {file_path}"
        print(outcome)
        return outcome
    except Exception as exc:
        # Report the failure to the agent as text instead of raising.
        outcome = f"Error saving DataFrame to '{file_path}': {exc}"
        print(outcome)
        return outcome



In [10]:

# Create the agents - 
def create_agents():
    """Create and return the agents needed for our workflow"""

   
    # Update docstring for the bound function if necessary, or rely on the original
    pandas_query_tool = FunctionTool.from_defaults(
        async_fn=execute_pandas_query_tool,  # <-- use async_fn
        name="execute_pandas_query_tool",
        description=execute_pandas_query_tool.__doc__
    )
    save_df_tool = FunctionTool.from_defaults(
        async_fn=save_dataframe_tool,        # <-- use async_fn
        name="save_dataframe_tool",
        description=save_dataframe_tool.__doc__
    )


    data_prep_agent = FunctionCallingAgent.from_tools(
        tools=[],
        llm=llm,
        verbose=False,
        system_prompt="You are a data preparation agent. Your job is to describe the necessary steps to clean, transform, and prepare data for analysis based on provided statistics. "
                     "You handle tasks like dealing with missing values, normalizing data, feature engineering, and ensuring data quality."
    )
    
    data_analysis_agent = FunctionCallingAgent.from_tools(
        tools=[pandas_query_tool, save_df_tool],
        llm=llm,
        verbose=True,
        # Updated system prompt
        system_prompt=(
            "You are a data analysis agent. Your job is to:\n"
            "1. Receive a data preparation description.\n"
            "2. Generate and execute pandas commands (using 'df') via the 'execute_pandas_query_tool' to perform the described cleaning/modifications (e.g., imputation, outlier handling, typo correction).\n"
            "3. Perform further analysis on the MODIFIED data using the 'execute_pandas_query_tool'.\n"
            "4. Generate a concise Markdown report summarizing:\n"
            "    - The cleaning/modification steps you executed.\n"
            "    - Key findings from your analysis of the modified data.\n"
            "5. Save the MODIFIED DataFrame to a new CSV file using the 'save_dataframe_tool'. Name the file by appending '_modified' to the original filename (e.g., if original was data.csv, save as data_modified.csv)."
        )
    )
    
    return data_prep_agent, data_analysis_agent

In [5]:

class DataPrepEvent(Event):
    original_path: str
    column_names: list[str]
    
class DataAnalysisEvent(Event):
    prepared_data_description: str
    original_path: str

# Define our multi-agent workflow
class DataAnalysisFlow(Workflow):
    
    @step
    async def setup(self, ctx: Context, ev: StartEvent) -> DataPrepEvent:
        """Initialize the agents and setup the workflow"""

       
        # --- Load data and create Pandas Query Engine ---
        try:
            df = pd.read_csv(ev.dataset_path)
            # Handle potential issues like missing values before creating the engine if needed
            # df = df.fillna('NA') # Example: fill NaNs, adjust as necessary
            
            # Create the query engine, passing the LLM is recommended
            query_engine = PandasQueryEngine(df=df, llm=llm, verbose=True) 
            
            # Store the DataFrame and query engine in the context
            await ctx.set("dataframe", df)
            await ctx.set("query_engine", query_engine)
            await ctx.set("original_path", ev.dataset_path)
            
            print(f"Successfully loaded {ev.dataset_path} and created PandasQueryEngine.")
            
            self.data_prep_agent, self.data_analysis_agent = create_agents()

            # Return event with basic info for the next step
            return DataPrepEvent(
                original_path=ev.dataset_path, 
                column_names=df.columns.tolist()
            )
        except Exception as e:
            print(f"Error during setup: Failed to load {ev.dataset_path} or create engine. Error: {e}")
            # Need to decide how to handle errors - perhaps raise or return a specific error event
            # For now, returning an empty event or raising might be options. Let's raise.
            raise ValueError(f"Setup failed: {e}")
        
    @step
    async def data_preparation(self, ctx: Context, ev: DataPrepEvent) -> DataAnalysisEvent:
        """Use the data prep agent to suggest cleaning/preparation based on schema."""
        query_engine: PandasQueryEngine = await ctx.get("query_engine")
        df: pd.DataFrame = await ctx.get("dataframe")
        
        # Get more comprehensive initial info using the engine
        try:
            # --- CHANGE: Use describe(include='all') ---
            initial_info = str(query_engine.query("Show the shape of the dataframe (number of rows and columns) and the output of df.describe(include='all')"))
            print(f"--- Initial Info for Prep Agent ---\n{initial_info}\n------------------------------------") 
        except Exception as e:
            print(f"Warning: Could not query initial info from engine: {e}")
            initial_info = f"Columns: {ev.column_names}"

        # Ask the data preparation agent for preparation steps/description
        # --- CHANGE: Refined Prompt ---
        prep_prompt = (
            f"The dataset (from {ev.original_path}) has the following shape and summary statistics:\n{initial_info}\n\n" 
            f"Based *only* on these statistics, describe the necessary data preparation steps. "
            f"Specifically mention potential issues like outliers (e.g., in 'Distance' max value), missing values (e.g., count mismatch in 'Time'), "
            f"and data quality issues in categorical columns (e.g., unique count vs expected for 'Mode', potential typos like 'Bas', 'Cra', 'Walt'). "
            f"Suggest specific actions like imputation for 'Time', outlier investigation for 'Distance', and checking unique values/correcting typos in 'Mode'. "
            f"Focus on describing *what* needs to be done and *why* based *strictly* on the provided stats. If no issues are apparent from the stats, state that clearly. ALWAYS provide a description."
        )
        result = self.data_prep_agent.chat(prep_prompt)
        
        prepared_data_description = None # Initialize
        if hasattr(result, 'response'):
            prepared_data_description = result.response
            # --- CHANGE: Add check for None/empty response ---
            if not prepared_data_description: 
                prepared_data_description = "Agent returned an empty description despite the prompt."
                print("Warning: Agent response attribute was empty.")
            # --- END CHANGE ---
        else:
            # Fallback in case the response structure is different
            prepared_data_description = "Could not extract data preparation description from agent response."
            print(f"Warning: Agent response does not have expected 'response' attribute. Full result: {result}")
        
        # Optional: Print the description generated by the agent
        print(f"--- Prep Agent Description Output ---\n{prepared_data_description}\n------------------------------------")
        
        await ctx.set("prepared_data_description", prepared_data_description)
        
        return DataAnalysisEvent(
            prepared_data_description=prepared_data_description,
            original_path=ev.original_path
        )
    
    @step
    async def data_analysis(self, ctx: Context, ev: DataAnalysisEvent) -> StopEvent:
        """Use the analysis agent to perform modifications, analyze, report, and save."""
   
        # Construct the path for the modified file
        original_path = await ctx.get("original_path")
        path_parts = os.path.splitext(original_path)
        modified_file_path = f"{path_parts[0]}_modified{path_parts[1]}"

        # Prompt the agent to perform analysis using its tool
        analysis_request = (
            f"The dataset originally came from: {original_path}\n"
            f"Here is the data preparation description outlining necessary changes:\n"
            f"<preparation_description>\n{ev.prepared_data_description}\n</preparation_description>\n\n"
            f"Now, please perform the following actions:\n"
            f"1. Execute the necessary pandas commands (using 'df') to apply the cleaning/modifications described above. Use the 'execute_pandas_query_tool'. Handle potential errors gracefully.\n"
            f"2. After modifying the data, perform a brief analysis focusing on the impact of the changes (e.g., check 'Time' description after imputation, 'Mode' unique values after correction, effect of handling 'Distance' outliers if any were addressed). Use the 'execute_pandas_query_tool'.\n"
            f"3. Generate a Markdown report summarizing the modifications performed and the key analysis findings.\n"
            f"4. Save the modified DataFrame to the following path using the 'save_dataframe_tool': '{modified_file_path}'"
        )

        print(f"--- Prompting Data Analysis Agent ---\n{analysis_request}\n------------------------------------")

        # The agent will now use the tool internally based on the prompt
        agent_response = self.data_analysis_agent.chat(analysis_request)

        # Extract the final report text
        final_report = "Agent did not provide a valid report."
        if hasattr(agent_response, 'response') and agent_response.response:
             final_report = agent_response.response
             # The agent's response should ideally be the Markdown report
        else:
             print(f"Warning: Agent response might not be the expected report. Full result: {agent_response}")
             final_report = str(agent_response) # Fallback

        print(f"--- Data Analysis Agent Final Response (Report) ---\n{final_report}\n------------------------------------------")

        # Store the report
        await ctx.set("final_report", final_report)

        # Return a StopEvent with the final report
        # The DataFrame saving is handled by the agent via the tool
        return StopEvent(result={
            "final_report": final_report
        })
    

In [6]:

async def run_workflow(dataset_path):
    """Run the data analysis workflow on the given dataset.

    Prints the final report and returns the workflow's result dictionary, or
    prints the failure (with traceback) and returns None if the run raises.
    """
    # Generous timeout: query engine + LLM calls can take a while.
    flow = DataAnalysisFlow(timeout=240, verbose=True)

    try:
        # Agents are created in the setup step; dataset_path reaches it via the StartEvent.
        outcome = await flow.run(dataset_path=dataset_path)

        print("\n==== Final Report ====")
        print(outcome.get('final_report', 'N/A'))  # the Markdown report
        return outcome
    except Exception as err:
        print(f"Workflow failed: {err}")
        import traceback
        traceback.print_exc()
        return None

In [7]:
# Example: Run the workflow with a CSV file
# You'll need to update this with your actual dataset path
dataset_path = r"C:\Users\anteb\Desktop\Courses\Projects\data_analysis_ai\data_analysis_agent\Commute_Times_V1.csv"

# Execute the workflow
await run_workflow(dataset_path)

Running step setup
Successfully loaded C:\Users\anteb\Desktop\Courses\Projects\data_analysis_ai\data_analysis_agent\Commute_Times_V1.csv and created PandasQueryEngine.
Step setup produced event DataPrepEvent
Running step data_preparation
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
> Pandas Instructions:
```
(df.shape, df.describe(include='all'))
```
> Pandas Output: ((281, 4),               Case Mode    Distance        Time
count   281.000000  281  281.000000  278.000000
unique         NaN    9         NaN         NaN
top            NaN  Car         NaN         NaN
freq           NaN   84         NaN         NaN
mean    140.978648  NaN    3.658007   19.622302
std      81.287714  NaN    8.206031   13.720435
min       1.000000  NaN    0.200000    2.000000
25%      71.000000  NaN    1.700000   10.000000
50%     141.000000  NaN    3.000000   16.000000
75%     211.0

{'final_report': 'It seems I am unable to proceed with these operations because the required context is missing. Could you please ensure that the dataset is properly loaded into the context so I can perform the modifications?'}