<a href="https://colab.research.google.com/github/DonErnesto/data-agent/blob/master/notebooks/01_experiment_simple_agent_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# --- Colab Bootstrap for data-agent ---
# Purpose: load secrets from Colab, optionally (re)clone the repository and
# install its dependencies, make the `agent` package importable, and verify
# the import. The three flags below select which optional steps run.
clone_github = False          # True: wipe and re-clone the repo into /content/data-agent
install_dependencies = False  # True: pip-install requirements-colab.txt
install_e = False             # True: editable install; False: extend sys.path instead

import os
import sys
import pandas as pd
from google.colab import userdata

# NOTE(review): this assumes userdata.get returns a falsy value when the secret
# is missing; if it raises instead, the else-branch below is never reached — confirm.
api_key = userdata.get('OPENAI_API_KEY')
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
    print("✅ OPENAI_API_KEY loaded")
else:
    print("⚠️ OPENAI_API_KEY not found in Colab secrets")


# Get GitHub token from Colab secrets
# (fetched unconditionally, even when clone_github is False)
github_token = userdata.get('GITHUB_TOKEN')

if clone_github:
    if github_token:
        !rm -rf /content/data-agent   # remove old copy if it exists
        # Use the token for authentication and specify the target directory
        # NOTE(review): the token is interpolated into the clone URL, so it may
        # appear in command output — clear this cell's output before sharing.
        !git clone https://{github_token}@github.com/DonErnesto/data-agent.git /content/data-agent
        %cd /content/data-agent
        print("✅ Repository cloned successfully")
    else:
        print("❌ GITHUB_TOKEN not found in Colab secrets. Please add it to clone the repository.")

# Uses a relative path, so this resolves against the current working directory.
if install_dependencies:
    !pip install -r requirements-colab.txt
    print("✅ Dependencies installed")

if install_e:
    !pip install -e .
    print("✅ data_agent package installed in editable mode")
else:
    # Add the explicit path to the src directory
    src_path = "/content/data-agent/src"
    if src_path not in sys.path:
      sys.path.append(src_path)
      print(f"✅ {src_path} added to sys.path")
    else:
      print(f"ℹ️ {src_path} already in sys.path")


# Test import - wrapped in try/except so that a wrong path prints a hint
# instead of raising; on success `Agent` is bound at module level.
try:
    from agent.agent import Agent
    print("✅ Agent imported successfully")
    # Comment out the example usage for now to isolate the import test
    # print("✅ Repo bootstrap complete. Example:", Agent().step())
except ImportError as e:
    print(f"❌ ImportError: {e}. Ensure the import path 'agent.agent' is correct and the necessary files (__init__.py, agent.py) exist.")



✅ OPENAI_API_KEY loaded
✅ /content/data-agent/src added to sys.path
✅ Agent imported successfully


.:
notebooks  requirements-colab.txt  setup.py  tests
README.md  requirements.txt	   src

./notebooks:
01_experiment_simple_agent_loop.ipynb  01_initial_experiments.ipynb

./src:
agent  data_agent.egg-info  utils

./src/agent:
actions.py  agent.py  goals.py	__init__.py

./src/data_agent.egg-info:
dependency_links.txt  PKG-INFO	SOURCES.txt  top_level.txt

./src/utils:
__init__.py  io_utils.py

./tests:
test_agent.py


In [3]:
## Concrete use case: describing data interactively

"""
Simple loop:
- User asks a question about the data
- Agent returns the arguments for a pandas function to answer that question
- User returns the pandas results
- Agent describes the data and terminates.

TODOs:
+ We don't want to see the raw pandas results; we want to see how the LLM describes them.

"""

import json
import os
from typing import List, Optional

from litellm import completion
from pandas import DataFrame

def describe_dataframe(df: Optional[DataFrame] = None) -> str:
    """Describe the contents of a pandas DataFrame.

    Args:
        df: Frame to summarise. When omitted, the module-level ``df`` is used.
            The tool schema advertises this function without parameters, so
            the agent loop invokes it with no arguments; a required ``df``
            made every such call fail with ``TypeError``.

    Returns:
        The ``DataFrame.describe()`` table rendered as plain text.

    Raises:
        NameError: If ``df`` is omitted and no module-level ``df`` exists.
    """
    if df is None:
        # The parameter shadows the global of the same name, so the fallback
        # has to go through globals() explicitly.
        if "df" not in globals():
            raise NameError("name 'df' is not defined")
        df = globals()["df"]
    return df.describe().to_string()

def list_column_names_of_dataframe(df: Optional[DataFrame] = None) -> List[str]:
    """List column names of a pandas DataFrame.

    Args:
        df: Frame whose columns are listed. When omitted, the module-level
            ``df`` is used. The tool schema advertises this function without
            parameters, so the agent loop invokes it with no arguments; a
            required ``df`` made every such call fail with ``TypeError``.

    Returns:
        The column labels, in frame order, as a plain list.

    Raises:
        NameError: If ``df`` is omitted and no module-level ``df`` exists.
    """
    if df is None:
        # The parameter shadows the global of the same name, so the fallback
        # has to go through globals() explicitly.
        if "df" not in globals():
            raise NameError("name 'df' is not defined")
        df = globals()["df"]
    return list(df.columns)

def show_datatype_of_column(column_name: str) -> str:
    """Return the dtype of one column of the module-level ``df`` as text.

    Args:
        column_name: Label of the column to inspect.

    Returns:
        ``str()`` of the column's dtype.
    """
    selected_column = df[column_name]
    dtype_text = str(selected_column.dtype)
    return dtype_text

def describe_column(column_name: str) -> str:
    """Summarise one column of the module-level ``df``.

    Args:
        column_name: Label of the column to summarise.

    Returns:
        The column's ``describe()`` output rendered as plain text.
    """
    column_summary = df[column_name].describe()
    return column_summary.to_string()

def translate_pd_to_human(message) -> None:
    """Print a human-readable description of pandas results.

    Args:
        message: The description to show; it is formatted into a fixed
            sentence prefix and written to standard output.

    Returns:
        None. The function only prints.
    """
    announcement = "The pandas results can be described as follows: {}".format(message)
    print(announcement)


# Dispatch table: tool name requested by the model -> callable implementing it.
# Each key is identical to the callable's own __name__, so the mapping is
# derived from the functions themselves (insertion order is preserved).
tool_functions = {
    tool_fn.__name__: tool_fn
    for tool_fn in (
        list_column_names_of_dataframe,
        describe_dataframe,
        show_datatype_of_column,
        describe_column,
        translate_pd_to_human,
    )
}

# Tool schemas handed to the model. Every entry shares the same envelope
# ({"type": "function", "function": {...}} with an object-typed parameter
# block), so only name, description, properties and required names vary.
tools = [
    {
        "type": "function",
        "function": {
            "name": fn_name,
            "description": fn_description,
            "parameters": {
                "type": "object",
                "properties": fn_properties,
                "required": fn_required,
            },
        },
    }
    for fn_name, fn_description, fn_properties, fn_required in (
        (
            "describe_dataframe",
            "Describe the contents of a pandas DataFrame.",
            {},
            [],
        ),
        (
            "list_column_names_of_dataframe",
            "List column names of a pandas DataFrame.",
            {},
            [],
        ),
        (
            "show_datatype_of_column",
            "Show the datatype of a column in a pandas DataFrame.",
            {
                "column_name": {
                    "type": "string",
                    "description": "The name of the column to show the datatype of.",
                },
            },
            ["column_name"],
        ),
        (
            "describe_column",
            "Describe the contents of a column in a pandas DataFrame.",
            {
                "column_name": {
                    "type": "string",
                    "description": "The name of the column to describe.",
                },
            },
            ["column_name"],
        ),
        (
            "terminate",
            "Terminates the conversation. No further actions or interactions are possible after this. Prints the provided message for the user.",
            {
                "message": {"type": "string"},
            },
            ["message"],
        ),
    )
]

# System prompt prepended to the conversation on every model call.
# The text is spelled out line by line; adjacent literals are concatenated.
agent_rules = [
    {
        "role": "system",
        "content": (
            "\n"
            "You are an AI agent that can perform tasks by using available tools.\n"
            "\n"
            "If a user asks questions about the characteristics of a dataframe, its columns or its data,\n"
            "execute the right pandas tool to obtain these results. This may require repeated function calls.\n"
            'When the desired results are obtained, provide a human-interpretable summary of the results using the "terminate" tool.\n'
        ),
    }
]

# Initialize agent parameters
iterations = 0
max_iterations = 10  # upper bound on model round-trips so the loop always ends

user_task = input("What would you like me to do? ")

# Conversation history without the system prompt (that is prepended per call).
memory = [{"role": "user", "content": user_task}]

# Module-level frame; the column tools read it through the global name `df`.
df = pd.read_csv('sample_data/california_housing_train.csv')

# The Agent Loop
# Each pass: ask the model, run the first tool it requests, feed the result
# back as the next user message. Ends on the "terminate" tool, on a plain-text
# reply, or when max_iterations is reached.
while iterations < max_iterations:
    iterations += 1
    print(f"Iteration {iterations}")

    messages = agent_rules + memory

    response = completion(
        model="openai/gpt-4o",
        messages=messages,
        tools=tools,
        max_tokens=1024
    )

    reply = response.choices[0].message

    if reply.tool_calls:
        # Only the first requested tool call is executed per iteration.
        tool = reply.tool_calls[0]
        tool_name = tool.function.name

        # Malformed or non-object arguments are reported back to the model
        # as an error result instead of aborting the whole loop.
        args_error = None
        try:
            tool_args = json.loads(tool.function.arguments or "{}")
            if not isinstance(tool_args, dict):
                raise ValueError("tool arguments must be a JSON object")
        except (TypeError, ValueError) as e:  # JSONDecodeError is a ValueError
            tool_args = {}
            args_error = f"Invalid arguments for {tool_name}: {str(e)}"

        action = {
            "tool_name": tool_name,
            "args": tool_args
        }

        if args_error is not None:
            result = {"error": args_error}
        elif tool_name == "terminate":
            # .get: a terminate call without a message still ends the loop cleanly.
            print(f"Termination message: {tool_args.get('message', '')}")
            break
        elif tool_name in tool_functions:
            try:
                result = {"result": tool_functions[tool_name](**tool_args)}
            except Exception as e:
                # Deliberately broad: any tool failure is handed to the model as text.
                result = {"error":f"Error executing {tool_name}: {str(e)}"}
        else:
            result = {"error": f"Unknown tool: {tool_name}"}

        print(f"    int. message.Executing: {tool_name} with args {tool_args}")
        print(f"    int. message. Result: {result}")
        memory.extend([
            {"role": "assistant", "content": json.dumps(action)},
            # default=str keeps serialisation from failing on values that are
            # not JSON-native; JSON-native values are encoded as before.
            {"role": "user", "content": json.dumps(result, default=str)}
        ])
    else:
        result = reply.content
        print(f"Response: {result}")
        break
else:
    # Reached only when the while-condition turns false, i.e. no `break` happened.
    print(f"Stopped after {max_iterations} iterations without a final answer.")

What would you like me to do? find the data type of the column with the largest mean
Iteration 1
    int. message.Executing: describe_dataframe with args {}
    int. message. Result: {'result': '          longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value\ncount  17000.000000  17000.000000        17000.000000  17000.000000    17000.000000  17000.000000  17000.000000   17000.000000        17000.000000\nmean    -119.562108     35.625225           28.589353   2643.664412      539.410824   1429.573941    501.221941       3.883578       207300.912353\nstd        2.005166      2.137340           12.586937   2179.947071      421.499452   1147.852959    384.520841       1.908157       115983.764387\nmin     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000\n25%     -121.790000     33.930000           18.000000   1462.0000

[{'role': 'user', 'content': 'describe my dataframe'},
 {'role': 'assistant',
  'content': '{"tool_name": "describe_dataframe", "args": {}}'},
 {'role': 'user',
  'content': '{"result": "          longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value\\ncount  17000.000000  17000.000000        17000.000000  17000.000000    17000.000000  17000.000000  17000.000000   17000.000000        17000.000000\\nmean    -119.562108     35.625225           28.589353   2643.664412      539.410824   1429.573941    501.221941       3.883578       207300.912353\\nstd        2.005166      2.137340           12.586937   2179.947071      421.499452   1147.852959    384.520841       1.908157       115983.764387\\nmin     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000\\n25%     -121.790000     33.930000           18.000000   1462.000000 