<a href="https://colab.research.google.com/github/DonErnesto/data-agent/blob/master/notebooks/01_experiment_simple_agent_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!!pip install litellm

# Important!!!
#
# <---- Set your 'OPENAI_API_KEY' as a secret over there with the "key" icon
#
#
import os
import pandas as pd
from google.colab import userdata
api_key = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = api_key

In [10]:
# --- Colab Bootstrap for data-agent ---

# 1. Clone your repo (if not already cloned)
!rm -rf data-agent   # remove old copy if it exists

# Get GitHub token from Colab secrets
import os
from google.colab import userdata
github_token = userdata.get('GITHUB_TOKEN')

if github_token:
    # Use the token for authentication
    !git clone https://{github_token}@github.com/DonErnesto/data-agent.git
    %cd data-agent
    print("✅ Repository cloned successfully")

    # 2. Install dependencies (colab-specific)
    !pip install -r requirements-colab.txt
    print("✅ Dependencies installed")

    # 3. Add src/ to sys.path so you can import your agent code
    # Based on the directory listing, src/ does not exist, so we add the current directory
    import sys
    sys.path.append(".")
    print("✅ Current directory added to sys.path")


    # 4. (Optional) Set API key securely in Colab
    api_key = userdata.get("OPENAI_API_KEY")
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
        print("✅ OPENAI_API_KEY loaded")
    else:
        print("⚠️ OPENAI_API_KEY not found in Colab secrets")

    # 5. Test import
    try:
        from agent import Agent
        print("✅ Agent imported successfully")
        print("✅ Repo bootstrap complete. Example:", Agent().step())
    except ModuleNotFoundError as e:
        print(f"❌ ModuleNotFoundError: {e}. Ensure 'agent.py' exists in the cloned repository.")
else:
    print("❌ GITHUB_TOKEN not found in Colab secrets. Please add it to clone the repository.")

Cloning into 'data-agent'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 19 (delta 2), reused 15 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 6.21 KiB | 6.21 MiB/s, done.
Resolving deltas: 100% (2/2), done.
/content/data-agent/data-agent/data-agent
✅ Repository cloned successfully
✅ Dependencies installed
✅ Current directory added to sys.path
✅ OPENAI_API_KEY loaded
❌ ModuleNotFoundError: No module named 'agent'. Ensure 'agent.py' exists in the cloned repository.


In [11]:
        # Manual re-run of the bootstrap cell's import smoke test.
        from agent import Agent
        print("✅ Agent imported successfully")
        print(
            "✅ Repo bootstrap complete. Example:",
            Agent().step(),
        )

ModuleNotFoundError: No module named 'agent'

In [3]:
## Concrete use case: describing data interactively

"""
Simple loop:
- User asks a question about the data
- Agent returns the name and arguments of a pandas helper function to call to answer that question
- User returns the pandas results
- Agent describes the data and terminates.

TODOs:
+ we don't want to see the results from pandas. We want to see how the LLM describes them

"""

import json
import os
from typing import List

from litellm import completion

def describe_dataframe() -> str:
    """Summarise the module-level DataFrame ``df``.

    Returns:
        The table produced by ``df.describe()``, rendered as plain text.
    """
    summary_table = df.describe()
    return summary_table.to_string()

def list_column_names_of_dataframe() -> List[str]:
    """Return the column labels of the module-level DataFrame ``df`` as a list."""
    return [label for label in df.columns]

def show_datatype_of_column(column_name: str) -> str:
    """Report the dtype of one column of the module-level DataFrame ``df``.

    Args:
        column_name: Label of the column to look up in ``df``.

    Returns:
        The column's dtype converted to a string.
    """
    selected_column = df[column_name]
    return str(selected_column.dtype)

def describe_column(column_name: str) -> str:
    """Summarise one column of the module-level DataFrame ``df``.

    Args:
        column_name: Label of the column to look up in ``df``.

    Returns:
        The output of ``describe()`` for that column, rendered as plain text.
    """
    column_summary = df[column_name].describe()
    return column_summary.to_string()

def translate_pd_to_human(message: str) -> None:
    """Print a human-readable description of pandas results.

    This helper only prints; it does not stop the agent loop by itself.
    The loop ends when the model calls the "terminate" tool or answers
    without a tool call.

    Args:
        message: The description to show to the user.
    """
    print(f"The pandas results can be described as follows: {message}")


# Registry used by the agent loop to look up the Python callable for a tool name.
# Each function is registered under its own name.
tool_functions = {
    handler.__name__: handler
    for handler in (
        list_column_names_of_dataframe,
        describe_dataframe,
        show_datatype_of_column,
        describe_column,
        translate_pd_to_human,
    )
}

def _function_tool(name, description, properties=None):
    """Build one function-tool schema entry; every listed property is marked required."""
    properties = properties or {}
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": list(properties),
            },
        },
    }


# Tool schemas passed to the model via `completion(..., tools=tools)`.
tools = [
    _function_tool(
        "describe_dataframe",
        "Describe the contents of a pandas DataFrame.",
    ),
    _function_tool(
        "list_column_names_of_dataframe",
        "List column names of a pandas DataFrame.",
    ),
    _function_tool(
        "show_datatype_of_column",
        "Show the datatype of a column in a pandas DataFrame.",
        {
            "column_name": {
                "type": "string",
                "description": "The name of the column to show the datatype of.",
            }
        },
    ),
    _function_tool(
        "describe_column",
        "Describe the contents of a column in a pandas DataFrame.",
        {
            "column_name": {
                "type": "string",
                "description": "The name of the column to describe.",
            }
        },
    ),
    _function_tool(
        "terminate",
        "Terminates the conversation. No further actions or interactions are possible after this. Prints the provided message for the user.",
        {"message": {"type": "string"}},
    ),
]

agent_rules = [{
    "role": "system",
    "content": """
You are an AI agent that can perform tasks by using available tools.

If a user asks questions about the characteristics of a dataframe, its columns or its data,
execute the right pandas tool to obtain these results. This may require repeated function calls.
When the desired results are obtained, provide a human-interpretable summary of the results using the "terminate" tool.
"""
}]

# Initialize agent parameters
iterations = 0
max_iterations = 10

# Load the data before prompting, so a missing file fails before the user types a task.
# The relative path only resolves while the working directory is Colab's default
# (/content). Fall back to the absolute location in case an earlier cell changed
# directory (the bootstrap cell does a %cd into the cloned repo).
data_file = "sample_data/california_housing_train.csv"
if not os.path.exists(data_file):
    data_file = os.path.join("/content", data_file)
df = pd.read_csv(data_file)

user_task = input("What would you like me to do? ")

# Conversation history: the user's task followed by (action, result) pairs.
memory = [{"role": "user", "content": user_task}]

# The Agent Loop
# Each iteration sends system rules + memory to the model, runs the single tool
# it requests and feeds the outcome back. The loop stops on the "terminate"
# tool, on a plain-text answer, or when max_iterations is exhausted.
while iterations < max_iterations:
    iterations += 1
    print(f"Iteration {iterations}")

    messages = agent_rules + memory

    response = completion(
        model="openai/gpt-4o",
        messages=messages,
        tools=tools,
        max_tokens=1024
    )
    reply = response.choices[0].message

    if not reply.tool_calls:
        # No tool requested: treat the text as the final answer.
        print(f"Response: {reply.content}")
        break

    # Only the first requested tool call is executed per iteration.
    tool = reply.tool_calls[0]
    tool_name = tool.function.name
    raw_args = tool.function.arguments

    # Decode the arguments defensively: the model may send an empty string,
    # malformed JSON or a non-object value. Report that back instead of crashing.
    tool_args = {}
    result = None
    try:
        parsed_args = json.loads(raw_args) if raw_args else {}
        if not isinstance(parsed_args, dict):
            raise ValueError("tool arguments must be a JSON object")
        tool_args = parsed_args
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        result = {"error": f"Invalid arguments for {tool_name}: {e}"}

    action = {
        "tool_name": tool_name,
        "args": tool_args
    }

    if result is None:
        if tool_name == "terminate":
            # .get avoids a KeyError if the model omits the message.
            print(f"Termination message: {tool_args.get('message', '')}")
            break
        elif tool_name in tool_functions:
            try:
                result = {"result": tool_functions[tool_name](**tool_args)}
            except Exception as e:
                # Deliberately broad: any tool failure is reported back to the
                # model so it can correct its call on the next iteration.
                result = {"error": f"Error executing {tool_name}: {str(e)}"}
        else:
            result = {"error": f"Unknown tool: {tool_name}"}

    print(f"    int. message.Executing: {tool_name} with args {tool_args}")
    print(f"    int. message. Result: {result}")
    memory.extend([
        {"role": "assistant", "content": json.dumps(action)},
        # default=str keeps the loop alive if a tool returns a value that is
        # not JSON-serializable.
        {"role": "user", "content": json.dumps(result, default=str)}
    ])
else:
    # Reached only when the loop ends without `break`, i.e. the budget ran out.
    print(f"Stopped after {max_iterations} iterations without a final answer.")

What would you like me to do? find the data type of the column with the largest mean
Iteration 1
    int. message.Executing: describe_dataframe with args {}
    int. message. Result: {'result': '          longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value\ncount  17000.000000  17000.000000        17000.000000  17000.000000    17000.000000  17000.000000  17000.000000   17000.000000        17000.000000\nmean    -119.562108     35.625225           28.589353   2643.664412      539.410824   1429.573941    501.221941       3.883578       207300.912353\nstd        2.005166      2.137340           12.586937   2179.947071      421.499452   1147.852959    384.520841       1.908157       115983.764387\nmin     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000\n25%     -121.790000     33.930000           18.000000   1462.0000

In [None]:
# Display the conversation history accumulated by the agent loop
# (the user task followed by the recorded action/result message pairs).
memory

[{'role': 'user', 'content': 'describe my dataframe'},
 {'role': 'assistant',
  'content': '{"tool_name": "describe_dataframe", "args": {}}'},
 {'role': 'user',
  'content': '{"result": "          longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value\\ncount  17000.000000  17000.000000        17000.000000  17000.000000    17000.000000  17000.000000  17000.000000   17000.000000        17000.000000\\nmean    -119.562108     35.625225           28.589353   2643.664412      539.410824   1429.573941    501.221941       3.883578       207300.912353\\nstd        2.005166      2.137340           12.586937   2179.947071      421.499452   1147.852959    384.520841       1.908157       115983.764387\\nmin     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000\\n25%     -121.790000     33.930000           18.000000   1462.000000 