In [1]:
import os
import nest_asyncio
from dotenv import load_dotenv
import asyncio
from google.adk.agents import Agent, SequentialAgent, ParallelAgent, LoopAgent
from google.adk.models.google_llm import Gemini
from google.adk.runners import InMemoryRunner
from google.adk.tools import AgentTool, FunctionTool, google_search
from google.genai import types

from google.adk.tools.mcp_tool.mcp_toolset import McpToolset
from google.adk.tools.tool_context import ToolContext
from google.adk.tools.mcp_tool.mcp_session_manager import StdioConnectionParams
from mcp import StdioServerParameters

from google.adk.apps.app import App, ResumabilityConfig
from google.adk.tools.function_tool import FunctionTool

print("✅ ADK components imported successfully.")

✅ ADK components imported successfully.


In [2]:
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast with an actionable message. The previous form,
# os.environ[...] = os.getenv(...), raised an opaque
# "TypeError: str expected, not NoneType" when the variable was missing,
# and was a no-op self-assignment when it was present.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
print("✅ Gemini API key setup complete.")

✅ Gemini API key setup complete.


In [3]:
# Retry policy handed to the Gemini model constructed later in this notebook.
retry_config = types.HttpRetryOptions(
    http_status_codes=[429, 500, 503, 504],  # only these HTTP errors are retried
    initial_delay=1,
    exp_base=7,  # delay multiplier
    attempts=5,  # maximum retry attempts
)
print("✅ HttpRetryOptions Intialized successfully.")

✅ HttpRetryOptions Intialized successfully.


In [8]:
# Sample project request (goal, required stages, and constraints) written as
# free text. NOTE(review): the cell that consumes `Query` is not visible in
# this section - presumably it is sent to the planner agent; confirm.
Query = """
I want to build an end-to-end customer churn prediction system for an e-commerce company. 
The system must include data ingestion, EDA, feature engineering, baseline modeling, model evaluation, 
and a final API for predictions. 
Constraints: I have only 10 days, I'm working alone, and I can’t use paid cloud services.
Create a complete execution plan following the JSON schema.
"""

# Instruction prompt for the planner agent: turn a project goal into a
# JSON-only execution plan (schema, behaviour rules, refinement and fail-safe
# modes are all spelled out inside the prompt text itself).
# NOTE(review): the variable name is misspelled ("Instuction"); it is kept
# as-is because cells outside this section may reference it.
Planner_Instuction = """You are PlannerAgent. Your job is to convert any user project goal into a complete, structured execution plan.

STRICT RULES:
1. DO NOT ask the user for clarifications unless the user explicitly says “ask me questions”.
2. If any information is missing, automatically make reasonable assumptions.
3. Always output valid JSON ONLY, following the exact schema.
4. Tasks must be concrete, actionable, logically ordered, and achievable within the user’s constraints.
5. Never repeat the user’s prompt. Never add commentary.

OUTPUT SCHEMA:
{
  "project_name": "<string>",
  "summary": "<brief overview>",
  "tasks": [
    {
      "id": "<unique-id>",
      "title": "<short title>",
      "description": "<1-2 sentence description>",
      "priority": <integer 1-5>,
      "effort_hours": <number or null>,
      "dependencies": ["<task-id>", ...],
      "required_tools": ["<tool-name>", ...],
      "acceptance_criteria": "<clear measurable criteria>"
    }
  ]
}

BEHAVIOR RULES:
- Never ask the user for data sources, metrics, schemas, or definitions.
- If the user does not provide details, you MUST infer:
  • typical data schemas,
  • typical tools,
  • typical constraints,
  • typical domain assumptions.
- Plans must contain between 6 and 20 tasks max.
- Maintain accurate ordering via dependencies.
- Use realistic, compressed timelines when constraints mention short deadlines.
- Use generic open-source tools if the user prohibits paid cloud.

REFINEMENT MODE:
If the user provides feedback like “update this”, “change this part”, or “refine”, 
you MUST:
- Update the existing plan.
- Preserve task IDs where possible.
- Output updated full plan in the same JSON schema.

FAIL-SAFE MODE:
If the LLM produces invalid JSON internally:
- Regenerate silently.
- Final output must always be valid JSON.

"""
# Instruction prompt for the data-ingestion agent (passed as `instruction=`
# when `ingest_agent` is built). It describes two modes - a download request
# and a metadata-only lookup - names the four Kaggle MCP tools the agent may
# call, and fixes the JSON shape of the reply.
Ingestion_instruction = """
You are DataIngestionAgent. Your task is to act as a structured interface for Kaggle datasets by providing a single, complete JSON output.

***STRICT EXECUTION PROTOCOL (Follow Sequentially):***
1.  **IF** the user's request contains the word **'download'** (or a known misspelling like 'dowload'):
    a. **SKIP ALL OTHER STEPS in Protocol 2**.
    b. The agent MUST first call **'search_datasets'** to find the dataset's slugs.
    c. From the search results, extract the 'owner_slug', 'dataset_slug', and determine the single largest or most appropriate file name (e.g., 'Netflix_User_Ratings.csv' if not specified by the user).
    d. Proceed directly to call **'download_dataset(owner_slug, dataset_slug, file_name)'**.
    e. Return the final JSON with an acknowledgment/URL in the 'errors' array.

2.  **OTHERWISE (Pure Metadata Mode):**
    a. **SEARCH:** **ALWAYS** start by calling 'search_datasets' using the user query.
    b. **PROCESS:** From the search results, you MUST identify the **SINGLE MOST RELEVANT dataset** and extract its 'owner_slug' and 'dataset_slug'.
    c. **CHAINING:** You MUST sequentially call **'get_dataset_info'** and then **'list_dataset_files'** using the extracted slugs.
    d. **OUTPUT TRANSFORMATION (CRITICAL):** After all tool calls are complete, you MUST transform the raw data into the JSON OUTPUT SCHEMA.
Your job:
- Either find and gather metadata for the single most relevant dataset **OR** execute a direct download request.

ALLOWED MCP METHODS (Use these EXACT names):
1. search_datasets(search: str, sort_by: str = 'DATASET_SORT_BY_RELEVANCE') -> ApiListDatasetsResponse
2. get_dataset_info(owner_slug: str, dataset_slug: str) -> ApiDataset
3. list_dataset_files(owner_slug: str, dataset_slug: str) -> ApiListDatasetFilesResponse
4. download_dataset(owner_slug: str, dataset_slug: str, file_name: str) -> HttpRedirect

JSON OUTPUT SCHEMA (Only one dataset object in the array for this process):
{
  "datasets": [
    {
      "title": "<string>",
      "dataset_ref": "<owner_slug/dataset_slug>", 
      "description": "<string or null>",
      "license_name": "<string or null>",
      "total_bytes": <int or null>,
      "tags": ["tag1","tag2",...],
      "files": [
        {
          "filename": "<string>",
          "size_bytes": <int or null>,
          "file_ref": "<string>"
        }
      ]
    }
  ],
  "errors": []
}

RULES:
- **Output JSON only. ABSOLUTELY NO** markdown, no explanations, no commentary, and NO conversational replies.
- **For Download Requests:** Set the 'datasets' array to empty (`[]`). Add a success/failure message and any resulting URL/error from the `download_dataset` tool to the 'errors' array.
- **For Metadata Requests:** The 'datasets' array must contain the single, fully populated dataset object.
- If a tool call fails, continue the process using the available data and place the specific error message in the "errors" array.
"""

## 1. Ingestion Agent

In [10]:
# Kaggle credentials are secrets and must not be hardcoded in the notebook.
# They are expected in the process environment, e.g. via the .env file read
# by load_dotenv() in the API-key setup cell. Any key that was previously
# committed in this cell should be treated as leaked and rotated on Kaggle.
missing_kaggle_vars = [
    var for var in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.environ.get(var)
]
if missing_kaggle_vars:
    raise RuntimeError(
        "Missing Kaggle credentials: "
        + ", ".join(missing_kaggle_vars)
        + ". Set them in your .env file or shell environment."
    )

# Kaggle MCP toolset: the server is reached by spawning
# `npx -y mcp-remote https://www.kaggle.com/mcp` as a stdio subprocess.
kaggle_server_params = StdioServerParameters(
    command='npx',
    args=['-y', 'mcp-remote', 'https://www.kaggle.com/mcp'],
)
kaggle_connection = StdioConnectionParams(
    server_params=kaggle_server_params,
    timeout=60,
)
# Only these four Kaggle tools are exposed to the agent.
kaggle_tool_names = [
    "search_datasets",
    "get_dataset_info",
    "list_dataset_files",
    "download_dataset",
]
mcp_kaggle_server = McpToolset(
    connection_params=kaggle_connection,
    tool_filter=kaggle_tool_names,
)

# Data-ingestion agent: finds a dataset and its details through the Kaggle MCP tools.
ingestion_model = Gemini(model="gemini-2.5-flash", retry_options=retry_config)
ingest_agent = Agent(
    name="DataIngestion_agent",
    model=ingestion_model,
    instruction=Ingestion_instruction,
    tools=[mcp_kaggle_server],
    output_key="Dataset_files",
)


async def run_ingestion(
    query: str = "Find a Kaggle dataset about Netflix movie ratings.",
) -> None:
    """Run the data-ingestion agent once and print what the runner returns.

    A new ``InMemoryRunner`` is created around ``ingest_agent`` on every call.

    Args:
        query: Natural-language request sent to the agent. The default is the
            Netflix-ratings search that used to be hard-coded here, so
            existing zero-argument calls behave exactly as before.
    """
    runner = InMemoryRunner(agent=ingest_agent)
    response = await runner.run_debug(query)
    print(response)

# Top-level `await`: valid here because this code runs as a notebook cell.
await run_ingestion()

# NOTE(review): this prints after the agent has already been run, not merely created.
print("✅ Ingest_agent created.")


 ### Created new session: debug_session_id

User > Find a Kaggle dataset about Netflix movie ratings.


  super().__init__(


DataIngestion_agent > ```json
{
  "datasets": [
    {
      "title": "Netflix Movie Ratings",
      "dataset_ref": "evanschreiner/netflix-movie-ratings",
      "description": "**Context**\nNetflix held the Netflix Prize open competition for the best algorithm to predict user ratings for films. The grand prize was $1,000,000 and was won by BellKor's Pragmatic Chaos team. This is the dataset that was used in that competition.\n\nThe purpose of this dataset is to provide ``csv`` files that can be read directly into a Pandas DataFrame. The procedure detailing how these csv files were created can be found in [this](https://www.kaggle.com/code/evanschreiner/cleaning-netflix-data) notebook.\n\nThe ``csv`` files are in turn based on data that was first uploaded to Kaggle [here](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data).\n\n**Netflix_User_Ratings.csv**\n\nThe first line in this file contains the headers for each column, CustID, Rating, Date, and MovieID, respectively. Each