Data Detective Agent — Kaggle 5‑Day AI Agents Intensive Capstone

Authors: Md. Karaamathullah Sheriff & Sri Sai Vatsan R.


In [1]:
# Fetch the secret named "GOOGLE_API_KEY" through Kaggle's user-secrets client.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# NOTE(review): the retrieved value is stored but not used by any code visible
# in this notebook (nothing passes it to `genai`) — confirm whether a later
# cell is expected to consume it.
secret_value_0 = user_secrets.get_secret("GOOGLE_API_KEY")


![image.png](attachment:4cdd1efa-cea8-443c-9f29-2758e611559b.png)


In [3]:
from dataclasses import dataclass, asdict
from typing import List, Dict, Any
import json, os, textwrap, uuid
from pathlib import Path

In [1]:
# Install required packages
# !pip install -q google-generativeai pandas matplotlib

import os
import json
from datetime import datetime
from typing import Dict, List
# import pandas as pd 
# import matplotlib.pyplot as plt
import google.generativeai as genai 

# Configure Gemini API
# NOTE: the configuration below is only a commented-out placeholder, so `genai`
# is imported but never configured in this cell. The agent classes defined in
# this cell return hard-coded (simulated) results and do not call Gemini.
# try:
#     # ... (Kaggle Secrets configuration as in your PDF)
# except:
#     print("(Note: Add GOOGLE_API_KEY for full functionality)")
#     print("This demo will use simulated responses")

# Printed unconditionally; it does not verify that an API key is available.
print("Environment setup complete!")

# --- Agent Classes (Simulated for template) ---

class DataCleanerAgent:
    """Simulated data-cleaning agent (placeholder for a Gemini-backed cleaner)."""

    def parse_data(self, raw_data_text: str) -> Dict:
        """Announce the cleaning step and return a canned cleaning summary.

        Args:
            raw_data_text: Raw dataset text. Only its length is used, for the
                progress message; the content is not parsed.

        Returns:
            A dict with the keys ``cleaned_columns``, ``rows`` and ``status``.
            All values are hard-coded placeholders.
        """
        char_count = len(raw_data_text)
        print(f"[Data Cleaner Agent]: Analyzing raw data ({char_count} chars)...")

        # A real implementation would ask Gemini to identify and clean columns.
        summary = dict(
            cleaned_columns=["CustomerID", "Age", "Revenue", "Country"],
            rows=1000,
            status="Cleaned (3 issues resolved)",
        )
        return summary

class AnalysisAgent:
    """Simulated statistical-analysis agent."""

    def run_analysis(self, cleaned_data: Dict, business_question: str) -> Dict:
        """Announce the analysis step and return canned findings.

        Args:
            cleaned_data: Output of the cleaning step. Not inspected by this
                simulated implementation.
            business_question: The question being investigated; echoed in the
                progress message only.

        Returns:
            A dict with the keys ``key_insight``, ``correlation`` and
            ``stat_results``. All values are hard-coded placeholders.
        """
        progress = f"[Analysis Agent]: Running analysis for: '{business_question}'..."
        print(progress)

        # A real implementation would run statistical libraries (e.g. pandas)
        # through a code-execution tool.
        findings = {}
        findings["key_insight"] = "Age group 25-35 has 40% higher average revenue."
        findings["correlation"] = "Revenue is positively correlated with Age (r=0.65)."
        findings["stat_results"] = "p-value for Age vs. Revenue: 0.001"
        return findings

class ReportAgent:
    """Simulated report-writing agent."""

    def generate_report(self, analysis_results: Dict, cleaned_data: Dict) -> str:
        """Announce the reporting step and return a templated report.

        Args:
            analysis_results: Analysis output; must contain ``key_insight``,
                which is embedded in the report (``KeyError`` if missing).
            cleaned_data: Output of the cleaning step. Not used by this
                simulated implementation.

        Returns:
            The report text. Each non-empty line keeps an eight-space indent
            and the text starts with a newline, exactly as the template does.
        """
        print("[Report Agent]: Synthesizing final business report...")

        # A real implementation would have Gemini structure the report and
        # produce the recommendations.
        pad = " " * 8
        insight = analysis_results['key_insight']
        report_lines = [
            "",
            pad + "# Business Insight Report: Q3 Revenue Analysis",
            pad + "## Key Finding",
            f"{pad}{insight}",
            pad,
            pad + "## Recommendation",
            pad + "Target marketing campaigns at the 25-35 age bracket in key markets.",
            "",
            pad + "... (Full report)",
            pad,
        ]
        return "\n".join(report_lines)

# --- Multi-Agent Orchestrator ---

class DataDetectiveOrchestrator:
    """Coordinates all agents in the data analysis workflow.

    The pipeline runs three steps in order: cleaning, analysis, reporting.
    ``execution_log`` accumulates one ``{"step", "time"}`` entry per executed
    step across every ``process_data`` call made on this instance.
    """

    def __init__(self):
        self.cleaner = DataCleanerAgent()
        self.analyzer = AnalysisAgent()
        self.reporter = ReportAgent()
        self.execution_log = []
        print("Data Detective Orchestrator created successfully!")

    def _run_step(self, step_name, step_fn, *args):
        """Run ``step_fn(*args)``, log its duration under ``step_name``, return its result.

        The log entry is appended only when the step returns; if the step
        raises, the exception propagates and nothing is logged for it.
        """
        start_time = datetime.now()
        outcome = step_fn(*args)
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        self.execution_log.append({"step": step_name, "time": elapsed_seconds})
        return outcome

    def process_data(self, raw_data_text: str, business_question: str) -> Dict:
        """Run the full clean -> analyze -> report pipeline on one dataset.

        Args:
            raw_data_text: Raw dataset text handed to the cleaning agent.
            business_question: Question handed to the analysis agent.

        Returns:
            A dict with the keys:
              * ``analysis_result``: the analysis agent's output.
              * ``final_report``: the report agent's output.
              * ``execution_log``: a snapshot (copy) of the cumulative step
                log at the time this call finished.
              * ``timestamp``: ISO-8601 completion time (local, naive).

        Raises:
            KeyError: if the cleaning result lacks ``status`` or the analysis
                result lacks ``key_insight`` (both are echoed in progress
                messages).
        """
        print("\n=== PROCESSING DATASET ===")

        # Step 1: data cleaning.
        cleaned_data = self._run_step("Cleaning", self.cleaner.parse_data, raw_data_text)
        print(f"| Step 1 Complete: Data Status - {cleaned_data['status']} |")

        # Step 2: analysis of the cleaned data against the business question.
        analysis_result = self._run_step(
            "Analysis", self.analyzer.run_analysis, cleaned_data, business_question
        )
        print(f"| Step 2 Complete: Insight - {analysis_result['key_insight']} |")

        # Step 3: report generation.
        final_report = self._run_step(
            "Reporting", self.reporter.generate_report, analysis_result, cleaned_data
        )

        print("\n=== PROCESSING COMPLETE ===")
        return {
            "analysis_result": analysis_result,
            "final_report": final_report,
            # Return a copy: handing out the live list would let later
            # process_data calls silently grow the log inside this result.
            "execution_log": list(self.execution_log),
            "timestamp": datetime.now().isoformat()
        }

# --- Demo Execution ---

# Tiny illustrative CSV excerpt standing in for a much larger dataset.
sample_data_text = "\n".join(
    [
        "",
        "CustomerID,Age,Revenue,Country",
        "1,28,150.00,USA",
        "2,45,210.50,CAN",
        "... (thousands of rows)",
        "",
    ]
)
job_to_be_done = (
    "Identify the most profitable customer segment "
    "and recommend a target strategy."
)

# Build the orchestrator and push the sample through the whole pipeline.
orchestrator = DataDetectiveOrchestrator()
result = orchestrator.process_data(
    raw_data_text=sample_data_text,
    business_question=job_to_be_done,
)

# Show the generated report, followed by the closing banner.
print("\n--- RECOMMENDED BUSINESS REPORT ---", result["final_report"], sep="\n")
print("\n\nDemo Complete! The Data Detective AI Agent successfully analyzed the data.")

Environment setup complete!
Data Detective Orchestrator created successfully!

=== PROCESSING DATASET ===
[Data Cleaner Agent]: Analyzing raw data (88 chars)...
| Step 1 Complete: Data Status - Cleaned (3 issues resolved) |
[Analysis Agent]: Running analysis for: 'Identify the most profitable customer segment and recommend a target strategy.'...
| Step 2 Complete: Insight - Age group 25-35 has 40% higher average revenue. |
[Report Agent]: Synthesizing final business report...

=== PROCESSING COMPLETE ===

--- RECOMMENDED BUSINESS REPORT ---

        # Business Insight Report: Q3 Revenue Analysis
        ## Key Finding
        Age group 25-35 has 40% higher average revenue.
        
        ## Recommendation
        Target marketing campaigns at the 25-35 age bracket in key markets.

        ... (Full report)
        


Demo Complete! The Data Detective AI Agent successfully analyzed the data.


Agent Implementations