# Incident AI Assistant.

**Problem** Site Reliability Engineers (SREs) and DevOps teams waste hours manually investigating **production incidents**. They search through logs, telemetry, and historical incidents to find similar problems and resolutions. This delays fixes, causes downtime, and leads to revenue loss.

**Solution**: Use **BigQuery Vector Search** to **semantically search historical incidents** and telemetry patterns.

* When a new incident happens, instantly fetch **similar past issues** and their fixes.
* Use **Gemini** to **summarize possible root causes** and generate a **step-by-step resolution plan**.



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Preparation.

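As not much real production incident data is publicly available, this notebook generates a synthetic dataset of incidents with correlated telemetry.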

In [None]:
# SRE Incident & Telemetry Data Generator
# Generate realistic production incident data with correlated telemetry patterns
# Perfect for BigQuery AI hackathon - semantic search + incident resolution

import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import random
import uuid
from typing import List, Dict, Tuple
import os

# =============================================================================
# INCIDENT TEMPLATES - Based on Real Production Scenarios
# =============================================================================

# Catalogue of incident scenarios, keyed by a short template name.
#
# Every template carries the same fields:
#   title, description, root_cause  - free-text strings
#   technologies, symptoms,
#   resolution_steps, affected_services - lists of strings
#   impact_level                    - default severity label (IncidentGenerator
#                                     replaces it with a random severity)
#   telemetry_patterns              - per-phase ("pre_incident",
#                                     "during_incident", "resolution") mapping
#                                     of metric key -> (min, max) value range
#
# TelemetryGenerator only consults a range when it can tie the key to one of
# the metrics it emits; keys such as "restarts", "io_wait" and "query_time"
# correspond to no emitted metric, so they have no effect on the generated
# telemetry.
# NOTE(review): the short keys "cpu", "memory" and "connections" differ from
# the emitted metric names "cpu_usage", "memory_usage" and
# "active_connections" - verify that the generator maps them.
INCIDENT_TEMPLATES = {
    # Connection pool saturates under load; requests time out.
    "database_connection_pool": {
        "title": "Database Connection Pool Exhaustion",
        "description": "Application experiencing timeouts due to database connection pool being exhausted. Users unable to complete transactions.",
        "technologies": ["PostgreSQL", "Java", "Spring Boot", "Connection Pool"],
        "symptoms": ["Connection timeouts", "HTTP 500 errors", "Slow response times"],
        "root_cause": "High traffic spike overwhelmed connection pool settings",
        "resolution_steps": [
            "Identified connection pool exhaustion in application logs",
            "Temporarily increased max_connections in database",
            "Restarted application servers to clear stale connections", 
            "Updated connection pool configuration to handle traffic spikes",
            "Implemented connection pool monitoring alerts"
        ],
        "impact_level": "Critical",
        "affected_services": ["payment-api", "user-service", "web-app"],
        "telemetry_patterns": {
            "pre_incident": {"cpu": (40, 60), "memory": (60, 70), "connections": (80, 95)},
            "during_incident": {"cpu": (20, 30), "memory": (70, 80), "connections": (98, 100)},
            "resolution": {"cpu": (45, 55), "memory": (65, 75), "connections": (30, 50)}
        }
    },
    
    # Memory grows until the service is OOM-killed and restarted.
    "memory_leak": {
        "title": "Memory Leak in Payment Service",
        "description": "Payment service consuming increasing memory over time, leading to OOM kills and service restarts.",
        "technologies": ["Node.js", "Kubernetes", "Docker", "MongoDB"],
        "symptoms": ["High memory usage", "Frequent pod restarts", "Payment failures"],
        "root_cause": "Memory leak in payment processing logic - objects not being garbage collected",
        "resolution_steps": [
            "Analyzed heap dumps to identify memory leak source",
            "Found unclosed database connections in payment handler",
            "Applied hotfix to properly close connections",
            "Deployed updated service with memory limits",
            "Added memory usage monitoring and alerting"
        ],
        "impact_level": "High",
        "affected_services": ["payment-service", "order-service"],
        "telemetry_patterns": {
            "pre_incident": {"cpu": (30, 50), "memory": (50, 70), "restarts": (0, 1)},
            "during_incident": {"cpu": (60, 80), "memory": (90, 100), "restarts": (5, 15)},
            "resolution": {"cpu": (35, 45), "memory": (40, 60), "restarts": (0, 1)}
        }
    },
    
    # Log server disk fills up; ingestion and search degrade.
    "disk_space_full": {
        "title": "Disk Space Exhaustion on Log Server",
        "description": "Log aggregation server running out of disk space, causing log ingestion failures and service degradation.",
        "technologies": ["Elasticsearch", "Logstash", "Kibana", "Linux"],
        "symptoms": ["Log ingestion failures", "Disk I/O errors", "Search queries failing"],
        "root_cause": "Log retention policy not properly configured, old logs accumulating",
        "resolution_steps": [
            "Identified full disk partitions on log servers",
            "Cleaned up old log indices to free immediate space",
            "Implemented automated log retention policies",
            "Added disk usage monitoring and alerts",
            "Scaled storage capacity for log infrastructure"
        ],
        "impact_level": "Medium",
        "affected_services": ["logging-service", "monitoring-stack", "search-api"],
        "telemetry_patterns": {
            "pre_incident": {"cpu": (20, 40), "disk_usage": (70, 85), "io_wait": (5, 10)},
            "during_incident": {"cpu": (80, 100), "disk_usage": (95, 100), "io_wait": (30, 60)},
            "resolution": {"cpu": (25, 35), "disk_usage": (40, 60), "io_wait": (2, 8)}
        }
    },
    
    # External payment provider throttles requests (HTTP 429).
    "api_rate_limit": {
        "title": "Third-party API Rate Limiting",
        "description": "External payment gateway implementing rate limits, causing transaction failures during peak hours.",
        "technologies": ["REST API", "Payment Gateway", "Redis", "Rate Limiting"],
        "symptoms": ["Payment failures", "HTTP 429 errors", "Customer complaints"],
        "root_cause": "Payment provider implemented new rate limits without notification",
        "resolution_steps": [
            "Identified HTTP 429 responses from payment provider",
            "Implemented exponential backoff retry logic",
            "Added Redis-based request queuing system",
            "Configured circuit breaker for payment failures",
            "Set up monitoring for external API response codes"
        ],
        "impact_level": "High",
        "affected_services": ["payment-gateway", "checkout-service", "billing-api"],
        "telemetry_patterns": {
            "pre_incident": {"response_time": (200, 300), "error_rate": (0.1, 0.5), "success_rate": (99.0, 99.8)},
            "during_incident": {"response_time": (2000, 5000), "error_rate": (15, 30), "success_rate": (70, 85)},
            "resolution": {"response_time": (250, 400), "error_rate": (0.2, 1.0), "success_rate": (98.5, 99.5)}
        }
    },
    
    # Unindexed query drives database CPU to saturation.
    "cpu_spike": {
        "title": "CPU Spike Due to Inefficient Query",
        "description": "Database experiencing high CPU usage due to poorly optimized query causing performance degradation across all services.",
        "technologies": ["MySQL", "Database", "Query Optimization", "Indexing"],
        "symptoms": ["High database CPU", "Slow query responses", "Application timeouts"],
        "root_cause": "New feature deployed with unoptimized database query missing proper indexes",
        "resolution_steps": [
            "Identified slow queries using database performance tools",
            "Found missing index on frequently queried table",
            "Created appropriate database indexes",
            "Optimized query execution plan",
            "Implemented query performance monitoring"
        ],
        "impact_level": "Medium",
        "affected_services": ["user-service", "product-catalog", "search-api"],
        "telemetry_patterns": {
            "pre_incident": {"cpu": (30, 50), "query_time": (50, 100), "active_connections": (20, 40)},
            "during_incident": {"cpu": (90, 100), "query_time": (2000, 8000), "active_connections": (80, 100)},
            "resolution": {"cpu": (35, 55), "query_time": (60, 120), "active_connections": (25, 45)}
        }
    }
}

# =============================================================================
# DATA GENERATION CLASSES
# =============================================================================

class TelemetryGenerator:
    """Generate synthetic per-minute telemetry for the window around an incident.

    Each sample is a plain dict. A metric's value range comes from the incident
    template when the template names that metric for the current phase, and
    from the built-in per-phase defaults otherwise.
    """

    # Templates use short keys for some metrics; map them to the metric names
    # actually emitted so that those template ranges are applied rather than
    # silently ignored.
    _METRIC_ALIASES = {
        "cpu": "cpu_usage",
        "memory": "memory_usage",
        "connections": "active_connections",
    }

    # Metrics treated as percentages (their configured ranges top out at 100),
    # so the added noise is capped at 100 as well as floored at 0.
    _PERCENT_METRICS = frozenset(
        {"cpu_usage", "memory_usage", "disk_usage", "error_rate", "success_rate"}
    )

    # Timeline geometry, all relative to the incident timestamp.
    _WINDOW_HALF_WIDTH = timedelta(hours=2)
    _SAMPLE_INTERVAL = timedelta(minutes=1)
    _DURING_STARTS_BEFORE = timedelta(minutes=30)
    _RESOLUTION_STARTS_AFTER = timedelta(minutes=45)

    # Fallback (min, max) ranges per phase for metrics a template does not set.
    _DEFAULT_RANGES = {
        "pre_incident": {
            "cpu_usage": (20, 60), "memory_usage": (40, 70), "disk_usage": (30, 70),
            "response_time": (100, 500), "error_rate": (0.1, 2.0), "throughput": (100, 500),
            "success_rate": (95, 99.5), "active_users": (1000, 5000)
        },
        "during_incident": {
            "cpu_usage": (70, 100), "memory_usage": (80, 100), "disk_usage": (60, 95),
            "response_time": (1000, 10000), "error_rate": (5, 25), "throughput": (10, 100),
            "success_rate": (60, 90), "active_users": (500, 2000)
        },
        "resolution": {
            "cpu_usage": (25, 65), "memory_usage": (45, 75), "disk_usage": (35, 75),
            "response_time": (150, 600), "error_rate": (0.5, 3.0), "throughput": (120, 600),
            "success_rate": (92, 99), "active_users": (800, 4500)
        }
    }

    def __init__(self):
        self.metric_types = {
            "infrastructure": ["cpu_usage", "memory_usage", "disk_usage", "network_io", "load_average"],
            "application": ["response_time", "error_rate", "throughput", "active_connections", "queue_depth"],
            "business": ["transactions_per_minute", "success_rate", "revenue_per_hour", "active_users"]
        }

    def generate_metric_timeline(self, incident_time: datetime, template: Dict) -> List[Dict]:
        """Generate telemetry samples for every affected service around an incident.

        One sample per metric per service is produced for every minute from two
        hours before ``incident_time`` to two hours after it (inclusive).

        Args:
            incident_time: Timestamp the incident is centred on.
            template: Incident template; ``affected_services`` is required and
                ``telemetry_patterns`` supplies optional per-phase ranges.

        Returns:
            List of dicts with keys ``timestamp``, ``service_name``,
            ``metric_category``, ``metric_name``, ``metric_value``,
            ``incident_id`` (always ``None`` here; the caller fills it in) and
            ``phase``.
        """
        patterns = self._resolve_patterns(template.get("telemetry_patterns", {}))
        services = template["affected_services"]

        # Flatten the category -> names mapping once; order matches the
        # original nested iteration so the sample order is unchanged.
        catalogue = [
            (category, name)
            for category, names in self.metric_types.items()
            for name in names
        ]

        timeline = []
        current_time = incident_time - self._WINDOW_HALF_WIDTH
        end_time = incident_time + self._WINDOW_HALF_WIDTH

        while current_time <= end_time:
            phase = self._phase_for(current_time, incident_time)
            # A template that omits a phase simply falls back to the defaults.
            phase_ranges = patterns.get(phase, {})

            for service in services:
                for metric_category, metric_name in catalogue:
                    if metric_name in phase_ranges:
                        min_val, max_val = phase_ranges[metric_name]
                    else:
                        min_val, max_val = self._get_default_range(metric_name, phase)

                    # Uniform draw inside the range plus Gaussian jitter worth
                    # 10% of the range width; never negative.
                    value = random.uniform(min_val, max_val)
                    value = max(0.0, value + random.gauss(0, (max_val - min_val) * 0.1))
                    if metric_name in self._PERCENT_METRICS:
                        value = min(value, 100.0)

                    timeline.append({
                        "timestamp": current_time,
                        "service_name": service,
                        "metric_category": metric_category,
                        "metric_name": metric_name,
                        "metric_value": round(value, 2),
                        "incident_id": None,  # Will be set later
                        "phase": phase
                    })

            current_time += self._SAMPLE_INTERVAL

        return timeline

    @classmethod
    def _resolve_patterns(cls, telemetry_patterns: Dict) -> Dict:
        """Return per-phase ranges with short template keys mapped to metric names.

        Keys already spelled as a full metric name take precedence over an
        aliased short key for the same metric.
        """
        resolved = {}
        for phase, ranges in telemetry_patterns.items():
            aliased = {
                cls._METRIC_ALIASES[key]: bounds
                for key, bounds in ranges.items()
                if key in cls._METRIC_ALIASES
            }
            resolved[phase] = {**aliased, **ranges}
        return resolved

    @classmethod
    def _phase_for(cls, current_time: datetime, incident_time: datetime) -> str:
        """Classify a timestamp as pre_incident, during_incident or resolution."""
        if current_time < incident_time - cls._DURING_STARTS_BEFORE:
            return "pre_incident"
        if current_time < incident_time + cls._RESOLUTION_STARTS_AFTER:
            return "during_incident"
        return "resolution"

    def _get_default_range(self, metric_name: str, phase: str) -> Tuple[float, float]:
        """Return the fallback (min, max) range for a metric in a phase.

        Unknown phases or metrics yield the generic range ``(10, 90)``.
        """
        return self._DEFAULT_RANGES.get(phase, {}).get(metric_name, (10, 90))

class IncidentGenerator:
    """Build synthetic incident records together with their telemetry."""

    def __init__(self):
        self.telemetry_gen = TelemetryGenerator()

    def generate_incidents(self, num_incidents: int = 100) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Create ``num_incidents`` incidents and the telemetry around each one.

        Incident start times are drawn uniformly from the 180 days before now.
        Each incident uses a randomly chosen template, gets a random severity
        (replacing the template's own), and a resolution time drawn from a
        severity-dependent range.

        Returns:
            ``(incidents_df, telemetry_df)``; every telemetry row carries the
            ``incident_id`` of the incident it was generated for.
        """
        incident_rows = []
        telemetry_rows = []

        # Sampling window: the last 180 days, ending now.
        window_end = datetime.now()
        window_start = window_end - timedelta(days=180)
        window_seconds = int((window_end - window_start).total_seconds())

        severities = ["Critical", "High", "Medium", "Low"]
        # Minutes-to-resolve bounds per severity.
        resolution_bounds = {
            "Critical": (15, 120),
            "High": (30, 240),
            "Medium": (60, 480),
            "Low": (120, 1440),
        }

        for count in range(1, num_incidents + 1):
            started_at = window_start + timedelta(seconds=random.randint(0, window_seconds))

            # Shallow copy so overriding the severity leaves the shared template intact.
            kind = random.choice(list(INCIDENT_TEMPLATES.keys()))
            scenario = INCIDENT_TEMPLATES[kind].copy()

            ident = f"INC-{str(uuid.uuid4())[:8].upper()}"

            severity = random.choice(severities)
            scenario["impact_level"] = severity

            fastest, slowest = resolution_bounds[severity]
            resolved_at = started_at + timedelta(minutes=random.randint(fastest, slowest))
            elapsed = resolved_at - started_at

            incident_rows.append({
                "incident_id": ident,
                "title": scenario["title"],
                "description": scenario["description"],
                "impact_level": severity,
                "technologies": json.dumps(scenario["technologies"]),
                "affected_services": json.dumps(scenario["affected_services"]),
                "symptoms": json.dumps(scenario["symptoms"]),
                "root_cause": scenario["root_cause"],
                "resolution_steps": json.dumps(scenario["resolution_steps"]),
                "incident_time": started_at,
                "resolution_time": resolved_at,
                "duration_minutes": int(elapsed.total_seconds() / 60),
                "template_type": kind
            })

            # Stamp each telemetry sample with the incident it belongs to.
            samples = self.telemetry_gen.generate_metric_timeline(started_at, scenario)
            for sample in samples:
                sample["incident_id"] = ident
            telemetry_rows.extend(samples)

            if count % 20 == 0:
                print(f"Generated {count}/{num_incidents} incidents...")

        return pd.DataFrame(incident_rows), pd.DataFrame(telemetry_rows)

# =============================================================================
# EXPORT AND VALIDATION FUNCTIONS
# =============================================================================

def save_datasets(incidents_df: pd.DataFrame, telemetry_df: pd.DataFrame, output_dir: str = "sre_dataset"):
    """Write both tables as CSV plus a JSON data dictionary into ``output_dir``.

    The directory is created if needed. Three files are produced:
    ``incidents.csv``, ``telemetry.csv`` and ``data_dictionary.json``; a
    confirmation line is printed after each one.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Each table is written in a single to_csv call, incidents first.
    exports = (
        (incidents_df, "incidents.csv", "incidents"),
        (telemetry_df, "telemetry.csv", "telemetry points"),
    )
    for frame, file_name, label in exports:
        target = f"{output_dir}/{file_name}"
        frame.to_csv(target, index=False)
        print(f"✅ Saved {len(frame)} {label} to {target}")

    # Human-readable description of every column in both CSV files.
    incidents_schema = {
        "incident_id": "Unique identifier for incident",
        "title": "Brief incident title",
        "description": "Detailed incident description",
        "impact_level": "Critical, High, Medium, Low",
        "technologies": "JSON array of involved technologies",
        "affected_services": "JSON array of impacted services",
        "symptoms": "JSON array of observed symptoms",
        "root_cause": "Root cause analysis",
        "resolution_steps": "JSON array of resolution steps",
        "incident_time": "When incident started",
        "resolution_time": "When incident was resolved",
        "duration_minutes": "Total incident duration",
        "template_type": "Which incident pattern this follows"
    }
    telemetry_schema = {
        "timestamp": "Metric collection time",
        "service_name": "Name of service being monitored",
        "metric_category": "infrastructure, application, or business",
        "metric_name": "Specific metric name",
        "metric_value": "Metric value",
        "incident_id": "Related incident ID",
        "phase": "pre_incident, during_incident, or resolution"
    }
    dataset_info = {
        "name": "SRE Incident Intelligence Dataset",
        "version": "1.0",
        "created": datetime.now().isoformat(),
        "description": "Synthetic but realistic SRE incident data with correlated telemetry for AI/ML training"
    }
    data_dict = {
        "dataset_info": dataset_info,
        "incidents_schema": incidents_schema,
        "telemetry_schema": telemetry_schema
    }

    dict_path = f"{output_dir}/data_dictionary.json"
    serialized = json.dumps(data_dict, indent=2, default=str)
    with open(dict_path, 'w') as handle:
        handle.write(serialized)
    print(f"✅ Saved data dictionary to {dict_path}")

def validate_dataset(incidents_df: pd.DataFrame, telemetry_df: pd.DataFrame):
    """Print a summary report for the generated incident and telemetry tables.

    The report covers incident counts, severity distribution and mean
    duration; telemetry volume, services, metrics and time range; and how many
    incidents have telemetry attached. Empty tables are reported as such
    instead of raising (a column-less empty frame has no ``template_type`` to
    index, and zero incidents would otherwise divide by zero).

    Args:
        incidents_df: Incident table with ``incident_id``, ``template_type``,
            ``impact_level`` and ``duration_minutes`` columns.
        telemetry_df: Telemetry table with ``incident_id``, ``service_name``,
            ``metric_name`` and ``timestamp`` columns.
    """
    print("\n📊 DATASET VALIDATION REPORT")
    print("=" * 50)

    # Incidents validation
    print("📋 INCIDENTS:")
    print(f"  Total incidents: {len(incidents_df)}")
    if incidents_df.empty:
        print("  No incidents to summarise.")
        incident_ids_total = 0
    else:
        print(f"  Incident types: {incidents_df['template_type'].nunique()}")
        print("  Severity distribution:")
        for severity, count in incidents_df['impact_level'].value_counts().items():
            print(f"    {severity}: {count}")
        print(f"  Average resolution time: {incidents_df['duration_minutes'].mean():.1f} minutes")
        incident_ids_total = incidents_df['incident_id'].nunique()

    # Telemetry validation
    print("\n📊 TELEMETRY:")
    print(f"  Total data points: {len(telemetry_df)}")
    if telemetry_df.empty:
        print("  No telemetry to summarise.")
        incident_ids_in_telemetry = 0
    else:
        print(f"  Services monitored: {telemetry_df['service_name'].nunique()}")
        print(f"  Metric types: {telemetry_df['metric_name'].nunique()}")
        print(f"  Time range: {telemetry_df['timestamp'].min()} to {telemetry_df['timestamp'].max()}")
        incident_ids_in_telemetry = telemetry_df['incident_id'].nunique()

    # Relationship validation
    print("\n🔗 RELATIONSHIPS:")
    print(f"  Incidents with telemetry: {incident_ids_in_telemetry}/{incident_ids_total}")
    if incident_ids_total:
        print(f"  Avg telemetry points per incident: {len(telemetry_df) / incident_ids_total:.0f}")
    else:
        # No incidents means there is no meaningful per-incident average.
        print("  Avg telemetry points per incident: n/a")

# =============================================================================
# MAIN GENERATION FUNCTION
# =============================================================================

def generate_complete_dataset(num_incidents: int = 100, output_dir: str = "sre_dataset"):
    """Generate, validate and save the complete SRE incident dataset.

    Args:
        num_incidents: Number of incidents to generate.
        output_dir: Directory handed to ``save_datasets``; defaults to the
            same ``"sre_dataset"`` folder that ``save_datasets`` uses.

    Returns:
        ``(incidents_df, telemetry_df)`` as produced by
        ``IncidentGenerator.generate_incidents``.
    """
    print("🚀 SRE INCIDENT DATASET GENERATOR")
    print("=" * 50)
    print(f"Generating {num_incidents} realistic incidents with telemetry...")

    # Generate data
    generator = IncidentGenerator()
    incidents_df, telemetry_df = generator.generate_incidents(num_incidents)

    # Validate before saving so the report reflects exactly what gets written.
    validate_dataset(incidents_df, telemetry_df)

    # Save data
    save_datasets(incidents_df, telemetry_df, output_dir=output_dir)

    print("\n✅ DATASET GENERATION COMPLETE!")
    print("\n🎯 READY FOR BIGQUERY:")
    print("1. Upload incidents.csv and telemetry.csv to BigQuery")
    print("2. Generate embeddings for incident descriptions")
    print("3. Implement vector search for similar incidents")
    print("4. Use Gemini to generate resolution recommendations")

    return incidents_df, telemetry_df

if __name__ == "__main__":
    # Demo-sized run: 150 incidents; both resulting frames stay in scope.
    incidents, telemetry = generate_complete_dataset(num_incidents=150)