## Building a Production Error Handling Framework

In [1]:
# Production Error Handling Framework
import asyncio
import time
import logging
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Callable
from dataclasses import dataclass, field
from enum import Enum
import traceback
from contextlib import asynccontextmanager
import threading
from collections import defaultdict, deque

# Structured logging: timestamp, logger name, level, then the message
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Session banner written in one call; the trailing empty entry yields the blank line
print("\n".join([
    "ERROR HANDLING & PRODUCTION MONITORING",
    "=" * 45,
    f"Session: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "Focus: Enterprise reliability and observability",
    "",
]))

class ErrorSeverity(Enum):
    """Error severity levels for proper escalation.

    ErrorTracker.log_error maps each member to a standard-library logging
    level when it emits the log record for an error.
    """
    LOW = "low"  # logged at logging.INFO
    MEDIUM = "medium"  # logged at logging.WARNING; log_error's default severity
    HIGH = "high"  # logged at logging.ERROR
    CRITICAL = "critical"  # logged at logging.CRITICAL

@dataclass
class ErrorEvent:
    """Structured error event for tracking and analysis.

    Instances are built by ErrorTracker.log_error; the per-field notes below
    describe what that method stores in each field.
    """
    error_id: str  # first 8 characters of a uuid4 string
    timestamp: str  # datetime.now().isoformat() taken when the error is logged
    severity: ErrorSeverity
    component: str  # caller-supplied name of the component reporting the error
    error_type: str  # class name of the exception, i.e. type(error).__name__
    message: str  # str(error)
    stack_trace: Optional[str] = None  # traceback text captured by log_error
    context: Dict[str, Any] = field(default_factory=dict)  # extra caller-supplied data; a fresh dict per instance by default
    resolved: bool = False  # nothing in the visible code ever sets this to True

class ErrorTracker:
    """Production error tracking and analysis.

    Attributes:
        errors: Every ErrorEvent logged so far, oldest first. Never trimmed.
        error_counts: Occurrence count per "<component>:<error_type>" key.
        recent_errors: The 100 most recently logged events.
    """

    def __init__(self):
        self.errors = []
        self.error_counts = defaultdict(int)
        self.recent_errors = deque(maxlen=100)

    def log_error(self, component: str, error: Exception, severity: ErrorSeverity = ErrorSeverity.MEDIUM, context: Optional[Dict[str, Any]] = None) -> str:
        """Record an error as a structured event and emit a log record for it.

        Args:
            component: Name of the component reporting the error.
            error: The exception to record. It does not need to have been raised.
            severity: Determines the logging level of the emitted record.
            context: Optional extra data stored on the event; defaults to an empty dict.

        Returns:
            The short (8-character) id assigned to the event.
        """
        import uuid

        error_id = str(uuid.uuid4())[:8]
        if context is None:
            context = {}

        # Format the traceback carried by the exception object itself.
        # traceback.format_exc() only describes the exception currently being
        # handled, so it yields "NoneType: None" when log_error is called
        # outside an ``except`` block with a freshly constructed exception.
        stack_trace = "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )

        error_event = ErrorEvent(
            error_id=error_id,
            timestamp=datetime.now().isoformat(),
            severity=severity,
            component=component,
            error_type=type(error).__name__,
            message=str(error),
            stack_trace=stack_trace,
            context=context
        )

        self.errors.append(error_event)
        self.recent_errors.append(error_event)
        self.error_counts[f"{component}:{error_event.error_type}"] += 1

        # Log with appropriate level
        log_level = {
            ErrorSeverity.LOW: logging.INFO,
            ErrorSeverity.MEDIUM: logging.WARNING,
            ErrorSeverity.HIGH: logging.ERROR,
            ErrorSeverity.CRITICAL: logging.CRITICAL
        }[severity]

        logger.log(log_level, f"Error {error_id} in {component}: {error_event.message}")

        return error_id

    def get_error_summary(self) -> Dict[str, Any]:
        """Get error analytics summary.

        Returns:
            A dict with:
              - "total_errors": number of events logged since creation.
              - "recent_errors_1h": events from the last hour, counted among
                the 100 most recent events only.
              - "severity_breakdown": count per severity value over the 100
                most recent events.
              - "top_error_types": the five "<component>:<error_type>" keys
                with the highest counts, most frequent first.
        """
        total_errors = len(self.errors)

        # Compute the cutoff once so every event is compared to the same instant
        one_hour_ago = datetime.now() - timedelta(hours=1)
        recent_count = sum(
            1 for event in self.recent_errors
            if datetime.fromisoformat(event.timestamp) > one_hour_ago
        )

        severity_counts = defaultdict(int)
        for event in self.recent_errors:
            severity_counts[event.severity.value] += 1

        # Rank by count; sorted() is stable, so ties keep first-seen order
        top_error_types = sorted(
            self.error_counts.items(), key=lambda item: item[1], reverse=True
        )[:5]

        return {
            "total_errors": total_errors,
            "recent_errors_1h": recent_count,
            "severity_breakdown": dict(severity_counts),
            "top_error_types": dict(top_error_types)
        }

# Module-level tracker instance that the other components in this notebook report to
error_tracker = ErrorTracker()

# Status summary written in one call
print("\n".join([
    "Error tracking framework initialized:",
    "Structured error logging with severity levels",
    "Error analytics and trend analysis",
    "Context capture for debugging",
]))

ERROR HANDLING & PRODUCTION MONITORING
Session: 2025-07-28 23:43:57
Focus: Enterprise reliability and observability

Error tracking framework initialized:
Structured error logging with severity levels
Error analytics and trend analysis
Context capture for debugging


### Circuit Breaker Pattern - Error Handling

In [2]:
# Circuit Breaker Implementation
class CircuitState(Enum):
    """States of a CircuitBreaker."""
    CLOSED = "closed"  # calls are allowed; failures count toward the trip threshold
    OPEN = "open"  # calls are rejected until the recovery timeout has elapsed
    HALF_OPEN = "half_open"  # calls are allowed as recovery probes; a failure re-opens the circuit

@dataclass
class CircuitBreakerConfig:
    """Circuit breaker configuration"""
    failure_threshold: int = 5  # failure count at which a CLOSED breaker trips to OPEN
    recovery_timeout: float = 30.0  # seconds after the last failure before an OPEN breaker moves to HALF_OPEN
    success_threshold: int = 3  # successes needed in HALF_OPEN to return to CLOSED
    timeout: float = 10.0  # per-call timeout in seconds, passed to asyncio.wait_for

class CircuitBreakerOpenError(Exception):
    """Raised by CircuitBreaker.execute when a call is rejected because the circuit is OPEN.

    Subclasses Exception, so existing ``except Exception`` handlers keep
    working, while new callers can tell a rejection from a service failure.
    """


class CircuitBreaker:
    """Production circuit breaker for service protection.

    State machine:
        CLOSED    -> OPEN       when failure_count reaches config.failure_threshold
        OPEN      -> HALF_OPEN  once config.recovery_timeout seconds have passed
                                since the last recorded failure
        HALF_OPEN -> CLOSED     after config.success_threshold successes
        HALF_OPEN -> OPEN       on any failure

    All state changes are made while holding ``self.lock``.
    """

    def __init__(self, name: str, config: Optional[CircuitBreakerConfig] = None):
        """Create a breaker in the CLOSED state.

        Args:
            name: Label used in log messages and error-tracker component names.
            config: Thresholds and timeouts; a default CircuitBreakerConfig
                is used when omitted.
        """
        self.name = name
        self.config = config or CircuitBreakerConfig()
        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        self.last_failure_time = None
        self.lock = threading.Lock()

    def can_execute(self) -> bool:
        """Check if request can be executed.

        Side effect: an OPEN breaker whose recovery timeout has elapsed is
        moved to HALF_OPEN (and its success counter reset) by this call.
        """
        with self.lock:
            if self.state == CircuitState.CLOSED:
                return True
            elif self.state == CircuitState.OPEN:
                if time.time() - self.last_failure_time >= self.config.recovery_timeout:
                    self.state = CircuitState.HALF_OPEN
                    self.success_count = 0
                    logger.info(f"Circuit breaker {self.name} transitioning to HALF_OPEN")
                    return True
                return False
            else:  # HALF_OPEN
                return True

    def record_success(self):
        """Record successful execution.

        In HALF_OPEN this counts toward closing the circuit; in CLOSED it
        decrements the failure counter by one (never below zero).
        """
        with self.lock:
            if self.state == CircuitState.HALF_OPEN:
                self.success_count += 1
                if self.success_count >= self.config.success_threshold:
                    self.state = CircuitState.CLOSED
                    self.failure_count = 0
                    logger.info(f"Circuit breaker {self.name} recovered to CLOSED")
            elif self.state == CircuitState.CLOSED:
                self.failure_count = max(0, self.failure_count - 1)

    def record_failure(self):
        """Record failed execution and trip or re-open the circuit when warranted."""
        with self.lock:
            self.failure_count += 1
            self.last_failure_time = time.time()

            if self.state == CircuitState.HALF_OPEN:
                self.state = CircuitState.OPEN
                logger.warning(f"Circuit breaker {self.name} failed during recovery, back to OPEN")
            elif self.state == CircuitState.CLOSED and self.failure_count >= self.config.failure_threshold:
                self.state = CircuitState.OPEN
                logger.error(f"Circuit breaker {self.name} tripped to OPEN after {self.failure_count} failures")

    async def execute(self, func: Callable, *args, **kwargs):
        """Execute function with circuit breaker protection.

        Args:
            func: Callable returning an awaitable; invoked as func(*args, **kwargs).
            *args, **kwargs: Forwarded to ``func``.

        Returns:
            Whatever the awaited call returns.

        Raises:
            CircuitBreakerOpenError: The circuit is OPEN and the call was not
                attempted. Not recorded as a failure.
            Exception: Anything raised by the call, including the timeout
                error from asyncio.wait_for after config.timeout seconds.
                It is recorded as a failure, logged to the error tracker
                with HIGH severity, and re-raised.
        """
        if not self.can_execute():
            # Dedicated type so callers can tell "rejected" from "service failed"
            raise CircuitBreakerOpenError(f"Circuit breaker {self.name} is OPEN")

        try:
            result = await asyncio.wait_for(func(*args, **kwargs), timeout=self.config.timeout)
            self.record_success()
            return result
        except Exception as e:
            self.record_failure()
            error_tracker.log_error(f"circuit_breaker_{self.name}", e, ErrorSeverity.HIGH)
            raise

# Test circuit breaker
async def unreliable_service(fail_rate: float = 0.3):
    """Stand-in for a flaky dependency, used to exercise the circuit breaker.

    Waits 0.1 s, then fails whenever the fractional part of the current
    wall-clock second is below ``fail_rate``; otherwise returns a fixed string.
    """
    await asyncio.sleep(0.1)
    fraction_of_second = time.time() % 1
    should_fail = fraction_of_second < fail_rate
    if not should_fail:
        return "Service response"
    raise Exception("Service temporarily unavailable")

# One breaker per protected dependency, each with the default configuration
api_circuit_breaker = CircuitBreaker(name="external_api")
db_circuit_breaker = CircuitBreaker(name="database")

# Status summary written in one call
print("\n".join([
    "\nCircuit breaker pattern implemented:",
    "Automatic failure detection and recovery",
    "Configurable thresholds and timeouts",
    "State transition monitoring",
]))


Circuit breaker pattern implemented:
Automatic failure detection and recovery
Configurable thresholds and timeouts
State transition monitoring


## Production Monitoring & Health Checks

In [3]:
# Production Monitoring System
@dataclass
class HealthStatus:
    """Component health status"""
    component: str  # name the health check was registered under
    status: str  # healthy, degraded, unhealthy, or "unknown" when no check is registered
    last_check: str  # datetime.now().isoformat() taken when the status was produced
    response_time_ms: float  # how long the health check took, in milliseconds
    details: Dict[str, Any] = field(default_factory=dict)  # check result, or {"error": ...} on failure

@dataclass
class SystemMetrics:
    """System performance metrics.

    Snapshots are produced by ProductionMonitor.get_current_metrics.
    """
    timestamp: str  # datetime.now().isoformat() taken when the snapshot was built
    requests_per_minute: int
    average_response_time: float  # milliseconds; durations are recorded in ms
    error_rate: float  # errors divided by requests
    active_connections: int  # get_current_metrics fills this with a simulated constant
    memory_usage_mb: float  # get_current_metrics fills this with a simulated constant
    cpu_usage_percent: float  # get_current_metrics fills this with a simulated constant

class ProductionMonitor:
    """Enterprise monitoring and observability.

    Runs registered component health checks, records per-request timings,
    and derives request metrics over a rolling one-minute window.

    Attributes:
        health_checks: Maps component name to its async health-check callable.
        metrics_history: The last 100 SystemMetrics snapshots produced by
            get_current_metrics.
        request_times: The last 1000 recorded request durations (ms).
        request_count: Total number of requests recorded.
        start_time: Wall-clock creation time, used for uptime reporting.
    """

    # Length of the rolling window behind requests_per_minute and error_rate
    _METRICS_WINDOW_SECONDS = 60.0

    def __init__(self):
        self.health_checks = {}
        self.metrics_history = deque(maxlen=100)
        self.request_times = deque(maxlen=1000)
        self.request_count = 0
        self.start_time = time.time()
        # (monotonic timestamp, duration_ms, success) for requests inside the
        # rolling window; pruned by age rather than by length
        self._request_log = deque()

    def register_health_check(self, component: str, check_func: Callable):
        """Register component health check.

        Args:
            component: Name to register the check under; an existing
                registration with the same name is replaced.
            check_func: Zero-argument async callable. It should return a
                details dict (or None) when healthy and raise when not.
        """
        self.health_checks[component] = check_func
        logger.info(f"Registered health check for {component}")

    async def check_component_health(self, component: str) -> HealthStatus:
        """Check individual component health.

        Returns a HealthStatus whose status is "unknown" when no check is
        registered, "healthy" when the check returns, and "unhealthy" when
        it raises. A failing check is also logged to the error tracker with
        MEDIUM severity.
        """
        if component not in self.health_checks:
            return HealthStatus(
                component=component,
                status="unknown",
                last_check=datetime.now().isoformat(),
                response_time_ms=0,
                details={"error": "No health check registered"}
            )

        # perf_counter is monotonic, so a clock adjustment cannot skew the duration
        started = time.perf_counter()
        try:
            check_func = self.health_checks[component]
            result = await check_func()
            response_time = (time.perf_counter() - started) * 1000

            return HealthStatus(
                component=component,
                status="healthy",
                last_check=datetime.now().isoformat(),
                response_time_ms=response_time,
                details=result or {}
            )
        except Exception as e:
            response_time = (time.perf_counter() - started) * 1000
            error_tracker.log_error(f"health_check_{component}", e, ErrorSeverity.MEDIUM)

            return HealthStatus(
                component=component,
                status="unhealthy",
                last_check=datetime.now().isoformat(),
                response_time_ms=response_time,
                details={"error": str(e)}
            )

    async def get_system_health(self) -> Dict[str, Any]:
        """Get overall system health status.

        Runs every registered check in registration order. The overall
        status is "unhealthy" if any component is unhealthy, otherwise
        "degraded" if any is degraded, otherwise "healthy".

        Returns:
            A dict with "overall_status", "timestamp", "components"
            (name -> HealthStatus) and "uptime_seconds".
        """
        component_health = {}
        overall_status = "healthy"

        for component in self.health_checks:
            health = await self.check_component_health(component)
            component_health[component] = health

            if health.status == "unhealthy":
                overall_status = "unhealthy"
            elif health.status == "degraded" and overall_status == "healthy":
                overall_status = "degraded"

        return {
            "overall_status": overall_status,
            "timestamp": datetime.now().isoformat(),
            "components": component_health,
            "uptime_seconds": time.time() - self.start_time
        }

    def _prune_request_log(self, now: float) -> None:
        """Drop request-log entries older than the metrics window."""
        cutoff = now - self._METRICS_WINDOW_SECONDS
        while self._request_log and self._request_log[0][0] <= cutoff:
            self._request_log.popleft()

    def record_request(self, duration_ms: float, success: bool = True):
        """Record request metrics.

        Args:
            duration_ms: How long the request took, in milliseconds.
            success: False marks the request as failed; a failure is also
                logged to the error tracker with LOW severity.
        """
        self.request_times.append(duration_ms)
        self.request_count += 1

        now = time.monotonic()
        self._request_log.append((now, duration_ms, success))
        self._prune_request_log(now)

        if not success:
            error_tracker.log_error("request_processing", Exception("Request failed"), ErrorSeverity.LOW)

    def get_current_metrics(self) -> SystemMetrics:
        """Get current system metrics.

        requests_per_minute, average_response_time and error_rate cover the
        requests recorded in the last 60 seconds. error_rate is failed
        requests divided by all requests in that window, so it stays within
        0..1 and is unaffected by errors other components log.

        Side effect: the returned snapshot is appended to metrics_history.
        """
        self._prune_request_log(time.monotonic())
        window = list(self._request_log)

        requests_per_minute = len(window)
        if window:
            avg_response_time = sum(duration for _, duration, _ in window) / requests_per_minute
            failed_requests = sum(1 for _, _, succeeded in window if not succeeded)
            error_rate = failed_requests / requests_per_minute
        else:
            avg_response_time = 0.0
            error_rate = 0.0

        metrics = SystemMetrics(
            timestamp=datetime.now().isoformat(),
            requests_per_minute=requests_per_minute,
            average_response_time=avg_response_time,
            error_rate=error_rate,
            active_connections=10,  # Simulated
            memory_usage_mb=150.5,  # Simulated
            cpu_usage_percent=25.3   # Simulated
        )

        self.metrics_history.append(metrics)
        return metrics

# Module-level monitor instance; the health checks defined below are registered on it
monitor = ProductionMonitor()

# Register health checks
async def database_health_check():
    """Simulated database connectivity probe.

    Waits 10 ms in place of a real query, then reports fixed pool and
    query-time figures.
    """
    await asyncio.sleep(0.01)
    report = dict(connection_pool="active", query_time_ms=15)
    return report

async def memory_health_check():
    """Simulated memory probe; always reports the same fixed figures."""
    report = dict(usage_percent=45, available_mb=2048)
    return report

async def external_api_health_check():
    """Simulated external API probe.

    Waits 20 ms in place of a real call, then raises during the first
    second of every ten-second wall-clock span and otherwise reports a
    fixed operational status.
    """
    await asyncio.sleep(0.02)
    in_failure_window = (time.time() % 10) < 1
    if not in_failure_window:
        return dict(status="operational", latency_ms=120)
    raise Exception("API timeout")

# Attach each probe to the monitor under its component name
monitor.register_health_check(component="database", check_func=database_health_check)
monitor.register_health_check(component="memory", check_func=memory_health_check)
monitor.register_health_check(component="external_api", check_func=external_api_health_check)

# Status summary written in one call
print("\n".join([
    "\nProduction monitoring system ready:",
    "Component health checks registered",
    "Metrics collection and analysis",
    "System observability dashboard",
]))

2025-07-28 23:43:58,205 - __main__ - INFO - Registered health check for database
2025-07-28 23:43:58,206 - __main__ - INFO - Registered health check for memory
2025-07-28 23:43:58,207 - __main__ - INFO - Registered health check for external_api



Production monitoring system ready:
Component health checks registered
Metrics collection and analysis
System observability dashboard


## Rate Limiting & Resource Protection

In [4]:
# Rate Limiting and Resource Protection
from collections import defaultdict

@dataclass
class RateLimitConfig:
    """Rate limiting configuration"""
    requests_per_minute: int = 60  # token refill rate; RateLimiter adds requests_per_minute / 60 tokens per second
    requests_per_hour: int = 1000  # maximum allowed requests per identifier in any trailing 3600 s
    burst_size: int = 10  # bucket capacity, and the token count a new bucket starts with

class RateLimiter:
    """Token bucket rate limiter for request control.

    Each identifier gets its own bucket holding up to config.burst_size
    tokens, refilled at config.requests_per_minute / 60 tokens per second,
    plus a trailing one-hour request history checked against
    config.requests_per_hour.

    Attributes:
        config: The RateLimitConfig supplied at construction.
        buckets: identifier -> {'tokens': float, 'last_refill': timestamp}.
        request_history: identifier -> list of timestamps of allowed requests.
    """

    def __init__(self, config: RateLimitConfig):
        self.config = config
        # A new identifier starts with a full bucket
        self.buckets = defaultdict(lambda: {
            'tokens': config.burst_size,
            'last_refill': time.time()
        })
        self.request_history = defaultdict(list)

    def _refill_bucket(self, bucket_key: str):
        """Refill token bucket based on time elapsed"""
        bucket = self.buckets[bucket_key]
        now = time.time()
        # time.time() can step backwards when the system clock is adjusted.
        # Clamp so a backwards step never removes tokens from the bucket.
        elapsed = max(0.0, now - bucket['last_refill'])

        # Add tokens based on rate (tokens per second)
        tokens_to_add = elapsed * (self.config.requests_per_minute / 60.0)
        bucket['tokens'] = min(self.config.burst_size, bucket['tokens'] + tokens_to_add)
        bucket['last_refill'] = now

    def is_allowed(self, identifier: str) -> bool:
        """Check if request is allowed under rate limits.

        A request is allowed when the identifier's bucket holds at least one
        token and fewer than config.requests_per_hour requests were allowed
        in the trailing hour. An allowed request consumes one token and is
        added to the history. A rejection consumes nothing and is logged to
        the error tracker (LOW for the token limit, MEDIUM for the hourly
        limit).

        Returns:
            True if the request may proceed, False if it is rate limited.
        """
        # Check token bucket (burst protection)
        self._refill_bucket(identifier)
        bucket = self.buckets[identifier]

        if bucket['tokens'] < 1:
            error_tracker.log_error("rate_limiter", Exception(f"Rate limit exceeded for {identifier}"), ErrorSeverity.LOW)
            return False

        # Check hourly limit, dropping history entries older than one hour
        now = time.time()
        hour_ago = now - 3600
        self.request_history[identifier] = [t for t in self.request_history[identifier] if t > hour_ago]

        if len(self.request_history[identifier]) >= self.config.requests_per_hour:
            error_tracker.log_error("rate_limiter", Exception(f"Hourly limit exceeded for {identifier}"), ErrorSeverity.MEDIUM)
            return False

        # Consume token and record request
        bucket['tokens'] -= 1
        self.request_history[identifier].append(now)
        return True

    def get_rate_limit_status(self, identifier: str) -> Dict[str, Any]:
        """Get current rate limit status for identifier.

        Refills the bucket first, so the reported token count is current.

        Returns:
            A dict with "available_tokens" (whole tokens), "max_burst",
            "hourly_requests", "hourly_limit" and "requests_remaining"
            (never negative).
        """
        self._refill_bucket(identifier)
        bucket = self.buckets[identifier]

        hour_ago = time.time() - 3600
        hourly_requests = sum(1 for t in self.request_history[identifier] if t > hour_ago)

        return {
            "available_tokens": int(bucket['tokens']),
            "max_burst": self.config.burst_size,
            "hourly_requests": hourly_requests,
            "hourly_limit": self.config.requests_per_hour,
            # Clamp in case the configured limit was lowered below current usage
            "requests_remaining": max(0, self.config.requests_per_hour - hourly_requests)
        }

class ResourceManager:
    """System resource management and protection.

    Tracks how many requests are in flight against a concurrency cap and
    toggles a degraded-mode flag based on the current load.

    Attributes:
        active_requests: Number of slots currently held.
        max_concurrent_requests: Concurrency cap.
        request_queue_size: Stored for callers; no method of this class reads it.
        degraded_mode: Set by check_system_load above 90% load and cleared
            below 70%.
    """

    def __init__(self, max_concurrent_requests: int = 100, request_queue_size: int = 50):
        """Create a manager with no active requests.

        Args:
            max_concurrent_requests: Concurrency cap; must be at least 1.
            request_queue_size: Value stored on the instance.

        Raises:
            ValueError: If max_concurrent_requests is less than 1.
        """
        if max_concurrent_requests < 1:
            raise ValueError("max_concurrent_requests must be at least 1")

        self.active_requests = 0
        self.max_concurrent_requests = max_concurrent_requests
        self.request_queue_size = request_queue_size
        self.degraded_mode = False

    async def acquire_request_slot(self, priority: str = "normal") -> bool:
        """Acquire slot for request processing.

        Args:
            priority: "high" requests poll for a free slot for up to one
                second when the cap is reached; any other value is rejected
                immediately in that case.

        Returns:
            True if a slot was taken (the caller must later call
            release_request_slot), False if the request was rejected.
        """
        if self.active_requests >= self.max_concurrent_requests:
            if priority != "high":
                return False

            # High priority requests can queue briefly
            for _ in range(10):  # Wait up to 1 second
                await asyncio.sleep(0.1)
                if self.active_requests < self.max_concurrent_requests:
                    break
            else:
                # No slot became free within the wait budget
                return False

        self.active_requests += 1
        return True

    def release_request_slot(self):
        """Release request processing slot; the count never drops below zero."""
        self.active_requests = max(0, self.active_requests - 1)

    def check_system_load(self) -> Dict[str, Any]:
        """Check current system load and status.

        Side effect: enters degraded mode when load exceeds 90% and leaves
        it when load falls below 70%; between the two the flag is unchanged.

        Returns:
            A dict with "active_requests", "max_concurrent",
            "load_percentage", "degraded_mode" and "status" ("overloaded"
            above 90% load, otherwise "normal").
        """
        capacity = self.max_concurrent_requests
        if capacity > 0:
            load_percentage = (self.active_requests / capacity) * 100
        else:
            # The attribute may have been set to 0 after construction;
            # report full load instead of dividing by zero
            load_percentage = 100.0

        # Automatically enter degraded mode if overloaded
        if load_percentage > 90 and not self.degraded_mode:
            self.degraded_mode = True
            logger.warning("System entering degraded mode due to high load")
        elif load_percentage < 70 and self.degraded_mode:
            self.degraded_mode = False
            logger.info("System exiting degraded mode")

        return {
            "active_requests": self.active_requests,
            "max_concurrent": self.max_concurrent_requests,
            "load_percentage": load_percentage,
            "degraded_mode": self.degraded_mode,
            "status": "overloaded" if load_percentage > 90 else "normal"
        }

# Initialize protection systems
# The limiter refills at 100 requests/minute with a 5000/hour cap; burst_size keeps its default of 10
rate_limiter = RateLimiter(RateLimitConfig(requests_per_minute=100, requests_per_hour=5000))
resource_manager = ResourceManager()

print("\nResource protection systems active:")
print("Token bucket rate limiting")
print("Concurrent request management")
print("Automatic degraded mode activation")

# Test rate limiting
# Five back-to-back requests for one identifier: each allowed request consumes one token
test_user = "user123"
print(f"\nTesting rate limiter for {test_user}:")
for i in range(5):
    allowed = rate_limiter.is_allowed(test_user)
    # Status is read after the request, so the token count already reflects it
    status = rate_limiter.get_rate_limit_status(test_user)
    print(f"   Request {i+1}: {'Allowed' if allowed else 'Blocked'} (tokens: {status['available_tokens']})")

# Final snapshot of the limiter state and the resource manager's load
print(f"\nRate limit status: {rate_limiter.get_rate_limit_status(test_user)}")
print(f"System load: {resource_manager.check_system_load()}")


Resource protection systems active:
Token bucket rate limiting
Concurrent request management
Automatic degraded mode activation

Testing rate limiter for user123:
   Request 1: Allowed (tokens: 9)
   Request 2: Allowed (tokens: 8)
   Request 3: Allowed (tokens: 7)
   Request 4: Allowed (tokens: 6)
   Request 5: Allowed (tokens: 5)

Rate limit status: {'available_tokens': 5, 'max_burst': 10, 'hourly_requests': 5, 'hourly_limit': 5000, 'requests_remaining': 4995}
System load: {'active_requests': 0, 'max_concurrent': 100, 'load_percentage': 0.0, 'degraded_mode': False, 'status': 'normal'}
