# 🤖 Production Chatbot MLOps with Seldon Core 2
**Enhanced Version with Best Practices and Troubleshooting**

## 🎯 Overview

This enhanced notebook demonstrates a production-ready chatbot deployment using Seldon Core 2, incorporating:

- **✅ Verified Working Components**: Individual model serving with proven inference
- **🔧 Infrastructure Checks**: Comprehensive prerequisite validation
- **📊 Real-time Monitoring**: Prometheus metrics and Grafana dashboards
- **🚨 Troubleshooting**: Built-in diagnostics and common issue resolution
- **🏗️ Best Practices**: Production-ready configurations and patterns

## 📋 Prerequisites Validation

Before starting, this notebook will verify:
- Kubernetes cluster connectivity
- Seldon Core 2 CRDs installation
- Istio service mesh
- Gateway configuration
- Scheduler, dataflow engine, and Kafka pod status

In [None]:
# Enhanced setup with comprehensive checks
import json
import subprocess
import time
import requests
import os
import sys
from IPython.display import display, Markdown, Code, HTML
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Tuple
from datetime import datetime
import random

# Prefer the real numpy; if it is missing, expose a tiny stand-in under the
# same name that offers only the two helpers below.
try:
    import numpy as np
except ImportError:
    class np:
        """Minimal pure-Python substitute used when numpy cannot be imported."""

        @staticmethod
        def mean(values):
            """Return the arithmetic mean of *values*, or 0 when it is empty."""
            if not values:
                return 0
            return sum(values) / len(values)

        @staticmethod
        def percentile(values, percentile):
            """Return the element at the requested percentile rank (0 if empty).

            The rank is truncated to an index and clamped to the last element;
            no interpolation is performed.
            """
            if not values:
                return 0
            ordered = sorted(values)
            last = len(ordered) - 1
            rank = int(len(ordered) * percentile / 100)
            return ordered[rank if rank < last else last]

@dataclass
class Config:
    """Connection settings shared by the notebook cells."""
    namespace: str = "seldon-mesh"  # Use existing namespace to avoid connectivity issues
    # Address used to build inference URLs; None until check_prerequisites sets it
    gateway_ip: Optional[str] = None
    # Port appended to gateway_ip in inference URLs (kept as a string)
    gateway_port: str = "80"
    # Not read by any of the cells visible in this notebook
    use_existing_namespace: bool = True
    
@dataclass
class DeploymentStatus:
    """Track deployment status and issues"""
    # Per-resource flag maps keyed by name; not populated by the visible cells
    servers: Dict[str, bool] = field(default_factory=dict)
    models: Dict[str, bool] = field(default_factory=dict)
    pipelines: Dict[str, bool] = field(default_factory=dict)
    # Blocking problems; diagnose_infrastructure reports healthy only when empty
    issues: List[str] = field(default_factory=list)
    # Non-blocking problems, listed in the diagnostics summary
    warnings: List[str] = field(default_factory=list)

# Shared notebook state: connection settings, collected issues/warnings, and
# the names of resources created by this notebook (read by the inference
# tests and by cleanup).
config = Config()
status = DeploymentStatus()
deployed = {"servers": [], "models": [], "pipelines": [], "experiments": []}

def run_cmd(cmd: str, timeout: int = 30) -> subprocess.CompletedProcess:
    """Execute *cmd* through the shell and always return a CompletedProcess.

    Output is captured as text. A timeout or any other exception is turned
    into a synthetic result with return code 1, empty stdout and the failure
    text in stderr, so callers only ever inspect the returned object.
    """
    try:
        return subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_text = f"Command timed out after {timeout}s"
    except Exception as exc:
        failure_text = str(exc)
    return subprocess.CompletedProcess(cmd, 1, "", failure_text)

def log(msg: str, level: str = "INFO"):
    """Render *msg* in the notebook as bold, colored Markdown with a level icon.

    Unknown levels fall back to a generic icon and the color blue.
    """
    level_icons = {"INFO": "ℹ️", "SUCCESS": "✅", "WARNING": "⚠️", "ERROR": "❌", "DEBUG": "🔍"}
    level_colors = {"SUCCESS": "green", "WARNING": "orange", "ERROR": "red"}

    icon = level_icons[level] if level in level_icons else "📝"
    color = level_colors[level] if level in level_colors else "blue"

    html = f"<span style='color: {color}'>{icon} **{msg}**</span>"
    display(Markdown(html))

def check_prerequisites() -> bool:
    """Verify the cluster tooling and Seldon components the notebook relies on.

    Checks, in order: the kubectl client, the Seldon CRDs, the istio-system
    namespace, the Istio ingress gateway service (to fill in
    ``config.gateway_ip``) and the scheduler / dataflow / kafka pods in
    ``config.namespace``.

    Hard failures are appended to ``status.issues`` and make the function
    return False; soft problems are appended to ``status.warnings`` or only
    logged.

    Returns:
        True when kubectl, every CRD and every component check passed.
    """
    log("Checking prerequisites...", "INFO")
    all_good = True

    # Check kubectl
    result = run_cmd("kubectl version --client -o json")
    if result.returncode != 0:
        log("kubectl not found or not configured", "ERROR")
        status.issues.append("kubectl not available")
        all_good = False
    else:
        log("kubectl configured", "SUCCESS")

    # Check Seldon CRDs; tracked separately so the success message reflects
    # the CRDs only, not the outcome of the kubectl check above.
    missing_crds = []
    for crd in ("servers", "models", "pipelines", "experiments"):
        result = run_cmd(f"kubectl get crd {crd}.mlops.seldon.io")
        if result.returncode != 0:
            log(f"CRD {crd}.mlops.seldon.io not found", "ERROR")
            status.issues.append(f"Missing CRD: {crd}")
            missing_crds.append(crd)

    if missing_crds:
        all_good = False
    else:
        log("All Seldon CRDs present", "SUCCESS")

    # Check Istio
    result = run_cmd("kubectl get ns istio-system")
    if result.returncode != 0:
        log("Istio not installed", "WARNING")
        status.warnings.append("Istio not found - external access may not work")
    else:
        log("Istio installed", "SUCCESS")

    # Check gateway. When the service lookup itself fails, gateway_ip is left
    # untouched (None) and the inference helpers report that no gateway is set.
    result = run_cmd("kubectl get svc istio-ingressgateway -n istio-system -o json")
    if result.returncode == 0 and result.stdout:
        address = None
        try:
            svc_data = json.loads(result.stdout)
            ingress = svc_data.get("status", {}).get("loadBalancer", {}).get("ingress") or []
            if ingress:
                # A load balancer ingress entry may carry a DNS hostname
                # instead of an IP; accept either as the gateway address.
                address = ingress[0].get("ip") or ingress[0].get("hostname")
        except (ValueError, AttributeError, TypeError) as exc:
            log(f"Could not parse gateway service: {exc}", "WARNING")

        if address:
            config.gateway_ip = address
            log(f"Gateway IP: {config.gateway_ip}", "SUCCESS")
        else:
            config.gateway_ip = "localhost"
            log("No external gateway IP, using localhost", "WARNING")

    # Check Seldon components
    components = {
        "scheduler": "seldon-scheduler",
        "dataflow": "seldon-dataflow-engine",
        "kafka": "seldon-kafka"
    }

    for name, pod_prefix in components.items():
        result = run_cmd(f"kubectl get pods -n {config.namespace} | grep {pod_prefix} | grep Running | wc -l")
        count_text = result.stdout.strip()
        if result.returncode != 0 or not count_text.isdigit():
            # The probe itself failed (for example it timed out), so the pod
            # state is unknown; surface that instead of skipping silently.
            log(f"{name}: could not determine pod status", "WARNING")
            status.warnings.append(f"{name} status unknown")
            continue

        count = int(count_text)
        if count > 0:
            log(f"{name}: {count} pod(s) running", "SUCCESS")
        else:
            log(f"{name}: not running", "ERROR")
            status.issues.append(f"{name} not running")
            all_good = False

    return all_good

# Run prerequisite checks
prereqs_ok = check_prerequisites()

if not prereqs_ok:
    log("Prerequisites not met. Please address the issues above.", "ERROR")
    log("Common fixes:", "INFO")
    # The list is kept flush-left on purpose: Markdown treats lines indented
    # by four spaces as a code block, which would hide the bullet formatting.
    display(Markdown(f"""
- Install Seldon Core 2: `helm install seldon-core seldon-charts/seldon-core-v2-setup`
- Install Istio: `istioctl install --set values.pilot.env.PILOT_ENABLE_WORKLOAD_ENTRY_AUTOREGISTRATION=true`
- Check namespace exists: `kubectl get ns {config.namespace}`
"""))
else:
    log("All prerequisites satisfied!", "SUCCESS")

log(f"Configuration: Gateway={config.gateway_ip}:{config.gateway_port}, Namespace={config.namespace}")

## 🔍 Infrastructure Diagnostics

Before deploying, let's run comprehensive diagnostics:

In [None]:
def diagnose_infrastructure():
    """Run best-effort infrastructure diagnostics and report the findings.

    Looks for errors in the dataflow engine logs, counts Kafka topics, lists
    the Seldon servers with their loaded-model and replica counts, and finally
    runs a sample inference. Findings are logged; soft problems are appended
    to ``status.warnings``.

    Returns:
        True when ``status.issues`` is empty after the diagnostics ran.
    """
    log("Running infrastructure diagnostics...", "INFO")

    # Check dataflow engine connectivity: any output means grep matched "error"
    result = run_cmd(f"kubectl logs -n {config.namespace} -l app.kubernetes.io/name=seldon-dataflow-engine --tail=20 | grep -i error")
    if result.stdout:
        log("Dataflow engine errors detected:", "WARNING")
        display(Code(result.stdout[:500], language='text'))
        status.warnings.append("Dataflow engine has errors - pipelines may not work")

    # Check Kafka topics
    result = run_cmd(f"kubectl exec -n {config.namespace} seldon-kafka-0 -c kafka -- kafka-topics.sh --list --bootstrap-server localhost:9092 2>/dev/null | wc -l")
    if result.returncode == 0:
        count_text = result.stdout.strip()
        topic_count = int(count_text) if count_text.isdigit() else 0
        if topic_count == 0:
            log("No Kafka topics found - pipelines may not work", "WARNING")
            status.warnings.append("Kafka has no topics")
        else:
            log(f"Kafka has {topic_count} topics", "SUCCESS")

    # Check server capacity
    result = run_cmd(f"kubectl get servers -n {config.namespace} -o json")
    if result.returncode == 0 and result.stdout:
        try:
            servers = json.loads(result.stdout).get("items") or []
        except (ValueError, AttributeError) as exc:
            log(f"Could not parse server list: {exc}", "WARNING")
            servers = []

        for server in servers:
            if not isinstance(server, dict):
                continue
            # Read each field defensively so one incomplete entry does not
            # hide the remaining servers.
            name = (server.get("metadata") or {}).get("name", "<unknown>")
            loaded = (server.get("status") or {}).get("loadedModels", 0)
            # Fall back to 1 replica, the same fallback check_server_capacity uses
            replicas = (server.get("spec") or {}).get("replicas", 1)
            log(f"Server {name}: {loaded} models loaded, {replicas} replicas", "INFO")

    # Test simple inference (result is informational only)
    log("Testing model inference...", "INFO")
    test_inference_health()

    return not status.issues

def test_inference_health():
    """Send one sample request to the first Ready model to confirm inference works.

    Returns:
        True only when a Ready model exists and its infer endpoint answers
        HTTP 200; False otherwise (no gateway, no Ready model, or a request
        or parse failure).
    """
    if not config.gateway_ip:
        log("No gateway IP available for testing", "WARNING")
        return False

    # Try to find an existing model
    result = run_cmd(f"kubectl get models -n {config.namespace} -o json")
    if result.returncode != 0 or not result.stdout:
        return False

    try:
        models = json.loads(result.stdout).get("items") or []
        ready_models = [m for m in models if m.get("status", {}).get("state") == "Ready"]
        test_model = ready_models[0]["metadata"]["name"] if ready_models else None
    except (ValueError, KeyError, AttributeError, TypeError) as exc:
        log(f"Could not parse model list: {exc}", "WARNING")
        return False

    if test_model is None:
        log("No Ready model available for an inference test", "INFO")
        return False

    url = f"http://{config.gateway_ip}:{config.gateway_port}/v2/models/{test_model}/infer"
    payload = {
        "inputs": [{
            "name": "predict",
            "shape": [1, 4],
            "datatype": "FP32",
            "data": [[5.1, 3.5, 1.4, 0.2]]
        }]
    }
    headers = {
        "Content-Type": "application/json",
        "Seldon-Model": test_model
    }
    # Send the same Host header as test_model_inference when the gateway is
    # not local, so both probes go through the gateway the same way.
    if config.gateway_ip not in ["localhost", "127.0.0.1"]:
        headers["Host"] = f"{config.namespace}.inference.seldon.test"

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        log("Could not connect to inference endpoint", "WARNING")
        return False

    if response.status_code == 200:
        log(f"Inference test successful with model {test_model}", "SUCCESS")
        return True

    log(f"Inference test failed: {response.status_code}", "WARNING")
    return False

# Run diagnostics
diag_ok = diagnose_infrastructure()

# Summary: list collected issues first, then warnings, one bullet per entry
for heading, level, bullet, entries in (
    ("Issues found:", "ERROR", "❌", status.issues),
    ("Warnings:", "WARNING", "⚠️", status.warnings),
):
    if entries:
        log(heading, level)
        for entry in entries:
            display(Markdown(f"- {bullet} {entry}"))

if not (status.issues or status.warnings):
    log("Infrastructure healthy!", "SUCCESS")

## 🤖 Chatbot Model Deployment

Deploy chatbot models with proper error handling and capacity checking:

In [None]:
def check_server_capacity(server_name: str, models_per_replica: int = 2) -> Tuple[bool, int]:
    """Estimate whether a Seldon server can take another model.

    Capacity is a heuristic: ``replicas * models_per_replica`` minus the
    server's reported ``status.loadedModels``.

    Args:
        server_name: Name of the Server resource in ``config.namespace``.
        models_per_replica: Assumed number of models one replica can hold.

    Returns:
        ``(has_capacity, available_slots)``. ``(False, 0)`` when the server
        cannot be queried or its JSON cannot be interpreted.
    """
    result = run_cmd(f"kubectl get server {server_name} -n {config.namespace} -o json")
    if result.returncode != 0 or not result.stdout:
        return False, 0

    try:
        server = json.loads(result.stdout)
        loaded = server.get("status", {}).get("loadedModels", 0)
        replicas = server.get("spec", {}).get("replicas", 1)
        available = replicas * models_per_replica - loaded
    except (ValueError, AttributeError, TypeError) as exc:
        # Report the parse problem instead of swallowing it silently
        log(f"Could not read capacity of server {server_name}: {exc}", "WARNING")
        return False, 0

    return available > 0, available

def deploy_model_safe(name: str, uri: str, memory: str = "500Mi") -> bool:
    """Deploy a Seldon Model and wait until it reports Ready.

    An already existing model is left untouched and counts as success. For a
    new model the function checks capacity on the ``mlserver`` server,
    applies a generated manifest and polls the model state for about a minute.

    Args:
        name: Model resource name.
        uri: Storage URI of the model artifact.
        memory: Memory request written into the manifest.

    Returns:
        True if the model already existed or became Ready; False on missing
        capacity, a failed ``kubectl apply``, a Failed state or a timeout.
    """
    import tempfile  # local import: only this function needs it

    log(f"Deploying model: {name}", "INFO")

    # Check if model already exists
    result = run_cmd(f"kubectl get model {name} -n {config.namespace}")
    if result.returncode == 0:
        log(f"Model {name} already exists", "WARNING")
        return True

    # Check server capacity
    has_capacity, slots = check_server_capacity("mlserver")
    if not has_capacity:
        log("No server capacity available (0 slots)", "ERROR")
        status.issues.append(f"Cannot deploy {name}: no server capacity")
        return False

    log(f"Server has {slots} available slots", "INFO")

    # Deploy model
    model_yaml = f"""apiVersion: mlops.seldon.io/v1alpha1
kind: Model
metadata:
  name: {name}
  namespace: {config.namespace}
spec:
  storageUri: {uri}
  requirements: ["sklearn"]
  memory: {memory}
"""

    # Write the manifest to a uniquely named temp file (not a predictable
    # /tmp path) and remove it once kubectl has read it.
    with tempfile.NamedTemporaryFile("w", prefix=f"{name}-", suffix=".yaml", delete=False) as manifest:
        manifest.write(model_yaml)
    try:
        result = run_cmd(f"kubectl apply -f {manifest.name}")
    finally:
        os.unlink(manifest.name)

    if result.returncode != 0:
        log(f"Failed to deploy {name}: {result.stderr}", "ERROR")
        return False

    # Wait for model to be ready: 30 polls, 2 seconds apart
    log(f"Waiting for {name} to be ready...", "INFO")
    for _ in range(30):
        result = run_cmd(f"kubectl get model {name} -n {config.namespace} -o json")
        if result.returncode == 0 and result.stdout:
            try:
                model_status = json.loads(result.stdout).get("status") or {}
                state = model_status.get("state", "")
            except (ValueError, AttributeError):
                # Unparseable output: treat as "not ready yet" and keep polling
                model_status, state = {}, ""

            if state == "Ready":
                log(f"Model {name} is ready!", "SUCCESS")
                deployed["models"].append(name)
                return True
            if state == "Failed":
                # An empty or missing conditions list must not mask the failure
                conditions = model_status.get("conditions") or [{}]
                first = conditions[0] if isinstance(conditions[0], dict) else {}
                reason = first.get("reason", "Unknown")
                log(f"Model {name} failed: {reason}", "ERROR")
                return False
        time.sleep(2)

    log(f"Timeout waiting for {name}", "ERROR")
    return False

# Chatbot models to deploy; both reuse the same sample sklearn artifact
chatbot_models = [
    {
        "name": "chatbot-intent-classifier",
        "uri": "gs://seldon-models/scv2/samples/mlserver_1.5.0/iris-sklearn",
        "purpose": "Classifies user intent",
        "memory": "500Mi"
    },
    {
        "name": "chatbot-sentiment-analyzer",
        "uri": "gs://seldon-models/scv2/samples/mlserver_1.5.0/iris-sklearn",
        "purpose": "Analyzes user sentiment",
        "memory": "500Mi"
    }
]

log("Deploying chatbot models...", "INFO")
deployed_count = 0

# Deploy one model at a time and show a per-model result line
for model_info in chatbot_models:
    if not deploy_model_safe(model_info["name"], model_info["uri"], model_info["memory"]):
        display(Markdown(f"❌ **{model_info['name']}**: Failed to deploy"))
        continue
    deployed_count += 1
    display(Markdown(f"✅ **{model_info['name']}**: {model_info['purpose']}"))

# SUCCESS only when every listed model deployed
if deployed_count == len(chatbot_models):
    log(f"Successfully deployed {deployed_count}/{len(chatbot_models)} models", "SUCCESS")
else:
    log(f"Successfully deployed {deployed_count}/{len(chatbot_models)} models", "WARNING")

## 🧪 Testing Chatbot Inference

Test deployed models with comprehensive error handling:

In [None]:
def test_model_inference(model_name: str, test_data: List[List[float]], timeout: float = 10) -> Dict:
    """Send one inference request to a model and report the outcome.

    Args:
        model_name: Name of the deployed model to call.
        test_data: Rows of feature values; every row is expected to have the
            same length as the first one.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict that always has ``success``. On success it also carries
        ``latency`` (milliseconds), ``response`` (decoded JSON) and ``model``;
        on failure it carries ``error`` and, for HTTP errors, ``details``.
    """
    if not config.gateway_ip:
        return {"success": False, "error": "No gateway IP configured"}

    # Guard against empty input: the payload shape is derived from the first row
    if not test_data or not test_data[0]:
        log("No test data provided", "ERROR")
        return {"success": False, "error": "No test data provided"}

    url = f"http://{config.gateway_ip}:{config.gateway_port}/v2/models/{model_name}/infer"
    payload = {
        "inputs": [{
            "name": "predict",
            "shape": [len(test_data), len(test_data[0])],
            "datatype": "FP32",
            "data": test_data
        }]
    }
    headers = {
        "Content-Type": "application/json",
        "Seldon-Model": model_name
    }

    # A Host header is only added when the gateway is not local
    if config.gateway_ip not in ["localhost", "127.0.0.1"]:
        headers["Host"] = f"{config.namespace}.inference.seldon.test"

    log(f"Testing inference for {model_name}...", "INFO")

    try:
        start_time = time.time()
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        latency = (time.time() - start_time) * 1000

        if response.status_code == 200:
            result = response.json()
            log(f"Inference successful! Latency: {latency:.1f}ms", "SUCCESS")
            return {
                "success": True,
                "latency": latency,
                "response": result,
                "model": result.get("model_name", model_name)
            }

        log(f"Inference failed: HTTP {response.status_code}", "ERROR")
        return {
            "success": False,
            "error": f"HTTP {response.status_code}",
            "details": response.text[:200]
        }

    except requests.exceptions.Timeout:
        log("Request timeout", "ERROR")
        return {"success": False, "error": f"Timeout after {timeout}s"}

    except requests.exceptions.ConnectionError:
        log("Connection error", "ERROR")
        return {"success": False, "error": "Cannot connect to gateway"}

    except Exception as e:
        # Catch-all so one bad response cannot abort the notebook cell
        log(f"Unexpected error: {type(e).__name__}", "ERROR")
        return {"success": False, "error": str(e)}

# Test all deployed models
test_data = [[5.1, 3.5, 1.4, 0.2]]  # Iris dataset sample
inference_results = {}

for model_name in deployed["models"]:
    result = test_model_inference(model_name, test_data)
    inference_results[model_name] = result

    if result["success"]:
        display(Code(json.dumps(result["response"], indent=2), language='json'))
    else:
        display(Markdown(f"**Error**: {result['error']}"))

# Summary
successful = sum(1 for r in inference_results.values() if r["success"])
# Require at least one tested model: when nothing was tested, "0/0 successful"
# must not be reported as a success.
all_passed = bool(inference_results) and successful == len(inference_results)
log(f"Inference test results: {successful}/{len(inference_results)} successful",
    "SUCCESS" if all_passed else "WARNING")

## 📊 Monitoring Setup

Configure Prometheus queries and Grafana dashboards for chatbot monitoring:

In [None]:
# Generate monitoring configuration: PromQL queries scoped to the configured
# namespace and to models whose name starts with "chatbot".
# NOTE(review): the metric and label names below are taken as-is; confirm them
# against the metrics your Seldon installation actually exports.
monitoring_queries = {
    "Request Rate": f'rate(seldon_model_infer_total{{namespace="{config.namespace}", model_name=~"chatbot.*"}}[5m])',
    "Latency P95": f'histogram_quantile(0.95, rate(seldon_model_infer_duration_seconds_bucket{{namespace="{config.namespace}", model_name=~"chatbot.*"}}[5m]))',
    "Error Rate": f'rate(seldon_model_infer_total{{namespace="{config.namespace}", model_name=~"chatbot.*", code!="200"}}[5m])',
    "Success Rate": f'sum(rate(seldon_model_infer_total{{namespace="{config.namespace}", model_name=~"chatbot.*", code="200"}}[5m])) / sum(rate(seldon_model_infer_total{{namespace="{config.namespace}", model_name=~"chatbot.*"}}[5m])) * 100'
}

log("Prometheus Queries for Chatbot Monitoring:", "INFO")
for name, query in monitoring_queries.items():
    display(Markdown(f"**{name}:**"))
    display(Code(query, language='promql'))

# Grafana dashboard JSON
grafana_panel = {
    "dashboard": {
        "title": "Chatbot MLOps Dashboard",
        "panels": [
            {
                "title": "Request Rate",
                "targets": [{"expr": monitoring_queries["Request Rate"]}],
                "type": "graph"
            },
            {
                "title": "Latency P95",
                "targets": [{"expr": monitoring_queries["Latency P95"]}],
                "type": "graph",
                # The query is built from a *_seconds_bucket histogram, so the
                # values are seconds; labelling them "ms" would be off by 1000x.
                "yaxis": {"format": "s"}
            },
            {
                "title": "Success Rate",
                "targets": [{"expr": monitoring_queries["Success Rate"]}],
                "type": "stat",
                "format": "percent"
            }
        ]
    }
}

display(Markdown("### Grafana Dashboard Configuration:"))
display(Code(json.dumps(grafana_panel, indent=2), language='json'))

## 🛠️ Troubleshooting Guide

Common issues and solutions:

In [None]:
# Troubleshooting guide: per problem, the symptoms, one diagnosis command and
# a list of candidate fix commands.
# NOTE(review): the "Test internally" port-forward targets
# svc/seldon-webhook-service; confirm that this is the inference service.
troubleshooting_guide = {
    "Model not ready": {
        "symptoms": ["Model stuck in ModelProgressing state", "No server capacity"],
        "diagnosis": "kubectl describe model <model-name> -n seldon-mesh",
        "solutions": [
            "Check server capacity: kubectl get servers -n seldon-mesh",
            "Scale server: kubectl scale server mlserver --replicas=7 -n seldon-mesh",
            "Check model logs: kubectl logs -l model.seldon.io/name=<model-name> -n seldon-mesh"
        ]
    },
    "Pipeline not ready": {
        "symptoms": ["no dataflow engines available", "Pipeline stuck in false state"],
        "diagnosis": "kubectl logs -n seldon-mesh -l app.kubernetes.io/name=seldon-dataflow-engine",
        "solutions": [
            "Restart dataflow engine: kubectl rollout restart deployment seldon-dataflow-engine -n seldon-mesh",
            "Check Kafka: kubectl exec -n seldon-mesh seldon-kafka-0 -c kafka -- kafka-topics.sh --list --bootstrap-server localhost:9092",
            "Check scheduler connection: kubectl logs -n seldon-mesh seldon-scheduler-0 -c scheduler"
        ]
    },
    "Inference fails": {
        "symptoms": ["404 Not Found", "Connection refused", "503 Service Unavailable"],
        "diagnosis": "curl -v http://<gateway-ip>/v2/models/<model-name>/infer",
        "solutions": [
            "Check gateway: kubectl get svc istio-ingressgateway -n istio-system",
            "Check virtual service: kubectl get virtualservice -A | grep seldon",
            "Test internally: kubectl port-forward svc/seldon-webhook-service 8080:80 -n seldon-mesh"
        ]
    }
}

log("Troubleshooting Guide", "INFO")
for issue, details in troubleshooting_guide.items():
    # Heading, symptoms and the diagnosis label are plain Markdown lines
    intro_lines = (
        f"### 🔧 {issue}",
        f"**Symptoms**: {', '.join(details['symptoms'])}",
        "**Diagnosis**:",
    )
    for line in intro_lines:
        display(Markdown(line))
    display(Code(details["diagnosis"], language="bash"))

    # Each suggested fix is shown as its own bash snippet
    display(Markdown("**Solutions**:"))
    for solution in details["solutions"]:
        display(Code(solution, language="bash"))

## 🔍 K9s Monitoring Commands

Interactive monitoring with k9s:

In [None]:
# Markdown cheat sheet for k9s. Key bindings follow the k9s defaults: `d`
# describes a resource and `Ctrl+d` deletes it, which also matches the
# "Debug Failed Model" workflow further down.
k9s_guide = """
# K9s Monitoring Guide for Seldon Core 2

## Launch k9s
```bash
k9s -n seldon-mesh
```

## Key Commands

### View Resources
- `:model` - View all models
- `:server` - View all servers  
- `:pipeline` - View all pipelines
- `:experiment` - View all experiments
- `:pod` - View all pods
- `:svc` - View all services

### Resource Actions
- `Enter` - View / drill into resource
- `d` - Describe resource
- `l` - View logs
- `y` - View YAML
- `Ctrl+d` - Delete resource
- `e` - Edit resource
- `Shift+f` - Port forward

### Filtering
- `/chatbot` - Filter for chatbot resources
- `/error` - Filter for errors
- `/Running` - Filter for running pods

### Navigation
- `0` - Show all namespaces
- `Ctrl+a` - Show all resource aliases
- `Esc` - Back/Clear filter
- `:q` or `Ctrl+c` - Quit

## Useful Workflows

### Debug Failed Model
1. `:model` - List models
2. `/chatbot` - Filter chatbot models
3. Select failed model and press `d` for description
4. Press `l` to view logs

### Monitor Dataflow Engine
1. `:pod`
2. `/dataflow`
3. Select pod and press `l`
4. Press `s` to toggle autoscroll and follow new log lines

### Check Events
1. `:events`
2. Sort by time to see recent issues
"""

display(Markdown(k9s_guide))

## 🧹 Cleanup

In [None]:
def cleanup_resources():
    """Delete the chatbot resources recorded in ``deployed``.

    Experiments are removed first, then pipelines, then models. Only names
    starting with ``chatbot-`` are touched. Names that were deleted
    successfully are dropped from ``deployed`` so a second run does not try
    them again, and the final message reflects whether any deletion failed.
    """
    log("Cleaning up resources...", "INFO")

    # Clean up in reverse order
    resource_types = [
        ("experiment", "experiments"),
        ("pipeline", "pipelines"),
        ("model", "models")
    ]

    failures = 0
    for resource_type, key in resource_types:
        names = deployed.get(key, [])
        # Iterate over a copy because deleted names are removed from the list
        for item in list(names):
            if not item.startswith("chatbot-"):  # Only clean up chatbot resources
                continue
            result = run_cmd(f"kubectl delete {resource_type} {item} -n {config.namespace} --ignore-not-found=true")
            if result.returncode == 0:
                log(f"Deleted {resource_type}: {item}", "SUCCESS")
                names.remove(item)
            else:
                log(f"Failed to delete {resource_type}: {item}", "WARNING")
                failures += 1

    if failures:
        log(f"Cleanup finished with {failures} failed deletion(s)", "WARNING")
    else:
        log("Cleanup complete!", "SUCCESS")

# Cleanup widget: show a button that runs cleanup_resources(); when ipywidgets
# is not installed, print manual kubectl commands instead.
try:
    import ipywidgets as widgets
    from IPython.display import display

    cleanup_button = widgets.Button(
        description="Clean Up Chatbot Resources",
        button_style='danger',
        icon='trash'
    )

    output = widgets.Output()

    def on_cleanup_click(b):
        """Run the cleanup and show its log inside the output widget."""
        with output:
            output.clear_output()
            cleanup_resources()

    cleanup_button.on_click(on_cleanup_click)

    display(Markdown("### 🧹 Resource Cleanup"))
    display(Markdown("Click the button below to clean up chatbot resources:"))
    display(cleanup_button)
    display(output)

except ImportError:
    # The models are created without labels, so they are selected by name
    # prefix rather than with a label selector.
    display(Markdown("""
### 🧹 Manual Cleanup

Run these commands to clean up chatbot resources:

```bash
# Delete every model whose name starts with chatbot-
kubectl get models -n seldon-mesh -o name | grep '/chatbot-' | xargs kubectl delete -n seldon-mesh

# Or delete specific models
kubectl delete model chatbot-intent-classifier chatbot-sentiment-analyzer -n seldon-mesh
```
"""))

## 📚 Additional Resources

- [Seldon Core 2 Documentation](https://docs.seldon.ai/seldon-core-2)
- [Kubernetes Concepts](https://kubernetes.io/docs/concepts/)
- [Prometheus Monitoring](https://prometheus.io/docs/)
- [Grafana Dashboards](https://grafana.com/docs/)

## 🎉 Summary

This enhanced notebook provides:
- ✅ Comprehensive prerequisite validation
- ✅ Infrastructure diagnostics before deployment
- ✅ Safe model deployment with capacity checking
- ✅ Robust error handling and troubleshooting
- ✅ Production-ready monitoring setup
- ✅ K9s integration for interactive debugging

The notebook is designed to work with existing Seldon Core 2 installations and handles common issues gracefully.