# Simple Protocol Testing

Get 100 samples from HuggingFace dataset and test them on adaptive_ai `/predict` endpoint.

In [13]:
# Simple Protocol Testing - 100 samples from HuggingFace to /predict
import requests
import time
import pandas as pd
from datasets import load_dataset

print("🚀 Simple Protocol Testing")
print("📊 Getting 100 samples from HuggingFace and testing on /predict")

# Configuration
SERVICE_URL = "http://localhost:8000"
SAMPLE_COUNT = 100

# Check service is running
try:
    response = requests.get(f"{SERVICE_URL}/health", timeout=5)
    if response.status_code == 200:
        print("✅ Service is running")
    else:
        print(f"❌ Service error: {response.status_code}")
        exit()
except Exception as e:
    print(f"❌ Service not available: {e}")
    print("💡 Start service: cd adaptive_ai && uv run python -m adaptive_ai.main")
    exit()

# Load dataset and get samples
print(f"\n📥 Loading {SAMPLE_COUNT} samples from HuggingFace...")
dataset = load_dataset("routellm/gpt4_dataset", split="validation", streaming=True)

prompts = []
for i, item in enumerate(dataset):
    if i >= SAMPLE_COUNT:
        break
    if item.get("prompt"):
        prompts.append(item["prompt"])

    if (i + 1) % 25 == 0:
        print(f"   📊 Loaded {i + 1}/{SAMPLE_COUNT} samples...")

print(f"✅ Got {len(prompts)} prompts")

# Test each prompt on /predict
print(f"\n🧪 Testing {len(prompts)} prompts on {SERVICE_URL}/predict...")
results = []

for i, prompt in enumerate(prompts):
    request_data = {"messages": [{"role": "user", "content": prompt}]}

    start_time = time.time()
    try:
        response = requests.post(
            f"{SERVICE_URL}/predict", json=request_data, timeout=30
        )

        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()
            protocol = result.get("protocol", "unknown")

            # Extract model info
            model = "unknown"
            provider = "unknown"

            if protocol == "minion" and "minion" in result:
                model = result["minion"].get("model", "unknown")
                provider = result["minion"].get("provider", "unknown")
            elif protocol == "standard" and "standard" in result:
                model = result["standard"].get("model", "unknown")
                provider = result["standard"].get("provider", "unknown")

            results.append(
                {
                    "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
                    "success": True,
                    "protocol": protocol,
                    "model": model,
                    "provider": provider,
                    "response_time": execution_time,
                }
            )
        else:
            results.append(
                {
                    "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
                    "success": False,
                    "protocol": "failed",
                    "model": "failed",
                    "provider": "failed",
                    "response_time": execution_time,
                    "error": f"HTTP {response.status_code}",
                }
            )

    except Exception as e:
        execution_time = time.time() - start_time
        results.append(
            {
                "prompt": prompt[:100] + "..." if len(prompt) > 100 else prompt,
                "success": False,
                "protocol": "error",
                "model": "error",
                "provider": "error",
                "response_time": execution_time,
                "error": str(e),
            }
        )

    # Progress update
    if (i + 1) % 20 == 0:
        successful = sum(1 for r in results if r["success"])
        print(
            f"   📊 Tested {i + 1}/{len(prompts)} - Success: {successful}/{i + 1} ({successful/(i+1)*100:.1f}%)"
        )

print("\n✅ Testing completed!")

# Results summary
df = pd.DataFrame(results)
successful = df[df["success"]]
failed = df[~df["success"]]

print(f"\n📊 RESULTS SUMMARY")
print(
    f"✅ Successful: {len(successful)}/{len(df)} ({len(successful)/len(df)*100:.1f}%)"
)
print(f"❌ Failed: {len(failed)}/{len(df)} ({len(failed)/len(df)*100:.1f}%)")

if len(successful) > 0:
    avg_time = successful["response_time"].mean()
    print(f"⏱️ Average response time: {avg_time:.3f}s")

    # Protocol usage
    protocol_counts = successful["protocol"].value_counts()
    print(f"\n🔄 Protocol Usage:")
    for protocol, count in protocol_counts.items():
        percentage = count / len(successful) * 100
        print(f"   {protocol}: {count} ({percentage:.1f}%)")

    # Model usage
    model_counts = successful["model"].value_counts()
    print(f"\n🤖 Top Models:")
    for model, count in model_counts.head(5).items():
        percentage = count / len(successful) * 100
        print(f"   {model}: {count} ({percentage:.1f}%)")

    # Provider usage
    provider_counts = successful["provider"].value_counts()
    print(f"\n🏢 Provider Usage:")
    for provider, count in provider_counts.items():
        percentage = count / len(successful) * 100
        print(f"   {provider}: {count} ({percentage:.1f}%)")

# Show some failures if any
if len(failed) > 0:
    print(f"\n❌ Sample Failures:")
    for i, row in failed.head(3).iterrows():
        print(f"   {i+1}. {row['error']} - {row['prompt'][:50]}...")

print(f"\n🎉 Protocol testing completed!")
print(f"📈 Dataset: routellm/gpt4_dataset (first {SAMPLE_COUNT} samples)")
print(f"🎯 Service: {SERVICE_URL}/predict")
print(f"🗃️ Results stored in 'df' DataFrame for further analysis")

🚀 Simple Protocol Testing
📊 Getting 100 samples from HuggingFace and testing on /predict
✅ Service is running

📥 Loading 100 samples from HuggingFace...
   📊 Loaded 25/100 samples...
   📊 Loaded 50/100 samples...
   📊 Loaded 75/100 samples...
   📊 Loaded 100/100 samples...
✅ Got 100 prompts

🧪 Testing 100 prompts on http://localhost:8000/predict...
   📊 Tested 20/100 - Success: 6/20 (30.0%)
   📊 Tested 40/100 - Success: 14/40 (35.0%)
   📊 Tested 60/100 - Success: 22/60 (36.7%)
   📊 Tested 80/100 - Success: 26/80 (32.5%)
   📊 Tested 100/100 - Success: 31/100 (31.0%)

✅ Testing completed!

📊 RESULTS SUMMARY
✅ Successful: 31/100 (31.0%)
❌ Failed: 69/100 (69.0%)
⏱️ Average response time: 0.185s

🔄 Protocol Usage:
   minion: 26 (83.9%)
   standard_llm: 5 (16.1%)

🤖 Top Models:
   microsoft/DialoGPT-medium: 15 (48.4%)
   microsoft/codebert-base: 6 (19.4%)
   unknown: 5 (16.1%)
   allenai/scibert_scivocab_uncased: 3 (9.7%)
   gpt2-medium: 2 (6.5%)

🏢 Provider Usage:
   groq: 26 (83.9%)
   unk

In [1]:
# DEBUG VERSION - Find HTTP 500 causes
import requests
import time
import pandas as pd
from datasets import load_dataset
import json

print("🔍 DEBUG MODE - Finding HTTP 500 causes")
print("📊 Testing with detailed error reporting")

# Configuration
SERVICE_URL = "http://localhost:8000"
SAMPLE_COUNT = 10  # Small sample for debugging

# Check service is running
try:
    response = requests.get(f"{SERVICE_URL}/health", timeout=5)
    if response.status_code == 200:
        print("✅ Service is running")
        print(f"📋 Health response: {response.text}")
    else:
        print(f"❌ Service error: {response.status_code}")
        print(f"📋 Response: {response.text}")
        exit()
except Exception as e:
    print(f"❌ Service not available: {e}")
    print("💡 Start service: cd adaptive_ai && uv run python -m adaptive_ai.main")
    exit()

# Load dataset and get samples
print(f"\n📥 Loading {SAMPLE_COUNT} samples from HuggingFace...")
dataset = load_dataset("routellm/gpt4_dataset", split="validation", streaming=True)

prompts = []
for i, item in enumerate(dataset):
    if i >= SAMPLE_COUNT:
        break
    if item.get("prompt"):
        prompts.append(item["prompt"])

print(f"✅ Got {len(prompts)} prompts")

# Test each prompt with detailed debugging
print(f"\n🔍 DEBUGGING {len(prompts)} prompts with detailed error analysis...")
results = []

for i, prompt in enumerate(prompts):
    print(f"\n🧪 Testing prompt {i+1}/{len(prompts)}:")
    print(f"📝 Prompt length: {len(prompt)} chars")
    print(f"📝 Prompt preview: {prompt[:80]}...")

    # Check for potential problematic characters
    has_unicode = any(ord(char) > 127 for char in prompt)
    has_newlines = "\n" in prompt
    has_quotes = '"' in prompt or "'" in prompt

    print(
        f"📊 Prompt analysis: Unicode={has_unicode}, Newlines={has_newlines}, Quotes={has_quotes}"
    )

    request_data = {"messages": [{"role": "user", "content": prompt}]}

    print(f"📡 Request size: {len(json.dumps(request_data))} bytes")

    start_time = time.time()
    try:
        response = requests.post(
            f"{SERVICE_URL}/predict",
            json=request_data,
            timeout=30,
            headers={"Content-Type": "application/json"},
        )

        execution_time = time.time() - start_time
        print(f"⏱️ Response time: {execution_time:.3f}s")
        print(f"📊 Status code: {response.status_code}")

        if response.status_code == 200:
            try:
                result = response.json()
                protocol = result.get("protocol", "unknown")
                print(f"✅ SUCCESS - Protocol: {protocol}")

                # Extract model info
                model = "unknown"
                provider = "unknown"

                if protocol == "minion" and "minion" in result:
                    model = result["minion"].get("model", "unknown")
                    provider = result["minion"].get("provider", "unknown")
                elif protocol == "standard" and "standard" in result:
                    model = result["standard"].get("model", "unknown")
                    provider = result["standard"].get("provider", "unknown")

                print(f"🤖 Model: {model}")
                print(f"🏢 Provider: {provider}")

                results.append(
                    {
                        "prompt_id": i + 1,
                        "success": True,
                        "protocol": protocol,
                        "model": model,
                        "provider": provider,
                        "response_time": execution_time,
                        "status_code": response.status_code,
                        "prompt_length": len(prompt),
                        "has_unicode": has_unicode,
                        "has_newlines": has_newlines,
                        "has_quotes": has_quotes,
                    }
                )
            except json.JSONDecodeError as e:
                print(f"❌ JSON decode error: {e}")
                print(f"📋 Raw response: {response.text[:200]}...")
                results.append(
                    {
                        "prompt_id": i + 1,
                        "success": False,
                        "error": f"JSON decode error: {e}",
                        "status_code": response.status_code,
                        "response_time": execution_time,
                        "prompt_length": len(prompt),
                        "raw_response": response.text[:500],
                    }
                )
        else:
            print(f"❌ HTTP ERROR {response.status_code}")
            print(f"📋 Error response: {response.text[:200]}...")

            # Try to get more details from the error response
            error_details = "Unknown error"
            try:
                error_json = response.json()
                error_details = str(error_json)
            except:
                error_details = response.text[:200]

            results.append(
                {
                    "prompt_id": i + 1,
                    "success": False,
                    "error": f"HTTP {response.status_code}",
                    "error_details": error_details,
                    "status_code": response.status_code,
                    "response_time": execution_time,
                    "prompt_length": len(prompt),
                    "has_unicode": has_unicode,
                    "has_newlines": has_newlines,
                    "has_quotes": has_quotes,
                }
            )

    except Exception as e:
        execution_time = time.time() - start_time
        print(f"❌ EXCEPTION: {e}")
        results.append(
            {
                "prompt_id": i + 1,
                "success": False,
                "error": f"Exception: {e}",
                "response_time": execution_time,
                "prompt_length": len(prompt),
            }
        )

    print(f"{'='*60}")

print("\n🔍 DEBUGGING COMPLETED!")

# Detailed analysis
df_debug = pd.DataFrame(results)
successful = df_debug[df_debug["success"]]
failed = df_debug[~df_debug["success"]]

print(f"\n📊 DETAILED DEBUG RESULTS")
print(
    f"✅ Successful: {len(successful)}/{len(df_debug)} ({len(successful)/len(df_debug)*100:.1f}%)"
)
print(
    f"❌ Failed: {len(failed)}/{len(df_debug)} ({len(failed)/len(df_debug)*100:.1f}%)"
)

if len(failed) > 0:
    print(f"\n❌ FIRST 3 FAILURES WITH DETAILS:")
    for i, row in failed.head(3).iterrows():
        print(f"\n   Failure {row['prompt_id']}:")
        print(f"   📊 Length: {row['prompt_length']} chars")
        print(f"   🔢 Status: {row['status_code']}")
        print(f"   ❌ Error: {row['error']}")
        if "error_details" in row and pd.notna(row["error_details"]):
            print(f"   📋 Details: {str(row['error_details'])[:150]}...")

print(f"\n🎯 DEBUG COMPLETE - Check server logs for more details!")
print(f"🗃️ Debug data available in 'df_debug' DataFrame")

🔍 DEBUG MODE - Finding HTTP 500 causes
📊 Testing with detailed error reporting
✅ Service is running
📋 Health response: ok

📥 Loading 10 samples from HuggingFace...
✅ Got 10 prompts

🔍 DEBUGGING 10 prompts with detailed error analysis...

🧪 Testing prompt 1/10:
📝 Prompt length: 60 chars
📝 Prompt preview: Write c++ code, which calculates and outputs n digits of pi....
📊 Prompt analysis: Unicode=False, Newlines=False, Quotes=False
📡 Request size: 107 bytes
⏱️ Response time: 0.670s
📊 Status code: 200
✅ SUCCESS - Protocol: standard_llm
🤖 Model: unknown
🏢 Provider: unknown

🧪 Testing prompt 2/10:
📝 Prompt length: 957 chars
📝 Prompt preview: [Partner Cooperation Team] Share webinar schedule for Cafe24 employees
hello. Th...
📊 Prompt analysis: Unicode=False, Newlines=True, Quotes=True
📡 Request size: 1017 bytes
⏱️ Response time: 0.629s
📊 Status code: 200
✅ SUCCESS - Protocol: minion
🤖 Model: gpt2-medium
🏢 Provider: groq

🧪 Testing prompt 3/10:
📝 Prompt length: 104 chars
📝 Prompt preview: Write

In [4]:
# Test Different Prompt Types - Benchmark-Optimized Rankings
import requests
import json
import time

print("🧪 Testing Benchmark-Optimized Task Rankings")
print("📊 Testing different task types to see model selection")

# Test endpoint
url = "http://localhost:8000/predict"

# Test prompts for different task types
test_prompts = [
    {
        "name": "BRAINSTORMING",
        "prompt": "Generate 5 creative ideas for a sustainable transportation startup that could revolutionize urban mobility",
        "expected_task": "BRAINSTORMING",
    },
    {
        "name": "OPEN_QA",
        "prompt": "What are the main causes of climate change and how do they interact with each other?",
        "expected_task": "OPEN_QA",
    },
    {
        "name": "CODE_GENERATION",
        "prompt": "Write a Python function that implements a binary search algorithm with proper error handling",
        "expected_task": "CODE_GENERATION",
    },
    {
        "name": "CLOSED_QA",
        "prompt": "Based on the context of renewable energy, what is the efficiency of modern solar panels?",
        "expected_task": "CLOSED_QA",
    },
    {
        "name": "TEXT_GENERATION",
        "prompt": "Write a compelling product description for a smart home device that learns user preferences",
        "expected_task": "TEXT_GENERATION",
    },
    {
        "name": "CLASSIFICATION",
        "prompt": "Classify this email as spam or not spam: 'Congratulations! You've won $1000000! Click here to claim now!'",
        "expected_task": "CLASSIFICATION",
    },
    {
        "name": "SUMMARIZATION",
        "prompt": "Summarize the key points from this article about artificial intelligence developments in 2024: AI has made significant advances in reasoning capabilities, with models like GPT-4o and Claude showing improved performance on complex tasks.",
        "expected_task": "SUMMARIZATION",
    },
]


def test_prompt(prompt_data):
    """Test a single prompt and return results"""
    payload = {"messages": [{"role": "user", "content": prompt_data["prompt"]}]}

    try:
        response = requests.post(url, json=payload, timeout=30)
        if response.status_code == 200:
            result = response.json()
            return {
                "status": "success",
                "prompt_name": prompt_data["name"],
                "expected_task": prompt_data["expected_task"],
                "actual_task": (
                    result.get("task_classification", {}).get(
                        "task_type_1", ["unknown"]
                    )[0]
                    if result.get("task_classification")
                    else "unknown"
                ),
                "protocol": result.get("protocol", "unknown"),
                "provider": result.get("provider", "unknown"),
                "model": result.get("model", "unknown"),
                "domain": (
                    result.get("domain_classification", {}).get("domain", "unknown")
                    if result.get("domain_classification")
                    else "unknown"
                ),
                "full_response": result,
            }
        else:
            return {
                "status": "error",
                "prompt_name": prompt_data["name"],
                "error": f"HTTP {response.status_code}: {response.text}",
            }
    except Exception as e:
        return {"status": "error", "prompt_name": prompt_data["name"], "error": str(e)}


print("=" * 70)

results = []
for i, prompt_data in enumerate(test_prompts, 1):
    print(f"\n[{i}/{len(test_prompts)}] Testing {prompt_data['name']}...")
    print(f"Prompt: {prompt_data['prompt'][:80]}...")

    result = test_prompt(prompt_data)
    results.append(result)

    if result["status"] == "success":
        print(
            f"✅ Task: {result['actual_task']} | Protocol: {result['protocol']} | Model: {result['provider']}:{result['model']}"
        )
        if result["expected_task"] != result["actual_task"]:
            print(f"⚠️  Expected {result['expected_task']}, got {result['actual_task']}")
        print(f"🌍 Domain: {result['domain']}")
    else:
        print(f"❌ Error: {result['error']}")

print("\n" + "=" * 70)
print("📊 BENCHMARK-OPTIMIZED RANKING RESULTS")
print("=" * 70)

successful_tests = [r for r in results if r["status"] == "success"]

if successful_tests:
    print(f"✅ Successful tests: {len(successful_tests)}/{len(test_prompts)}")

    # Task classification accuracy
    correct_classifications = sum(
        1 for r in successful_tests if r["expected_task"] == r["actual_task"]
    )
    print(
        f"🎯 Task classification accuracy: {correct_classifications}/{len(successful_tests)} ({correct_classifications/len(successful_tests)*100:.1f}%)"
    )

    # Model selection distribution
    model_usage = {}
    protocol_usage = {}
    task_model_combinations = {}

    for r in successful_tests:
        model_key = f"{r['provider']}:{r['model']}"
        model_usage[model_key] = model_usage.get(model_key, 0) + 1
        protocol_usage[r["protocol"]] = protocol_usage.get(r["protocol"], 0) + 1

        # Track task-model combinations to verify benchmark optimization
        task_model_key = f"{r['actual_task']} → {model_key}"
        task_model_combinations[task_model_key] = (
            task_model_combinations.get(task_model_key, 0) + 1
        )

    print(f"\n📈 Protocol usage:")
    for protocol, count in protocol_usage.items():
        print(f"  {protocol}: {count}")

    print(f"\n🤖 Model usage:")
    for model, count in sorted(model_usage.items(), key=lambda x: x[1], reverse=True):
        print(f"  {model}: {count}")

    print(f"\n🎯 Task → Model assignments (verifying benchmark optimization):")
    for task_model, count in sorted(task_model_combinations.items()):
        print(f"  {task_model}: {count}")

    # Show specific benchmark-optimized selections
    print(f"\n🏆 Benchmark optimization verification:")
    for r in successful_tests:
        task = r["actual_task"]
        model = r["model"]
        if task == "BRAINSTORMING" and ("o1" in model or "reasoner" in model):
            print(f"  ✅ {task}: Using reasoning model {model} (benchmark-optimized)")
        elif task == "OPEN_QA" and "grok" in model.lower():
            print(f"  ✅ {task}: Using Grok {model} (92.7% MMLU leader)")
        elif task == "CODE_GENERATION" and "deepseek" in model.lower():
            print(f"  ✅ {task}: Using DeepSeek {model} (HumanEval leader)")
        elif task in ["CLASSIFICATION", "SUMMARIZATION"] and "mini" in model.lower():
            print(f"  ✅ {task}: Using efficient model {model} (cost-optimized)")
else:
    print("❌ No successful tests")

# Show any errors
error_tests = [r for r in results if r["status"] == "error"]
if error_tests:
    print(f"\n❌ Failed tests: {len(error_tests)}")
    for r in error_tests:
        print(f"  {r['prompt_name']}: {r['error']}")

print(f"\n🎉 Benchmark-optimized ranking test completed!")
print(
    f"📈 Rankings updated based on MMLU-Pro, HumanEval, GSM8K, GPQA, TruthfulQA benchmarks"
)
print(f"🗃️ Results stored in 'results' list for further analysis")

🧪 Testing Benchmark-Optimized Task Rankings
📊 Testing different task types to see model selection

[1/7] Testing BRAINSTORMING...
Prompt: Generate 5 creative ideas for a sustainable transportation startup that could re...
❌ Error: HTTP 500: {"detail":"Internal server error"}

[2/7] Testing OPEN_QA...
Prompt: What are the main causes of climate change and how do they interact with each ot...
✅ Task: unknown | Protocol: minion | Model: unknown:unknown
⚠️  Expected OPEN_QA, got unknown
🌍 Domain: unknown

[3/7] Testing CODE_GENERATION...
Prompt: Write a Python function that implements a binary search algorithm with proper er...
✅ Task: unknown | Protocol: minion | Model: unknown:unknown
⚠️  Expected CODE_GENERATION, got unknown
🌍 Domain: unknown

[4/7] Testing CLOSED_QA...
Prompt: Based on the context of renewable energy, what is the efficiency of modern solar...
✅ Task: unknown | Protocol: minion | Model: unknown:unknown
⚠️  Expected CLOSED_QA, got unknown
🌍 Domain: unknown

[5/7] Testing