# Azure Backend API Testing

This notebook allows direct testing of the Azure backend API endpoints.

## MMLU Testing Cell

In [None]:
# MMLU Complete Dataset Benchmarking with Adaptive Backend
import random
import time
from datetime import datetime

import pandas as pd
import requests
from datasets import load_dataset

# Configuration
BASE_URL = "https://backend-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"


def run_complete_mmlu_benchmark():
    """
    Run MMLU benchmark against the adaptive backend on the COMPLETE dataset
    Tests ALL 59 subjects with ALL questions (approximately 14,000+ questions)
    
    WARNING: This will take several hours to complete!
    """
    print("=== COMPLETE MMLU DATASET BENCHMARKING ===")
    print("🚨 WARNING: This will test ~14,000 questions across 59 subjects!")
    print("⏰ Estimated time: 3-6 hours depending on response times")
    print("💰 Cost: Significant API usage costs\n")

    # Confirmation
    print("⚡ Starting in 10 seconds... (stop the cell if you want to cancel)")
    time.sleep(10)

    # Load MMLU dataset
    print("📚 Loading complete MMLU dataset...")
    try:
        dataset = load_dataset("cais/mmlu", "all")
        test_data = dataset["test"]
        print(f"✅ Loaded {len(test_data)} total test questions")
    except Exception as e:
        print(f"❌ Failed to load dataset: {e}")
        return

    # Get all subjects
    all_subjects = list(set(test_data["subject"]))
    all_subjects.sort()  # Sort for consistent order
    print(f"📖 Testing ALL {len(all_subjects)} subjects")
    print(f"📋 Subjects: {', '.join(all_subjects[:10])}{'...' if len(all_subjects) > 10 else ''}")

    results = []
    total_questions = 0
    correct_answers = 0
    start_time = datetime.now()

    headers = {"Content-Type": "application/json"}
    if API_KEY != "your-api-key":
        headers["X-Stainless-API-Key"] = API_KEY

    # Process each subject
    for subject_idx, subject in enumerate(all_subjects, 1):
        print(f"\n🔬 [{subject_idx}/{len(all_subjects)}] Testing subject: {subject}")

        # Filter questions for this subject
        subject_questions = [q for q in test_data if q["subject"] == subject]
        subject_total = len(subject_questions)
        subject_correct = 0

        print(f"   📊 {subject_total} questions in this subject")

        for i, question in enumerate(subject_questions):
            if (i + 1) % 10 == 0 or i == 0:  # Progress indicator
                print(f"   Question {i+1}/{subject_total}...", end=" ")

            # Format question for the API
            choices = [question["choices"][j] for j in range(len(question["choices"]))]
            question_text = f"""Question: {question['question']}
            
A) {choices[0]}
B) {choices[1]}
C) {choices[2]}
D) {choices[3]}

Please answer with only the letter (A, B, C, or D)."""

            # Prepare API request with varied parameters for better testing
            chat_data = {
                "messages": [{"role": "user", "content": question_text}],
                "max_tokens": 10,
                "temperature": 0.1,  # Low temperature for consistent answers
                "provider_constraint": ["openai", "deepseek", "anthropic"],  # Multiple providers
                "cost_bias": random.uniform(0.2, 0.8)  # Vary cost bias to test routing
            }

            try:
                request_start_time = time.time()
                response = requests.post(
                    f"{BASE_URL}/v1/chat/completions",
                    json=chat_data,
                    headers=headers,
                    timeout=60  # Longer timeout for complete test
                )
                response_time = time.time() - request_start_time

                if response.status_code == 200:
                    result = response.json()
                    ai_answer = result["choices"][0]["message"]["content"].strip()

                    # Extract adaptive backend selection info
                    selected_model = result.get("model", "unknown")
                    selected_provider = result.get("provider", "unknown")

                    # Extract letter from AI response
                    ai_letter = None
                    for char in ai_answer.upper():
                        if char in ['A', 'B', 'C', 'D']:
                            ai_letter = char
                            break

                    # Convert correct answer index to letter
                    correct_letter = ['A', 'B', 'C', 'D'][question["answer"]]

                    is_correct = ai_letter == correct_letter
                    if is_correct:
                        subject_correct += 1
                        correct_answers += 1

                    # Store result with detailed backend selection info
                    results.append({
                        "subject": subject,
                        "question_id": f"{subject}_{i}",
                        "question": question["question"][:200] + "...",  # Longer preview
                        "correct_answer": correct_letter,
                        "ai_answer": ai_letter,
                        "ai_response_full": ai_answer,
                        "is_correct": is_correct,
                        "response_time": response_time,
                        "selected_model": selected_model,
                        "selected_provider": selected_provider,
                        "model_provider_combo": f"{selected_provider}/{selected_model}",
                        "completion_tokens": result.get("usage", {}).get("completion_tokens", 0),
                        "prompt_tokens": result.get("usage", {}).get("prompt_tokens", 0),
                        "total_tokens": result.get("usage", {}).get("total_tokens", 0),
                        "cost_bias_used": chat_data["cost_bias"],
                        "timestamp": datetime.now().isoformat()
                    })

                    if (i + 1) % 10 == 0:
                        accuracy_so_far = (subject_correct / (i + 1)) * 100
                        print(f"✅ Accuracy: {accuracy_so_far:.1f}%")

                else:
                    print(f"❌ API Error: {response.status_code}")

            except Exception as e:
                print(f"❌ Request failed: {str(e)}")

            total_questions += 1

            # Save progress every 100 questions
            if total_questions % 100 == 0:
                temp_df = pd.DataFrame(results)
                temp_filename = f"mmlu_progress_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
                temp_df.to_csv(temp_filename, index=False)

                elapsed = datetime.now() - start_time
                rate = total_questions / elapsed.total_seconds() * 60  # questions per minute
                remaining = (len(test_data) - total_questions) / rate if rate > 0 else 0

                print(f"\n💾 Progress saved: {total_questions}/{len(test_data)} questions")
                print(f"⏱️  Rate: {rate:.1f} questions/min | ETA: {remaining:.0f} minutes")

            # Rate limiting
            time.sleep(0.05)  # Small delay to avoid overwhelming the API

        subject_accuracy = (subject_correct / subject_total) * 100
        print(f"   📊 {subject}: {subject_correct}/{subject_total} ({subject_accuracy:.1f}%)")

        # Save intermediate results after each subject
        if results:
            intermediate_df = pd.DataFrame(results)
            intermediate_filename = f"mmlu_intermediate_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            intermediate_df.to_csv(intermediate_filename, index=False)

    # Calculate final results
    overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
    total_time = datetime.now() - start_time

    print("\n🎯 FINAL COMPLETE DATASET RESULTS:")
    print(f"📊 Overall Accuracy: {correct_answers}/{total_questions} ({overall_accuracy:.1f}%)")
    print(f"⏱️  Total Time: {total_time}")
    print(f"⚡ Average Response Time: {sum(r['response_time'] for r in results) / len(results):.2f}s")

    # Create comprehensive results DataFrame
    df = pd.DataFrame(results)

    if len(df) > 0:
        print("\n📈 COMPREHENSIVE PERFORMANCE ANALYSIS:")

        # Subject-wise performance
        subject_stats = df.groupby('subject').agg({
            'is_correct': ['count', 'sum', 'mean'],
            'response_time': 'mean',
            'selected_model': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'selected_provider': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'total_tokens': 'sum'
        }).round(3)

        subject_stats.columns = ['Questions', 'Correct', 'Accuracy', 'Avg_Time', 'Most_Used_Model', 'Most_Used_Provider', 'Total_Tokens']

        # Show top and bottom performing subjects
        subject_stats_sorted = subject_stats.sort_values('Accuracy', ascending=False)
        print("\n🏆 TOP 10 PERFORMING SUBJECTS:")
        print(subject_stats_sorted.head(10)[['Questions', 'Correct', 'Accuracy', 'Most_Used_Model']])

        print("\n📉 BOTTOM 10 PERFORMING SUBJECTS:")
        print(subject_stats_sorted.tail(10)[['Questions', 'Correct', 'Accuracy', 'Most_Used_Model']])

        print("\n🤖 COMPLETE ADAPTIVE BACKEND MODEL SELECTION ANALYSIS:")
        model_provider_usage = df['model_provider_combo'].value_counts()
        for combo, count in model_provider_usage.items():
            accuracy = df[df['model_provider_combo'] == combo]['is_correct'].mean() * 100
            avg_time = df[df['model_provider_combo'] == combo]['response_time'].mean()
            print(f"  {combo}: {count:,} questions ({count/len(df)*100:.1f}%) - Accuracy: {accuracy:.1f}% - Avg Time: {avg_time:.2f}s")

        print("\n💰 COMPLETE TOKEN USAGE ANALYSIS:")
        total_tokens_used = df['total_tokens'].sum()
        avg_tokens_per_question = df['total_tokens'].mean()
        print(f"  Total tokens consumed: {total_tokens_used:,}")
        print(f"  Average tokens per question: {avg_tokens_per_question:.1f}")
        print(f"  Estimated API cost (rough): ${total_tokens_used * 0.000002:.2f}")  # Rough estimate

        provider_token_usage = df.groupby('selected_provider')['total_tokens'].agg(['sum', 'mean', 'count']).round(1)
        print("\n📊 TOKEN USAGE BY PROVIDER:")
        for provider in provider_token_usage.index:
            total = provider_token_usage.loc[provider, 'sum']
            avg = provider_token_usage.loc[provider, 'mean']
            count = provider_token_usage.loc[provider, 'count']
            print(f"  {provider}: {total:,} total tokens ({avg:.1f} avg) across {count:,} questions")

        # Cost bias analysis
        print("\n💸 COST BIAS EFFECTIVENESS ANALYSIS:")
        df['cost_bias_bin'] = pd.cut(df['cost_bias_used'], bins=[0, 0.3, 0.7, 1.0], labels=['Low (0-0.3)', 'Med (0.3-0.7)', 'High (0.7-1.0)'])
        cost_bias_analysis = df.groupby('cost_bias_bin').agg({
            'selected_model': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'selected_provider': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'is_correct': 'mean',
            'total_tokens': 'mean'
        }).round(3)
        print(cost_bias_analysis)

        # Save final comprehensive results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"mmlu_complete_benchmark_results_{timestamp}.csv"
        df.to_csv(filename, index=False)

        # Save summary statistics
        summary_filename = f"mmlu_complete_summary_{timestamp}.csv"
        subject_stats.to_csv(summary_filename)

        print("\n💾 COMPLETE RESULTS SAVED:")
        print(f"  📋 Detailed results: {filename}")
        print(f"  📊 Subject summary: {summary_filename}")

        print("\n🔄 ADAPTIVE ROUTING EFFECTIVENESS (COMPLETE ANALYSIS):")
        print(f"  Total questions tested: {len(df):,}")
        print(f"  Subjects covered: {len(df['subject'].unique())}")
        print(f"  Unique model/provider combinations used: {len(model_provider_usage)}")
        print(f"  Most frequently selected: {model_provider_usage.index[0] if len(model_provider_usage) > 0 else 'N/A'}")
        print(f"  Routing diversity: {len(model_provider_usage)} different combinations")

        # Performance by subject category analysis
        print("\n📚 SUBJECT CATEGORY ANALYSIS:")
        # Group subjects by domain (rough categorization)
        science_subjects = [s for s in df['subject'].unique() if any(term in s.lower() for term in ['physics', 'chemistry', 'biology', 'astronomy', 'anatomy'])]
        math_subjects = [s for s in df['subject'].unique() if any(term in s.lower() for term in ['math', 'statistics', 'calculus', 'algebra', 'geometry'])]
        humanities_subjects = [s for s in df['subject'].unique() if any(term in s.lower() for term in ['history', 'philosophy', 'literature', 'religion'])]

        for category, subjects in [('Science', science_subjects), ('Mathematics', math_subjects), ('Humanities', humanities_subjects)]:
            if subjects:
                category_df = df[df['subject'].isin(subjects)]
                if len(category_df) > 0:
                    category_accuracy = category_df['is_correct'].mean() * 100
                    print(f"  {category}: {category_accuracy:.1f}% accuracy across {len(subjects)} subjects")

    print("\n✨ COMPLETE MMLU BENCHMARK FINISHED!")
    print(f"🏁 Total time: {total_time}")
    print(f"📊 Overall performance: {overall_accuracy:.1f}% accuracy")
    return df

# Run the complete benchmark
print("🚀 Starting COMPLETE MMLU dataset benchmark...")
print("📋 This will test your adaptive backend against ALL ~14,000 academic questions")
print("🔍 Comprehensive analysis of model selection across all 59 subjects")
print("⚠️  WARNING: This will take several hours and consume significant API credits!\n")

# Run the complete benchmark
complete_benchmark_results = run_complete_mmlu_benchmark()

print("\n🎉 COMPLETE BENCHMARK ANALYSIS FINISHED!")

In [56]:

BASE_URL = "https://backend-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"
PYTHON_SERVICE_URL = "https://prompt-classifer-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"

def test_service(url, endpoint, data, service_name):
    """Test a service and return the result"""
    full_url = f"{url.rstrip('/')}/{endpoint.lstrip('/')}"
    headers = {"Content-Type": "application/json"}

    print(f"\n🔍 Testing {service_name}: {endpoint}")
    start_time = time.time()

    try:
        response = requests.post(full_url, headers=headers, json=data, timeout=15)
        response_time = time.time() - start_time

        print(f"✅ Status: {response.status_code} | Time: {response_time:.2f}s")
        result = response.json()

        # Extract key info
        if service_name == "Python AI Service":
            protocol = result.get("protocol", "unknown")
            model_info = result.get("minion", {}).get("model") or result.get("standard", {}).get("model")
            provider_info = result.get("minion", {}).get("provider") or result.get("standard", {}).get("provider")
            print(f"📄 Protocol: {protocol} | Model: {provider_info}/{model_info}")
        else:  # Go Backend
            model = result.get("model", "unknown")
            provider = result.get("provider", "unknown")
            print(f"📄 Final Selection: {provider}/{model}")

        return result

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return None

# Test prompt
test_prompt = "Explain quantum computing in simple terms."

print("=== MODEL SELECTION COMPARISON TEST ===")
print(f"🧪 Test Prompt: '{test_prompt}'")

# Test Python AI Service directly
python_data = {
    "messages": [{"role": "user", "content": test_prompt}],
    "provider_constraint": ["openai", "deepseek"],
    "cost_bias": 0.8
}

#python_result = test_service(PYTHON_SERVICE_URL, "/predict", python_data, "Python AI Service")
python_result = None
# Test Go Backend (which should call Python service internally)
go_data = {
    "messages": [{"role": "user", "content": test_prompt}],
    "provider_constraint": ["openai", "deepseek"],
    "cost_bias": 0.8
}

go_result = test_service(BASE_URL, "/v1/chat/completions", go_data, "Go Backend")

# Compare results
print(f"\n{'='*50}")
print("🔍 COMPARISON ANALYSIS:")

if python_result or go_result:
    # Extract models for comparison
    python_model = None
    go_model = go_result.get("model")

    if python_result.get("protocol") == "minion":
        python_model = f"{python_result.get('minion', {}).get('provider')}/{python_result.get('minion', {}).get('model')}"
    elif python_result.get("protocol") == "standard":
        python_model = f"{python_result.get('standard', {}).get('provider')}/{python_result.get('standard', {}).get('model')}"

    go_model_full = f"{go_result.get('provider')}/{go_result.get('model')}"

    print(f"🐍 Python Service Suggests: {python_model}")
    print(f"🔧 Go Backend Actually Used: {go_model_full}")

    if python_model and python_model.lower() in go_model_full.lower():
        print("✅ MATCH: Go backend used Python service recommendation!")
    else:
        print("⚠️  DIFFERENT: Go backend used different model (fallback or override)")
else:
    print("❌ Cannot compare - one or both services failed")

print("\n✨ Comparison test complete!")


=== MODEL SELECTION COMPARISON TEST ===
🧪 Test Prompt: 'Explain quantum computing in simple terms.'

🔍 Testing Go Backend: /v1/chat/completions
✅ Status: 200 | Time: 5.40s
📄 Final Selection: openai/gpt-4o-mini-2024-07-18

🔍 COMPARISON ANALYSIS:


AttributeError: 'NoneType' object has no attribute 'get'