# Azure Backend API Testing

This notebook allows direct testing of the Azure backend API endpoints.

## MMLU Testing Cell

In [29]:
# MMLU Single Subject Benchmarking with Adaptive Backend
import random
import time
from datetime import datetime

import pandas as pd
import requests
from datasets import load_dataset

# Configuration
# Root URL of the backend under test; requests in this cell go to its
# /v1/chat/completions route.
BASE_URL = "https://backend-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"

# Focus on the MOST IMPORTANT subject for AI benchmarking
# This value is matched against each MMLU record's "subject" field to pick the
# questions that are sent to the backend.
TARGET_SUBJECT = "high_school_mathematics"  # Math is the gold standard for AI reasoning


# Answer letters in the order of the integer "answer" index used by MMLU records.
_ANSWER_LETTERS = ("A", "B", "C", "D")


def _format_mmlu_prompt(question):
    """Render one MMLU record as the multiple-choice prompt sent to the API."""
    option_lines = [
        f"{letter}) {choice}"
        for letter, choice in zip(_ANSWER_LETTERS, question["choices"])
    ]
    return "\n".join(
        [
            f"Question: {question['question']}",
            # Whitespace-only line kept on purpose so the prompt stays
            # byte-identical to the one used in earlier benchmark runs.
            "        ",
            *option_lines,
            "",
            "Please answer with only the letter (A, B, C, or D).",
        ]
    )


def _extract_answer_letter(response_text):
    """Return the answer letter (A-D) found in a model reply, or None.

    Only a letter that stands on its own counts, so "Answer: B" yields "B"
    rather than the "A" inside "Answer". Upper-case letters are preferred; a
    case-insensitive pass is the fallback for replies such as "b".
    """
    import re

    for candidate in (response_text, response_text.upper()):
        match = re.search(r"\b[ABCD]\b", candidate)
        if match:
            return match.group(0)
    return None


def _post_with_retries(session, payload, max_retries=3, retry_delay=1, timeout=90):
    """POST a chat completion request, retrying transient failures.

    A request is retried when it raises a ``requests`` exception or comes back
    with HTTP 429 / 5xx. The wait doubles after every failed attempt.

    Returns:
        ``(response, elapsed_seconds, error)``. ``response`` is None when the
        final attempt raised, in which case ``error`` holds that exception.
    """
    response = None
    elapsed = 0.0
    last_error = None

    for attempt in range(max_retries):
        started = time.time()
        try:
            response = session.post(
                f"{BASE_URL}/v1/chat/completions",
                json=payload,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            response = None
            last_error = exc
        else:
            elapsed = time.time() - started
            last_error = None
            transient = response.status_code == 429 or response.status_code >= 500
            if not transient:
                break

        if attempt < max_retries - 1:
            time.sleep(retry_delay * (2 ** attempt))

    return response, elapsed, last_error


def _build_result_row(index, question, cost_bias, ai_response_full, model, provider,
                      ai_answer=None, is_correct=False, response_time=0, usage=None):
    """Build one result record; the defaults describe an unanswered question."""
    usage = usage or {}
    return {
        "subject": TARGET_SUBJECT,
        "question_id": f"{TARGET_SUBJECT}_{index}",
        "question": question["question"][:200] + "...",
        "correct_answer": _ANSWER_LETTERS[question["answer"]],
        "ai_answer": ai_answer,
        "ai_response_full": ai_response_full,
        "is_correct": is_correct,
        "response_time": response_time,
        "selected_model": model,
        "selected_provider": provider,
        "model_provider_combo": f"{provider}/{model}",
        "completion_tokens": usage.get("completion_tokens", 0),
        "prompt_tokens": usage.get("prompt_tokens", 0),
        "total_tokens": usage.get("total_tokens", 0),
        "cost_bias_used": cost_bias,
        "timestamp": datetime.now().isoformat(),
    }


def _report_backend_selection(successful_df):
    """Print model, token and cost-bias breakdowns for the successful rows.

    ``successful_df`` must be an independent copy: a ``cost_bias_bin`` column
    is added to it. Returns the per-combination usage counts.
    """
    print("\n🤖 ADAPTIVE BACKEND MODEL SELECTION:")
    model_provider_usage = successful_df['model_provider_combo'].value_counts()
    for combo, count in model_provider_usage.items():
        combo_rows = successful_df[successful_df['model_provider_combo'] == combo]
        accuracy = combo_rows['is_correct'].mean() * 100
        avg_time = combo_rows['response_time'].mean()
        print(f"  {combo}: {count:,} questions ({count/len(successful_df)*100:.1f}%) - Accuracy: {accuracy:.1f}% - Avg Time: {avg_time:.2f}s")

    print("\n💰 TOKEN USAGE ANALYSIS:")
    total_tokens_used = successful_df['total_tokens'].sum()
    avg_tokens_per_question = successful_df['total_tokens'].mean()
    print(f"  Total tokens consumed: {total_tokens_used:,}")
    print(f"  Average tokens per question: {avg_tokens_per_question:.1f}")
    print(f"  Estimated API cost (rough): ${total_tokens_used * 0.000002:.4f}")

    if len(successful_df) > 1:  # Need multiple rows for groupby
        provider_token_usage = successful_df.groupby('selected_provider')['total_tokens'].agg(['sum', 'mean', 'count']).round(1)
        print("\n📊 TOKEN USAGE BY PROVIDER:")
        for provider in provider_token_usage.index:
            total = provider_token_usage.loc[provider, 'sum']
            avg = provider_token_usage.loc[provider, 'mean']
            count = provider_token_usage.loc[provider, 'count']
            print(f"  {provider}: {total:,} total tokens ({avg:.1f} avg) across {count:,} questions")

    # Cost bias analysis
    print("\n💸 COST BIAS EFFECTIVENESS:")
    successful_df['cost_bias_bin'] = pd.cut(
        successful_df['cost_bias_used'],
        bins=[0, 0.3, 0.7, 1.0],
        labels=['Low (0-0.3)', 'Med (0.3-0.7)', 'High (0.7-1.0)'],
    )
    if len(successful_df) > 1:
        # observed=False keeps empty bins in the table (the historical
        # behaviour) and makes that choice explicit for newer pandas.
        cost_bias_analysis = successful_df.groupby('cost_bias_bin', observed=False).agg({
            'selected_model': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'selected_provider': lambda x: x.mode().iloc[0] if not x.empty else 'unknown',
            'is_correct': 'mean',
            'total_tokens': 'mean'
        }).round(3)
        print(cost_bias_analysis)

    return model_provider_usage


def run_single_subject_mmlu_benchmark():
    """
    Run the MMLU benchmark for TARGET_SUBJECT against the adaptive backend.

    Loads the "cais/mmlu" test split, keeps the questions whose subject equals
    TARGET_SUBJECT, and sends each one to ``{BASE_URL}/v1/chat/completions``
    with a randomly drawn ``cost_bias``. Every question yields exactly one
    result row (answered, API error, or failed request). A summary is printed
    and all rows are written to a timestamped CSV in the working directory.

    Returns:
        A pandas DataFrame with one row per question, or None when the dataset
        cannot be loaded or holds no question for TARGET_SUBJECT.
    """
    print("=== SINGLE SUBJECT MMLU BENCHMARK ===")
    print(f"🎯 Testing ONLY: {TARGET_SUBJECT}")
    print("📊 Math is the gold standard for AI reasoning capabilities")
    print("⏰ Estimated time: 5-15 minutes")
    print("💰 Cost: Very low API usage\n")

    # Quick start
    print("▶️  Starting in 2 seconds...")
    time.sleep(2)

    # Load MMLU dataset
    print("📚 Loading MMLU dataset...")
    try:
        dataset = load_dataset("cais/mmlu", "all")
        test_data = dataset["test"]
        print(f"✅ Loaded {len(test_data)} total test questions")
    except Exception as e:  # notebook boundary: report any loading problem and stop
        print(f"❌ Failed to load dataset: {e}")
        return None

    # Filter for target subject only
    subject_questions = [q for q in test_data if q["subject"] == TARGET_SUBJECT]

    if not subject_questions:
        print(f"❌ No questions found for {TARGET_SUBJECT}")
        return None

    total_questions = len(subject_questions)
    print(f"🎯 Found {total_questions} questions for {TARGET_SUBJECT}")

    results = []
    correct_answers = 0
    start_time = datetime.now()

    print(f"\n🔬 Testing subject: {TARGET_SUBJECT}")
    print(f"   📊 Processing {total_questions} questions...")

    # One session for the whole run gives connection reuse; the context
    # manager closes it even if the cell is interrupted.
    with requests.Session() as session:
        session.headers.update({"Content-Type": "application/json"})

        for i, question in enumerate(subject_questions):
            show_progress = (i + 1) % 10 == 0 or i == 0  # Progress every 10 questions
            if show_progress:
                print(f"   Question {i+1}/{total_questions}...", end=" ")

            # Prepare API request with varied parameters for testing
            chat_data = {
                "messages": [{"role": "user", "content": _format_mmlu_prompt(question)}],
                "max_tokens": 10,
                "temperature": 0.1,  # Low temperature for consistent answers
                "provider_constraint": ["openai", "deepseek", "anthropic"],  # Multiple providers
                "cost_bias": random.uniform(0.2, 0.8)  # Vary cost bias to test routing
            }
            cost_bias = chat_data["cost_bias"]

            response, response_time, error = _post_with_retries(session, chat_data)

            if response is None:
                # Every attempt raised before a response arrived.
                print(f"❌ Request failed: {error}")
                row = _build_result_row(
                    i, question, cost_bias, f"Exception: {error}", "failed", "failed"
                )
            elif response.status_code != 200:
                print(f"❌ API Error: {response.status_code}")
                row = _build_result_row(
                    i, question, cost_bias,
                    f"API Error: {response.status_code}", "error", "error",
                )
            else:
                try:
                    result = response.json()
                    # "content" may be JSON null; treat that as an empty reply.
                    ai_answer = (result["choices"][0]["message"]["content"] or "").strip()
                except (ValueError, KeyError, IndexError, TypeError) as exc:
                    # A 200 with an unusable body counts as a failed request.
                    print(f"❌ Request failed: {exc}")
                    row = _build_result_row(
                        i, question, cost_bias, f"Exception: {exc}", "failed", "failed"
                    )
                else:
                    ai_letter = _extract_answer_letter(ai_answer)
                    is_correct = ai_letter == _ANSWER_LETTERS[question["answer"]]
                    if is_correct:
                        correct_answers += 1
                    row = _build_result_row(
                        i, question, cost_bias, ai_answer,
                        result.get("model", "unknown"),
                        result.get("provider", "unknown"),
                        ai_answer=ai_letter,
                        is_correct=is_correct,
                        response_time=response_time,
                        usage=result.get("usage"),
                    )

            results.append(row)

            if show_progress:
                # Always finish the progress line that was opened with end=" ".
                accuracy_so_far = (correct_answers / (i + 1)) * 100
                print(f"✅ Accuracy: {accuracy_so_far:.1f}%")

            # Small delay to avoid overwhelming the API
            time.sleep(0.03)

    # Calculate final results
    overall_accuracy = (correct_answers / len(results)) * 100 if results else 0
    total_time = datetime.now() - start_time
    successful_rows = [r for r in results if r["selected_model"] not in ("error", "failed")]
    successful_requests = len(successful_rows)

    print(f"\n   📊 {TARGET_SUBJECT}: {correct_answers}/{len(results)} ({overall_accuracy:.1f}%)")

    print("\n🎯 SINGLE SUBJECT BENCHMARK RESULTS:")
    print(f"📊 Overall Accuracy: {correct_answers}/{len(results)} ({overall_accuracy:.1f}%)")
    print(f"⏱️  Total Time: {total_time}")
    print(f"✅ Successful Requests: {successful_requests}/{len(results)} ({successful_requests/len(results)*100:.1f}%)")
    if successful_requests > 0:
        avg_response_time = sum(r["response_time"] for r in successful_rows) / successful_requests
        print(f"⚡ Average Response Time: {avg_response_time:.2f}s")

    # Create comprehensive results DataFrame
    df = pd.DataFrame(results)

    if len(df) > 0:
        print("\n📈 MATHEMATICS PERFORMANCE ANALYSIS:")

        # Independent copy: the report adds a helper column that must not
        # leak into the DataFrame that is saved and returned.
        successful_df = df[~df['selected_model'].isin(['error', 'failed'])].copy()

        model_provider_usage = None
        if len(successful_df) > 0:
            model_provider_usage = _report_backend_selection(successful_df)

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"mmlu_math_benchmark_{timestamp}.csv"
        df.to_csv(filename, index=False)

        print(f"\n💾 Results saved: {filename}")

        print("\n🔄 ADAPTIVE ROUTING EFFECTIVENESS:")
        print(f"  Total questions tested: {len(df):,}")
        print("  Subject: Mathematics (High School Level)")
        if model_provider_usage is not None:
            print(f"  Unique model/provider combinations: {len(model_provider_usage)}")
            print(f"  Most used combination: {model_provider_usage.index[0] if len(model_provider_usage) > 0 else 'N/A'}")

        elapsed_seconds = total_time.total_seconds()
        questions_per_minute = len(df) / elapsed_seconds * 60 if elapsed_seconds > 0 else 0.0

        print("\n🎯 MATHEMATICS BENCHMARK INSIGHTS:")
        print(f"  🧮 Math reasoning accuracy: {overall_accuracy:.1f}%")
        print(f"  🚀 Questions per minute: {questions_per_minute:.1f}")
        print(f"  📊 Success rate: {successful_requests/len(df)*100:.1f}%")

        # Performance categorization
        if overall_accuracy >= 70:
            print("  🏆 Performance: Excellent mathematical reasoning!")
        elif overall_accuracy >= 50:
            print("  👍 Performance: Good mathematical capabilities")
        else:
            print("  📈 Performance: Needs improvement on math problems")

    print("\n✨ MATHEMATICS BENCHMARK FINISHED!")
    print(f"🏁 Total time: {total_time}")
    print(f"📊 Math accuracy: {overall_accuracy:.1f}%")
    print("🚀 Perfect for quick validation of reasoning capabilities!")
    return df

# Banner for the single-subject run (one print call, one line per argument)
print(
    "🧮 MMLU MATHEMATICS BENCHMARK",
    "🎯 Focus on the most important subject for AI reasoning",
    "⚡ Fast, focused, and comprehensive analysis\n",
    sep="\n",
)

# Seed the RNG first so the randomly drawn cost_bias values repeat between runs
random.seed(42)

# Execute the benchmark and keep its return value for later cells
math_results = run_single_subject_mmlu_benchmark()

print("\n🎉 MATHEMATICS BENCHMARK COMPLETE!")

🧮 MMLU MATHEMATICS BENCHMARK
🎯 Focus on the most important subject for AI reasoning
⚡ Fast, focused, and comprehensive analysis

=== SINGLE SUBJECT MMLU BENCHMARK ===
🎯 Testing ONLY: high_school_mathematics
📊 Math is the gold standard for AI reasoning capabilities
⏰ Estimated time: 5-15 minutes
💰 Cost: Very low API usage

▶️  Starting in 2 seconds...
📚 Loading MMLU dataset...
✅ Loaded 14042 total test questions
🎯 Found 270 questions for high_school_mathematics

🔬 Testing subject: high_school_mathematics
   📊 Processing 270 questions...
   Question 1/270...    Question 10/270... ✅ Accuracy: 30.0%
   Question 20/270... ✅ Accuracy: 40.0%
   Question 30/270... ✅ Accuracy: 43.3%
   Question 40/270... ✅ Accuracy: 47.5%
   Question 50/270... ✅ Accuracy: 42.0%
   Question 60/270... ✅ Accuracy: 45.0%
   Question 70/270... ✅ Accuracy: 42.9%
   Question 80/270... ✅ Accuracy: 41.2%
   Question 90/270... ✅ Accuracy: 38.9%
   Question 100/270... ✅ Accuracy: 39.0%
   Question 110/270... ✅ Accuracy: 3

KeyboardInterrupt: 

In [28]:
import requests
import time
# Root URL of the service this cell labels "Go Backend"; it is queried at
# /v1/chat/completions.
BASE_URL = "https://backend-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"
# Root URL of the service this cell labels "Python AI Service"; it is queried
# at /predict.
PYTHON_SERVICE_URL = "https://prompt-classifer-dev.mangoplant-a7a21605.swedencentral.azurecontainerapps.io"

def test_service(url, endpoint, data, service_name):
    """POST ``data`` to a service endpoint, print a summary, return the JSON.

    Args:
        url: Root URL of the service; a trailing slash is tolerated.
        endpoint: Path to call; a leading slash is tolerated.
        data: JSON-serialisable request body.
        service_name: Label used in the output. The exact value
            "Python AI Service" selects the protocol/minion/standard summary;
            any other value selects the model/provider summary.

    Returns:
        The decoded JSON object, or None when the request fails, the body is
        not valid JSON, or the JSON is not an object. A non-2xx response with
        a JSON object body is still returned; only the status marker differs.
    """
    full_url = f"{url.rstrip('/')}/{endpoint.lstrip('/')}"
    headers = {"Content-Type": "application/json"}

    print(f"\n🔍 Testing {service_name}: {endpoint}")
    start_time = time.time()

    try:
        response = requests.post(full_url, headers=headers, json=data, timeout=15)
        response_time = time.time() - start_time

        # Do not show a success marker for error statuses.
        status_icon = "✅" if response.ok else "❌"
        print(f"{status_icon} Status: {response.status_code} | Time: {response_time:.2f}s")
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Error: {str(e)}")
        return None

    if not isinstance(result, dict):
        print(f"❌ Error: expected a JSON object, got {type(result).__name__}")
        return None

    # Extract key info
    if service_name == "Python AI Service":
        protocol = result.get("protocol", "unknown")
        # "minion"/"standard" may be present but null; fall back to an empty
        # dict so a valid response is not discarded over the summary line.
        minion = result.get("minion") or {}
        standard = result.get("standard") or {}
        model_info = minion.get("model") or standard.get("model")
        provider_info = minion.get("provider") or standard.get("provider")
        print(f"📄 Protocol: {protocol} | Model: {provider_info}/{model_info}")
    else:  # Go Backend
        model = result.get("model", "unknown")
        provider = result.get("provider", "unknown")
        print(f"📄 Final Selection: {provider}/{model}")

    return result

# Test prompt
test_prompt = "Explain quantum computing in simple terms."

print("=== MODEL SELECTION COMPARISON TEST ===")
print(f"🧪 Test Prompt: '{test_prompt}'")

# Test Python AI Service directly
python_data = {
    "messages": [{"role": "user", "content": test_prompt}],
    "provider_constraint": ["openai", "deepseek"],
    "cost_bias": 0.8
}

python_result = test_service(PYTHON_SERVICE_URL, "/predict", python_data, "Python AI Service")

# Test Go Backend (which should call Python service internally)
go_data = {
    "messages": [{"role": "user", "content": test_prompt}],
    "provider_constraint": ["openai", "deepseek"],
    "cost_bias": 0.8
}

go_result = test_service(BASE_URL, "/v1/chat/completions", go_data, "Go Backend")

# Compare results
print(f"\n{'='*50}")
print("🔍 COMPARISON ANALYSIS:")

# Both responses are needed: test_service returns None on failure, and
# comparing against a missing result would raise on None.
if python_result and go_result:
    # Extract models for comparison
    python_model = None
    protocol = python_result.get("protocol")
    if protocol in ("minion", "standard"):
        # The sub-object is stored under the protocol's own name; tolerate a
        # JSON null there.
        selection = python_result.get(protocol) or {}
        python_model = f"{selection.get('provider')}/{selection.get('model')}"

    go_model_full = f"{go_result.get('provider')}/{go_result.get('model')}"

    print(f"🐍 Python Service Suggests: {python_model}")
    print(f"🔧 Go Backend Actually Used: {go_model_full}")

    if python_model and python_model.lower() in go_model_full.lower():
        print("✅ MATCH: Go backend used Python service recommendation!")
    else:
        print("⚠️  DIFFERENT: Go backend used different model (fallback or override)")
else:
    print("❌ Cannot compare - one or both services failed")

print("\n✨ Comparison test complete!")


=== MODEL SELECTION COMPARISON TEST ===
🧪 Test Prompt: 'Explain quantum computing in simple terms.'

🔍 Testing Python AI Service: /predict
✅ Status: 200 | Time: 0.41s
📄 Protocol: minion | Model: groq/llama-3.1-8b-instant

🔍 Testing Go Backend: /v1/chat/completions
✅ Status: 200 | Time: 1.68s
📄 Final Selection: groq/llama-3.1-8b-instant

🔍 COMPARISON ANALYSIS:
🐍 Python Service Suggests: groq/llama-3.1-8b-instant
🔧 Go Backend Actually Used: groq/llama-3.1-8b-instant
✅ MATCH: Go backend used Python service recommendation!

✨ Comparison test complete!
