# Agent Evaluation and Testing
## Performance Assessment and Quality Assurance - Notebook 6

**Objective**: Systematically evaluate and test the performance of all agents in the economic forecasting multi-agent system.

### What You'll Learn:
- Agent performance metrics and evaluation
- Forecast accuracy testing and validation
- Tool usage efficiency analysis
- Session management evaluation
- Performance benchmarking and optimization

## 1. Setup and Evaluation Framework

In [None]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import asyncio
import warnings
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Import evaluation components
from agents.team_coordinator import EconomicTeamCoordinator
from tools.statistical_tools import StatisticalTools
from google.adk.models.google_llm import Gemini
from google.genai import types
from google.adk.sessions import InMemorySessionService

# Setup visualization: select the matplotlib style sheet and the seaborn colour
# palette once, up front, for the rest of the notebook session.
# NOTE(review): the 'seaborn-v0_8' style name presumably only exists in recent
# matplotlib releases — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Printed only after every statement above in this cell has run without raising.
print("‚úÖ Evaluation framework components imported successfully")

In [None]:
# Initialize evaluation system
# Builds the module-level objects the evaluation cells below rely on:
# `stat_tools`, `model` and `team_coordinator`.
print("üöÄ Initializing Agent Evaluation System...")

# Initialize components
# NOTE(review): `stat_tools` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
stat_tools = StatisticalTools()

# Initialize model
# HTTP retry settings handed to the Gemini client; the listed status codes are
# the responses that should trigger a retry.
# NOTE(review): presumably `exp_base` is the exponential back-off multiplier and
# `initial_delay` is in seconds — confirm against the google-genai documentation.
retry_config = types.HttpRetryOptions(
    attempts=5,
    exp_base=7,
    initial_delay=1,
    http_status_codes=[429, 500, 503, 504],
)

model = Gemini(
    model="gemini-2.0-flash-exp",
    retry_options=retry_config
)

# Initialize team coordinator
# os.getenv returns None when BEA_API_KEY is not set; the value is passed to the
# coordinator unchecked.
# NOTE(review): how EconomicTeamCoordinator treats a None key is not visible
# here — verify before running the data-collection tests.
bea_api_key = os.getenv('BEA_API_KEY')
team_coordinator = EconomicTeamCoordinator(bea_api_key, model)

print("ü§ñ Evaluation system initialized with all agents")
print("üìä Ready to perform comprehensive agent evaluation")

## 2. Test Data Generation

In [None]:
# Generate comprehensive test datasets
def generate_test_datasets():
    """Build four synthetic quarterly series used to exercise the agents.

    Every series is indexed by the quarter-end dates from 2010-03-31 through
    2024-12-31 (60 observations). Each scenario re-seeds NumPy before drawing
    its noise, so repeated calls return identical values.

    Returns:
        dict: Maps a scenario key (``stable_growth``, ``volatile``,
        ``recession``, ``structural_break``) to a ``pandas.DataFrame`` with
        the columns ``TimePeriod``, ``DataValue`` and ``Scenario``.

    Note:
        Seeding goes through ``np.random.seed``, so the global NumPy random
        state is reset as a side effect of calling this function.
    """
    # Use the offset object rather than the 'Q' string alias: the alias has
    # been deprecated by pandas (since 2.2) in favour of 'QE', while the
    # offset object is accepted by old and new releases alike and yields the
    # same quarter-end dates (Mar/Jun/Sep/Dec).
    dates = pd.date_range(
        start='2010-01-01',
        end='2024-12-31',
        freq=pd.offsets.QuarterEnd(startingMonth=12),
    )
    n_periods = len(dates)
    quarter_index = np.arange(n_periods)

    def _scenario_frame(values, label):
        # Assemble one scenario in the column layout the agents are fed with.
        return pd.DataFrame({
            'TimePeriod': dates,
            'DataValue': values,
            'Scenario': label
        })

    test_datasets = {}

    # Dataset 1: Stable growth scenario (linear trend + 4-quarter seasonality)
    np.random.seed(42)
    stable_trend = np.linspace(100, 200, n_periods)
    stable_seasonal = 5 * np.sin(2 * np.pi * quarter_index / 4)
    stable_noise = np.random.normal(0, 2, n_periods)
    test_datasets['stable_growth'] = _scenario_frame(
        stable_trend + stable_seasonal + stable_noise, 'Stable Growth'
    )

    # Dataset 2: Volatile scenario (8-quarter cycle with large noise)
    np.random.seed(43)
    volatile_trend = np.linspace(100, 180, n_periods)
    volatile_cycle = 20 * np.sin(2 * np.pi * quarter_index / 8)
    volatile_noise = np.random.normal(0, 8, n_periods)
    test_datasets['volatile'] = _scenario_frame(
        volatile_trend + volatile_cycle + volatile_noise, 'High Volatility'
    )

    # Dataset 3: Recession scenario
    np.random.seed(44)
    recession_trend = np.linspace(150, 250, n_periods)
    # The downturn starts one third of the way into the sample (not at the
    # midpoint), drops by the full depth immediately and recovers linearly
    # over the following 8 quarters.
    recession_depth = 30
    recession_start = n_periods // 3
    recession_end = recession_start + 8
    in_recession = (quarter_index >= recession_start) & (quarter_index < recession_end)
    recovery_progress = (quarter_index - recession_start) / (recession_end - recession_start)
    recession_effect = np.where(
        in_recession, -recession_depth * (1 - recovery_progress), 0.0
    )
    recession_noise = np.random.normal(0, 3, n_periods)
    test_datasets['recession'] = _scenario_frame(
        recession_trend + recession_effect + recession_noise, 'Recession Scenario'
    )

    # Dataset 4: Structural break scenario (level drops and slope changes at
    # the halfway point)
    np.random.seed(45)
    first_half = n_periods // 2
    break_trend1 = np.linspace(100, 150, first_half)
    break_trend2 = np.linspace(120, 220, n_periods - first_half)
    structural_trend = np.concatenate([break_trend1, break_trend2])
    structural_noise = np.random.normal(0, 4, n_periods)
    test_datasets['structural_break'] = _scenario_frame(
        structural_trend + structural_noise, 'Structural Break'
    )

    return test_datasets

# Build the scenario frames once; the evaluation cells below read `test_datasets`.
test_datasets = generate_test_datasets()

# One summary line per scenario: key, number of rows, human-readable label.
print("üìä Test Datasets Generated:")
for key, frame in test_datasets.items():
    quarter_count = len(frame)
    label = frame['Scenario'].iloc[0]
    print(f"   ‚Ä¢ {key}: {quarter_count} quarters, {label}")

# Preview the first three observations of every scenario.
print("\nüîç Sample Data from Each Scenario:")
for key, frame in test_datasets.items():
    preview = frame[['TimePeriod', 'DataValue']].head(3)
    print(f"\n{key}:")
    print(preview.to_string(index=False))

## 3. Data Collector Agent Evaluation

In [None]:
# Evaluate Data Collector Agent
print("üì• Evaluating Data Collector Agent...")
print("=" * 50)

async def evaluate_data_collector():
    """Comprehensive evaluation of Data Collector Agent"""

    evaluation_results = {}

    # Test 1: GDP Data Collection
    print("\nüß™ Test 1: GDP Data Collection")
    gdp_result = await team_coordinator.data_collector.get_gdp_data()

    evaluation_results['gdp_collection'] = {
        'status': gdp_result['status'],
        'data_points': len(gdp_result.get('data', [])),
        'success': gdp_result['status'] == 'success',
        'message': gdp_result.get('message', 'No message')
    }

    print(f"   Status: {gdp_result['status']}")
    print(f"   Data Points: {len(gdp_result.get('data', []))}")
    print(f"   Message: {gdp_result.get('message', 'No message')}")

    # Test 2: Unemployment Data Collection
    print("\nüß™ Test 2: Unemployment Data Collection")
    unemployment_result = await team_coordinator.data_collector.get_unemployment_data()

    evaluation_results['unemployment_collection'] = {
        'status': unemployment_result['status'],
        'data_points': len(unemployment_result.get('data', [])),
        'success': unemployment_result['status'] == 'success',
        'message': unemployment_result.get('message', 'No message')
    }

    print(f"   Status: {unemployment_result['status']}")
    print(f"   Data Points: {len(unemployment_result.get('data', []))}")
    print(f"   Message: {unemployment_result.get('message', 'No message')}")

    # Test 3: Inflation Data Collection
    print("\nüß™ Test 3: Inflation Data Collection")
    inflation_result = await team_coordinator.data_collector.get_inflation_data()

    evaluation_results['inflation_collection'] = {
        'status': inflation_result['status'],
        'data_points': len(inflation_result.get('data', [])),
        'success': inflation_result['status'] == 'success',
        'message': inflation_result.get('message', 'No message')
    }

    print(f"   Status: {inflation_result['status']}")
    print(f"   Data Points: {len(inflation_result.get('data', []))}")
    print(f"   Message: {inflation_result.get('message', 'No message')}")

    # Calculate overall performance
    successful_tests = sum(1 for test in evaluation_results.values() if test['success'])
    total_tests = len(evaluation_results)
    success_rate = (successful_tests / total_tests) * 100

    evaluation_results['summary'] = {
        'successful_tests': successful_tests,
        'total_tests': total_tests,
        'success_rate': success_rate,
        'agent': 'Data Collector'
    }

    print(f"\nüìä Data Collector Agent Summary:")
    print(f"   Successful Tests: {successful_tests}/{total_tests}")
    print(f"   Success Rate: {success_rate:.1f}%")

    return evaluation_results

# Run data collector evaluation
data_collector_results = await evaluate_data_collector()

## 4. Economic Analyst Agent Evaluation

In [None]:
# Evaluate Economic Analyst Agent
print("üìä Evaluating Economic Analyst Agent...")
print("=" * 50)

async def evaluate_economic_analyst():
    """Comprehensive evaluation of Economic Analyst Agent"""

    evaluation_results = {}

    # Test each scenario
    for scenario_name, test_data in test_datasets.items():
        print(f"\nüß™ Testing Scenario: {scenario_name}")

        data_dict = test_data[['TimePeriod', 'DataValue']].to_dict('records')

        # Test 1: Growth Trend Analysis
        growth_result = await team_coordinator.economic_analyst.analyze_growth_trends(data_dict)

        # Test 2: Economic Indicators
        indicator_result = await team_coordinator.economic_analyst.calculate_economic_indicators(data_dict)

        # Test 3: Business Cycle Analysis
        cycle_result = await team_coordinator.economic_analyst.identify_business_cycles(data_dict)

        # Test 4: Anomaly Detection
        anomaly_result = await team_coordinator.economic_analyst.detect_anomalies(data_dict)

        # Store results
        evaluation_results[scenario_name] = {
            'growth_analysis': growth_result['status'] == 'success',
            'indicators': indicator_result['status'] == 'success',
            'business_cycles': cycle_result['status'] == 'success',
            'anomaly_detection': anomaly_result['status'] == 'success'
        }

        # Print scenario results
        successful_tests = sum(evaluation_results[scenario_name].values())
        total_tests = len(evaluation_results[scenario_name])

        print(f"   Successful: {successful_tests}/{total_tests} tests")
        print(f"   Growth Analysis: {'‚úÖ' if growth_result['status'] == 'success' else '‚ùå'}")
        print(f"   Indicators: {'‚úÖ' if indicator_result['status'] == 'success' else '‚ùå'}")
        print(f"   Business Cycles: {'‚úÖ' if cycle_result['status'] == 'success' else '‚ùå'}")
        print(f"   Anomaly Detection: {'‚úÖ' if anomaly_result['status'] == 'success' else '‚ùå'}")

    # Calculate overall performance
    all_tests = []
    for scenario_results in evaluation_results.values():
        all_tests.extend(scenario_results.values())

    successful_total = sum(all_tests)
    total_tests = len(all_tests)
    success_rate = (successful_total / total_tests) * 100

    evaluation_results['summary'] = {
        'successful_tests': successful_total,
        'total_tests': total_tests,
        'success_rate': success_rate,
        'agent': 'Economic Analyst'
    }

    print(f"\nüìä Economic Analyst Agent Summary:")
    print(f"   Successful Tests: {successful_total}/{total_tests}")
    print(f"   Success Rate: {success_rate:.1f}%")
    print(f"   Scenarios Tested: {len(test_datasets)}")

    return evaluation_results

# Run economic analyst evaluation
economic_analyst_results = await evaluate_economic_analyst()

## 5. Forecasting Specialist Agent Evaluation

In [None]:
# Evaluate Forecasting Specialist Agent
print("üîÆ Evaluating Forecasting Specialist Agent...")
print("=" * 50)

async def evaluate_forecasting_specialist():
    """Comprehensive evaluation of Forecasting Specialist Agent"""

    evaluation_results = {}
    accuracy_metrics = {}

    for scenario_name, test_data in test_datasets.items():
        print(f"\nüß™ Testing Scenario: {scenario_name}")

        # Split data for train/test
        split_point = int(len(test_data) * 0.7)
        train_data = test_data.iloc[:split_point]
        test_data_actual = test_data.iloc[split_point:]

        train_dict = train_data[['TimePeriod', 'DataValue']].to_dict('records')

        # Test 1: GDP Forecasting
        forecast_result = await team_coordinator.forecasting_specialist.forecast_gdp(
            train_dict,
            horizon=len(test_data_actual)
        )

        # Test 2: ARIMA Model Building
        arima_result = await team_coordinator.forecasting_specialist.build_arima_model(train_dict)

        # Test 3: Ensemble Forecasting
        ensemble_result = await team_coordinator.forecasting_specialist.generate_ensemble_forecast(train_dict)

        # Calculate accuracy if forecasts were successful
        if forecast_result['status'] == 'success':
            actual_values = test_data_actual['DataValue'].values
            predicted_values = [f['point_forecast'] for f in forecast_result.get('forecasts', [])]

            if len(predicted_values) == len(actual_values):
                mae = mean_absolute_error(actual_values, predicted_values)
                rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))
                mape = np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100

                accuracy_metrics[scenario_name] = {
                    'mae': mae,
                    'rmse': rmse,
                    'mape': mape
                }

        # Store results
        evaluation_results[scenario_name] = {
            'gdp_forecasting': forecast_result['status'] == 'success',
            'arima_modeling': arima_result['status'] == 'success',
            'ensemble_forecasting': ensemble_result['status'] == 'success',
            'accuracy_available': forecast_result['status'] == 'success' and len(predicted_values) == len(actual_values)
        }

        # Print scenario results
        successful_tests = sum(evaluation_results[scenario_name].values())
        total_tests = len(evaluation_results[scenario_name])

        print(f"   Successful: {successful_tests}/{total_tests} tests")
        print(f"   GDP Forecasting: {'‚úÖ' if forecast_result['status'] == 'success' else '‚ùå'}")
        print(f"   ARIMA Modeling: {'‚úÖ' if arima_result['status'] == 'success' else '‚ùå'}")
        print(f"   Ensemble Forecasting: {'‚úÖ' if ensemble_result['status'] == 'success' else '‚ùå'}")

        if scenario_name in accuracy_metrics:
            print(f"   Forecast Accuracy: MAE={accuracy_metrics[scenario_name]['mae']:.2f}, "
                  f"RMSE={accuracy_metrics[scenario_name]['rmse']:.2f}")

    # Calculate overall performance
    all_tests = []
    for scenario_results in evaluation_results.values():
        all_tests.extend(scenario_results.values())

    successful_total = sum(all_tests)
    total_tests = len(all_tests)
    success_rate = (successful_total / total_tests) * 100

    evaluation_results['summary'] = {
        'successful_tests': successful_total,
        'total_tests': total_tests,
        'success_rate': success_rate,
        'agent': 'Forecasting Specialist',
        'accuracy_metrics': accuracy_metrics
    }

    print(f"\nüìä Forecasting Specialist Agent Summary:")
    print(f"   Successful Tests: {successful_total}/{total_tests}")
    print(f"   Success Rate: {success_rate:.1f}%")
    print(f"   Scenarios Tested: {len(test_datasets)}")

    return evaluation_results

# Run forecasting specialist evaluation
forecasting_results = await evaluate_forecasting_specialist()

## 6. Visualization Agent Evaluation

In [None]:
# Evaluate Visualization Agent
print("üìà Evaluating Visualization Agent...")
print("=" * 50)

async def evaluate_visualization_agent():
    """Comprehensive evaluation of Visualization Agent"""

    evaluation_results = {}

    # Use stable growth scenario for visualization tests
    test_data = test_datasets['stable_growth']
    data_dict = test_data[['TimePeriod', 'DataValue']].to_dict('records')

    print("üß™ Running Visualization Tests...")

    # Test 1: Growth Chart Creation
    print("\nüî∏ Test 1: Growth Chart Creation")
    growth_chart_result = await team_coordinator.visualization_agent.create_growth_chart(data_dict)
    evaluation_results['growth_chart'] = growth_chart_result['status'] == 'success'
    print(f"   Status: {'‚úÖ Success' if growth_chart_result['status'] == 'success' else '‚ùå Failed'}")

    # Test 2: Forecast Visualization
    print("\nüî∏ Test 2: Forecast Visualization")

    # Create sample forecast data
    sample_forecast = {
        'forecasts': [
            {'point_forecast': 210, 'confidence_lower': 205, 'confidence_upper': 215},
            {'point_forecast': 212, 'confidence_lower': 207, 'confidence_upper': 217},
            {'point_forecast': 215, 'confidence_lower': 210, 'confidence_upper': 220}
        ]
    }

    forecast_viz_result = await team_coordinator.visualization_agent.plot_forecasts(data_dict, sample_forecast)
    evaluation_results['forecast_visualization'] = forecast_viz_result['status'] == 'success'
    print(f"   Status: {'‚úÖ Success' if forecast_viz_result['status'] == 'success' else '‚ùå Failed'}")

    # Test 3: Comprehensive Dashboard
    print("\nüî∏ Test 3: Comprehensive Dashboard")

    sample_analysis = {
        'trend': 'upward',
        'confidence': 0.85,
        'current_growth_rate': 2.5
    }

    dashboard_result = await team_coordinator.visualization_agent.create_economic_dashboard(
        data_dict, sample_analysis, sample_forecast
    )
    evaluation_results['dashboard_creation'] = dashboard_result['status'] == 'success'
    print(f"   Status: {'‚úÖ Success' if dashboard_result['status'] == 'success' else '‚ùå Failed'}")

    # Test 4: Export Functionality
    print("\nüî∏ Test 4: Export Functionality")
    export_result = await team_coordinator.visualization_agent.export_visualization("test_dashboard", "html")
    evaluation_results['export_functionality'] = export_result['status'] == 'success'
    print(f"   Status: {'‚úÖ Success' if export_result['status'] == 'success' else '‚ùå Failed'}")

    # Calculate overall performance
    successful_tests = sum(evaluation_results.values())
    total_tests = len(evaluation_results)
    success_rate = (successful_tests / total_tests) * 100

    evaluation_results['summary'] = {
        'successful_tests': successful_tests,
        'total_tests': total_tests,
        'success_rate': success_rate,
        'agent': 'Visualization Agent'
    }

    print(f"\nüìä Visualization Agent Summary:")
    print(f"   Successful Tests: {successful_tests}/{total_tests}")
    print(f"   Success Rate: {success_rate:.1f}%")

    return evaluation_results

# Run visualization agent evaluation
visualization_results = await evaluate_visualization_agent()

## 7. Comprehensive Performance Analysis

In [None]:
# Create comprehensive performance dashboard
# Collects the per-agent summaries produced by the four evaluation cells into
# `performance_df` (also read by the final summary cell) and renders a 2x2
# Plotly figure: success rates, successful-test counts, forecast MAE per
# scenario, and the distribution of success rates.
print("üìä Creating Comprehensive Performance Dashboard...")

# Collect all evaluation results
all_results = {
    'Data Collector': data_collector_results['summary'],
    'Economic Analyst': economic_analyst_results['summary'],
    'Forecasting Specialist': forecasting_results['summary'],
    'Visualization Agent': visualization_results['summary']
}

# Create performance comparison (one row per agent)
performance_data = [
    {
        'Agent': agent_name,
        'Success Rate (%)': agent_summary['success_rate'],
        'Successful Tests': agent_summary['successful_tests'],
        'Total Tests': agent_summary['total_tests']
    }
    for agent_name, agent_summary in all_results.items()
]

performance_df = pd.DataFrame(performance_data)

print("\nüèÜ AGENT PERFORMANCE COMPARISON")
print("=" * 50)
print(performance_df.to_string(index=False))

# Create performance visualization
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Agent Success Rates',
        'Test Completion Overview',
        'Forecasting Accuracy by Scenario',
        'Performance Distribution'
    ),
    specs=[[{'type': 'bar'}, {'type': 'bar'}], [{'type': 'bar'}, {'type': 'box'}]]
)

# Plot 1: Success Rates
fig.add_trace(
    go.Bar(x=performance_df['Agent'], y=performance_df['Success Rate (%)'],
           name='Success Rate', marker_color='#1f77b4'),
    row=1, col=1
)

# Plot 2: Test Completion
fig.add_trace(
    go.Bar(x=performance_df['Agent'], y=performance_df['Successful Tests'],
           name='Successful Tests', marker_color='#2ca02c'),
    row=1, col=2
)

# Plot 3: Forecasting Accuracy (if available)
# The forecasting evaluator always writes the 'accuracy_metrics' key, so a
# key-presence test is always true; check for actual content instead so that
# no empty trace is added when no scenario could be scored.
accuracy_data = forecasting_results['summary'].get('accuracy_metrics')
if accuracy_data:
    scenarios = list(accuracy_data.keys())
    mae_values = [accuracy_data[scenario]['mae'] for scenario in scenarios]

    fig.add_trace(
        go.Bar(x=scenarios, y=mae_values, name='MAE by Scenario', marker_color='#ff7f0e'),
        row=2, col=1
    )

# Plot 4: Performance Distribution
success_rates = performance_df['Success Rate (%)'].tolist()
fig.add_trace(
    go.Box(y=success_rates, name='Success Rate Distribution', marker_color='#9467bd'),
    row=2, col=2
)

fig.update_layout(
    title_text="Multi-Agent System Performance Dashboard",
    height=600,
    showlegend=False
)

fig.show()

## 8. Summary and Recommendations

In [None]:
print("üéØ EVALUATION SUMMARY AND RECOMMENDATIONS")
print("=" * 60)

# Calculate overall system performance
total_successful = sum(result['summary']['successful_tests'] for result in [
    data_collector_results, economic_analyst_results, forecasting_results, visualization_results
])
total_tests = sum(result['summary']['total_tests'] for result in [
    data_collector_results, economic_analyst_results, forecasting_results, visualization_results
])
overall_success_rate = (total_successful / total_tests) * 100

print(f"üìä OVERALL SYSTEM PERFORMANCE:")
print(f"   Total Tests: {total_tests}")
print(f"   Successful Tests: {total_successful}")
print(f"   Overall Success Rate: {overall_success_rate:.1f}%")

print(f"\nüèÜ AGENT PERFORMANCE RANKING:")
ranked_agents = sorted(performance_df.to_dict('records'), key=lambda x: x['Success Rate (%)'], reverse=True)
for i, agent in enumerate(ranked_agents, 1):
    print(f"   {i}. {agent['Agent']}: {agent['Success Rate (%)']:.1f}%")

print(f"\nüîç KEY FINDINGS:")

# Data Collector Findings
dc_success = data_collector_results['summary']['success_rate']
if dc_success < 80:
    print(f"   ‚Ä¢ Data Collector: API integration needs improvement ({dc_success:.1f}%)")
else:
    print(f"   ‚Ä¢ Data Collector: Strong performance ({dc_success:.1f}%)")

# Economic Analyst Findings
ea_success = economic_analyst_results['summary']['success_rate']
if ea_success < 85:
    print(f"   ‚Ä¢ Economic Analyst: Analysis consistency needs work ({ea_success:.1f}%)")
else:
    print(f"   ‚Ä¢ Economic Analyst: Reliable analysis ({ea_success:.1f}%)")

# Forecasting Specialist Findings
fs_success = forecasting_results['summary']['success_rate']
if 'accuracy_metrics' in forecasting_results['summary']:
    avg_mae = np.mean([metrics['mae'] for metrics in forecasting_results['summary']['accuracy_metrics'].values()])
    print(f"   ‚Ä¢ Forecasting Specialist: Good success ({fs_success:.1f}%), Average MAE: {avg_mae:.2f}")
else:
    print(f"   ‚Ä¢ Forecasting Specialist: Moderate success ({fs_success:.1f}%)")

# Visualization Agent Findings
va_success = visualization_results['summary']['success_rate']
if va_success < 90:
    print(f"   ‚Ä¢ Visualization Agent: Dashboard creation needs optimization ({va_success:.1f}%)")
else:
    print(f"   ‚Ä¢ Visualization Agent: Excellent performance ({va_success:.1f}%)")

print(f"\nüí° RECOMMENDATIONS FOR IMPROVEMENT:")

if overall_success_rate < 80:
    print("   1. Focus on agent reliability and error handling")
    print("   2. Implement better retry mechanisms for API calls")
    print("   3. Add comprehensive logging and monitoring")
else:
    print("   1. Maintain current reliability standards")
    print("   2. Focus on performance optimization")
    print("   3. Expand test coverage for edge cases")

print(f"\nüîß TECHNICAL RECOMMENDATIONS:")
print("   1. Implement automated testing pipeline")
print("   2. Add performance benchmarking")
print("   3. Create alerting for performance degradation")
print("   4. Establish regular evaluation schedule")

print(f"\nüöÄ PRODUCTION READINESS ASSESSMENT:")
if overall_success_rate >= 90:
    print("   ‚úÖ EXCELLENT - Ready for production deployment")
elif overall_success_rate >= 80:
    print("   ‚úÖ GOOD - Suitable for production with monitoring")
elif overall_success_rate >= 70:
    print("   ‚ö†Ô∏è  FAIR - Needs improvement before production")
else:
    print("   ‚ùå POOR - Significant work needed before production")

print(f"\nüìà NEXT STEPS FOR EVALUATION:")
print("   1. Implement continuous integration testing")
print("   2. Set up performance monitoring dashboard")
print("   3. Establish quality gates for deployment")
print("   4. Create user acceptance testing framework")

print(f"\nüéØ EVALUATION COMPLETE!")
print("   The multi-agent system has been thoroughly evaluated and is ready for the next phase.")