# Model Evaluation and Performance Analysis

This notebook provides a comprehensive evaluation of the trained heart disease classification models, including:
- Performance metrics calculation
- ROC curves and AUC analysis
- Confusion matrices
- Cross-validation assessment
- Model comparison and recommendations

In [None]:
# Import required libraries
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image, display

# Make the project's src/ directory importable before loading local modules
sys.path.append('../src')

from model_evaluator import ModelEvaluator

# Set up plotting: select matplotlib's 'default' style sheet, then set
# seaborn's colour palette to "husl" for figures drawn in this kernel.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Simple confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Initialize Model Evaluator and Load Data

In [None]:
# Initialize the model evaluator (random_state=42 is handed to ModelEvaluator).
evaluator = ModelEvaluator(random_state=42)

# Load the trained models and the processed dataset. After this call the
# notebook reads `evaluator.models` (a mapping keyed by model name) and
# `evaluator.y_test` (reported below as the test set) from the evaluator.
evaluator.load_models_and_data(
    models_dir='../models/supervised',
    data_path='../data/processed/heart_disease_selected.csv'
)

print(f"\nLoaded {len(evaluator.models)} models:")
for model_name in evaluator.models.keys():
    print(f"  - {model_name.replace('_', ' ').title()}")

print(f"\nTest set size: {len(evaluator.y_test)} samples")

# Series.to_dict() boxes the counts as plain Python ints. dict(series) keeps
# NumPy scalars, which NumPy >= 2 prints as `np.int64(...)` and clutters the
# class-distribution line.
class_distribution = pd.Series(evaluator.y_test).value_counts().sort_index().to_dict()
print(f"Class distribution: {class_distribution}")

## 2. Calculate Classification Metrics

In [None]:
# Evaluate every loaded model and echo its headline metrics.
print("Classification Metrics for All Models")
print("=" * 50)

# (printed label including its padding, key in the metrics dict) in display order.
metric_rows = (
    ("  Accuracy:  ", 'accuracy'),
    ("  Precision: ", 'precision'),
    ("  Recall:    ", 'recall'),
    ("  F1-Score:  ", 'f1_score'),
    ("  ROC-AUC:   ", 'roc_auc'),
    ("  Specificity: ", 'specificity'),
)

all_metrics = {}
for model_name, model in evaluator.models.items():
    # Keep the full metrics dict per model; the final recommendations cell reads all_metrics.
    all_metrics[model_name] = evaluator.calculate_classification_metrics(model_name, model)
    metrics = all_metrics[model_name]

    print(f"\n{model_name.replace('_', ' ').title()}:")
    for label, key in metric_rows:
        print(f"{label}{metrics[key]:.4f}")

## 3. Model Performance Comparison

In [None]:
# Build the side-by-side performance table and print it without the index column.
comparison_df = evaluator.compare_model_performance()

heading = "Model Performance Comparison"
print(heading)
print("=" * 30)
table_text = comparison_df.to_string(index=False)
print(table_text)

## 4. ROC Curves Analysis

In [None]:
# Directory shared by the plot writer and the image display below, so the
# two cannot drift apart.
roc_plots_dir = '../results/model_evaluation/plots'

# Generate ROC curves; the evaluator is given roc_plots_dir as its save_dir.
roc_data = evaluator.generate_roc_curves(save_dir=roc_plots_dir)

# Display ROC curve data: one line per model with its AUC.
print("ROC-AUC Scores:")
print("-" * 20)
for model_name, data in roc_data.items():
    print(f"{model_name.replace('_', ' ').title()}: {data['auc']:.4f}")

# Load and display the ROC curves plot. `Image` and `display` come from the
# notebook's top import cell rather than a mid-notebook import, so later
# cells that also call display(Image(...)) do not depend on this cell.
display(Image(f'{roc_plots_dir}/roc_curves_comparison.png'))

## 5. Confusion Matrices

In [None]:
# Plot the confusion matrices; the call returns one matrix per model name.
confusion_matrices = evaluator.plot_confusion_matrices(save_dir='../results/model_evaluation/plots')

# (printed label, row index, column index) for each cell that is reported.
cell_layout = [
    ("  True Negatives:  ", 0, 0),
    ("  False Positives: ", 0, 1),
    ("  False Negatives: ", 1, 0),
    ("  True Positives:  ", 1, 1),
]

print("Confusion Matrices:")
print("-" * 20)
for model_name, cm in confusion_matrices.items():
    print(f"\n{model_name.replace('_', ' ').title()}:")
    for label, row, col in cell_layout:
        print(f"{label}{cm[row, col]}")

# Show the confusion-matrix figure expected at this path after the call above.
display(Image('../results/model_evaluation/plots/confusion_matrices.png'))

## 6. Cross-Validation Assessment

In [None]:
# Number of folds, defined once so the evaluator call and the printed
# heading always agree.
CV_FOLDS = 5

# Perform cross-validation
cv_results = evaluator.cross_validation_scores(cv_folds=CV_FOLDS)

# Display cross-validation results
print(f"{CV_FOLDS}-Fold Cross-Validation Results")
print("=" * 35)

# (column title in the summary table, key in each model's results dict)
cv_columns = [
    ('CV Accuracy', 'accuracy'),
    ('CV Precision', 'precision'),
    ('CV Recall', 'recall'),
    ('CV F1-Score', 'f1'),
]

cv_summary = []
for model_name, results in cv_results.items():
    row = {'Model': model_name.replace('_', ' ').title()}
    for column, key in cv_columns:
        # Each entry is rendered as "mean ± std" with four decimals.
        row[column] = f"{results[key]['mean']:.4f} ± {results[key]['std']:.4f}"
    cv_summary.append(row)

# Build the frame once from the list of row dicts, then print without the index.
cv_df = pd.DataFrame(cv_summary)
print(cv_df.to_string(index=False))

## 7. Precision-Recall Curves

In [None]:
# Draw the precision-recall curves; the returned mapping is keyed by model name.
pr_data = evaluator.plot_precision_recall_curves(save_dir='../results/model_evaluation/plots')

# One line per model: display name followed by its average precision.
print("Average Precision Scores:")
print("-" * 25)
for model_name, curve_info in pr_data.items():
    display_name = model_name.replace('_', ' ').title()
    average_precision = curve_info['average_precision']
    print(f"{display_name}: {average_precision:.4f}")

# Display the precision-recall curves plot
display(Image('../results/model_evaluation/plots/precision_recall_curves.png'))

## 8. Model Comparison Visualizations

In [None]:
# Create the model comparison plots (the evaluator is given the plots directory as save_dir).
evaluator.plot_model_comparison_metrics(save_dir='../results/model_evaluation/plots')

# (caption, image path) pairs, shown in this order.
comparison_figures = [
    ("Model Performance Metrics Comparison:", '../results/model_evaluation/plots/metrics_comparison.png'),
    ("\nROC-AUC Score Comparison:", '../results/model_evaluation/plots/roc_auc_comparison.png'),
]
for caption, image_path in comparison_figures:
    print(caption)
    display(Image(image_path))

## 9. Classification Reports

In [None]:
# Build the per-model classification reports (the evaluator is also given a save directory).
classification_reports = evaluator.generate_classification_report(save_dir='../results/model_evaluation')

# Rows shown from each report: the two class rows plus the averaged rows.
report_rows = ['No Disease', 'Disease', 'macro avg', 'weighted avg']

for model_name, report in classification_reports.items():
    title = model_name.replace('_', ' ').title()
    print(f"\nClassification Report - {title}")
    print("=" * 50)

    # Transpose so each report entry becomes a row, then keep the rows of interest.
    report_table = pd.DataFrame(report).transpose()
    print(report_table.loc[report_rows].round(4))

## 10. Comprehensive Performance Summary

In [None]:
# Build the overall summary structure for the evaluated models.
summary = evaluator.model_performance_summary(save_dir='../results/model_evaluation')

print("Comprehensive Performance Summary")
print("=" * 40)

# --- Dataset overview ---
print("\nDataset Information:")
dataset_info = summary['dataset_info']
print(f"  Test Samples: {dataset_info['test_samples']}")
print(f"  Positive Class Ratio: {dataset_info['positive_class_ratio']:.3f}")
print(f"  Class Distribution: {dataset_info['class_distribution']}")

# --- Best model per metric ---
print("\nBest Performing Models:")
for metric, info in summary['best_performing_models'].items():
    winner = info['model'].replace('_', ' ').title()
    print(f"  {metric.upper()}: {winner} ({info['score']:.4f})")

# --- Recommendations ---
recommendations = summary['recommendations']
overall_best = recommendations['overall_best_model']
print(f"\nOverall Best Model: {overall_best['model'].replace('_', ' ').title()}")
print(f"Overall Score: {overall_best['score']:.4f}")

print("\nUse Case Recommendations:")
for use_case, rec in recommendations['use_case_recommendations'].items():
    print(f"  {use_case.replace('_', ' ').title()}: {rec['model'].replace('_', ' ').title()}")
    print(f"    Reason: {rec['reason']}")

print("\nPerformance Insights:")
for insight in recommendations['performance_insights']:
    print(f"  • {insight}")

# The suggestions heading is printed only when the list is non-empty.
suggestions = recommendations['improvement_suggestions']
if suggestions:
    print("\nImprovement Suggestions:")
    for suggestion in suggestions:
        print(f"  • {suggestion}")

## 11. Save All Results

In [None]:
# Hand the evaluator the JSON path it should write the evaluation results to.
evaluator.save_evaluation_results('../results/model_evaluation/evaluation_metrics.json')

print("All evaluation results saved successfully!")
print("\nGenerated files:")

# Static checklist of the artefacts this notebook reports as generated.
generated_files = (
    "evaluation_metrics.json (comprehensive results)",
    "model_evaluation_summary.txt (human-readable summary)",
    "plots/ directory with all visualizations",
    "individual classification reports",
)
for entry in generated_files:
    print(f"  - {entry}")

## 12. Model Selection Recommendations

Based on the comprehensive evaluation, here are the key findings and recommendations:

In [None]:
# Display final recommendations, derived from the per-model metrics in all_metrics.
print("FINAL MODEL RECOMMENDATIONS")
print("=" * 30)


def _best_by(metric_key):
    """Return the (model_name, metrics) pair in all_metrics with the highest metric_key value."""
    return max(all_metrics.items(), key=lambda entry: entry[1][metric_key])


def _pretty(model_key):
    """Replace underscores with spaces and title-case a model key for display."""
    return model_key.replace('_', ' ').title()


# Find the best model based on different criteria
best_accuracy = _best_by('accuracy')
best_f1 = _best_by('f1_score')
best_roc_auc = _best_by('roc_auc')

# 1. The "overall" pick is the model with the highest F1-score.
f1_name, f1_metrics = best_f1
print("\n1. BEST OVERALL PERFORMANCE:")
print(f"   Model: {_pretty(f1_name)}")
print(f"   F1-Score: {f1_metrics['f1_score']:.4f}")
print("   Reason: Best balance of precision and recall")

# 2. Highest plain accuracy.
acc_name, acc_metrics = best_accuracy
print("\n2. HIGHEST ACCURACY:")
print(f"   Model: {_pretty(acc_name)}")
print(f"   Accuracy: {acc_metrics['accuracy']:.4f}")

# 3. Highest ROC-AUC.
auc_name, auc_metrics = best_roc_auc
print("\n3. BEST PROBABILITY RANKING:")
print(f"   Model: {_pretty(auc_name)}")
print(f"   ROC-AUC: {auc_metrics['roc_auc']:.4f}")
print("   Reason: Best for ranking patients by disease risk")

# 4. Fixed guidance text; the thresholds are printed, not checked against the models.
clinical_notes = (
    "   - For screening (high recall needed): Focus on models with recall > 0.85",
    "   - For diagnosis confirmation (high precision needed): Focus on models with precision > 0.80",
    "   - For risk assessment: Use models with high ROC-AUC (> 0.90)",
)
print("\n4. CLINICAL CONSIDERATIONS:")
for note in clinical_notes:
    print(note)

print("\nEvaluation completed successfully! Check the results directory for detailed outputs.")