# Model Evaluation

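This notebook evaluates a trained computer vision model end to end: it loads the model from MLflow, runs inference on a saved test set, computes classification, detection, and segmentation metrics, visualizes the results, and writes an HTML evaluation report.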

In [None]:
# Install dependencies
!pip install scikit-learn matplotlib seaborn pycocotools

In [None]:
# Import required modules
import mlflow
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from evaluation.metrics.classification_metrics import ClassificationMetrics
from evaluation.metrics.detection_metrics import DetectionMetrics
from evaluation.metrics.segmentation_metrics import SegmentationMetrics
from evaluation.reports.performance_analyzer import PerformanceAnalyzer

## Load Model and Data

Load the trained model and test data for evaluation.

In [None]:
# Fetch the model addressed by the MLflow URI below and put it in
# evaluation mode before any inference is run.
model_uri = "models:/cv_model/1"
model = mlflow.pytorch.load_model(model_uri)
model.eval()

# Read the serialized test set and serve it in fixed-order batches of 32.
# NOTE(review): torch.load unpickles the file, so only point it at a
# trusted location.
test_data = torch.load("/dbfs/path/to/test_data.pt")
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, batch_size=32)

## Initialize Metrics

Set up evaluation metrics for different computer vision tasks.

In [None]:
# One metric calculator per vision task, keyed by task name.
metrics = dict(
    classification=ClassificationMetrics(num_classes=10),
    detection=DetectionMetrics(iou_threshold=0.5),
    segmentation=SegmentationMetrics(num_classes=10),
)

# Analyzer used further down for the per-class breakdown, the error
# analysis, and the final report.
analyzer = PerformanceAnalyzer()

## Generate Predictions

Run inference on test data to generate predictions.

In [None]:
def generate_predictions(model, dataloader):
    """Run ``model`` on every batch of ``dataloader`` with gradients disabled.

    Each batch is unpacked as a two-element ``(inputs, targets)`` pair; the
    model is called on the inputs only.

    Args:
        model: Callable applied to the inputs of each batch.
        dataloader: Iterable yielding two-element batches.

    Returns:
        A ``(predictions, ground_truth)`` tuple holding the model outputs and
        the targets, each joined with ``torch.cat`` in batch order.
    """
    batch_outputs, batch_targets = [], []

    # Inference only: no autograd graph is recorded for the forward passes.
    with torch.no_grad():
        for inputs, targets in dataloader:
            batch_outputs.append(model(inputs))
            batch_targets.append(targets)

    all_outputs = torch.cat(batch_outputs)
    all_targets = torch.cat(batch_targets)
    return all_outputs, all_targets

# Run inference over the whole test loader and keep outputs and targets.
predictions, ground_truth = generate_predictions(model=model, dataloader=test_loader)

## Compute Metrics

Calculate evaluation metrics for the model.

In [None]:
# Compute metrics for each task and print them as "name: value" lines.
results = {}
for task, metric in metrics.items():
    results[task] = metric.compute_metrics(predictions, ground_truth)
    print(f"\n{task.upper()} Metrics:")
    for metric_name, value in results[task].items():
        # Scalars keep the four-decimal format. Values that reject the ".4f"
        # spec (arrays, dicts, strings, None, ...) are printed as-is so one
        # non-scalar entry cannot abort reporting for the remaining tasks.
        try:
            formatted = f"{value:.4f}"
        except (TypeError, ValueError):
            formatted = f"{value}"
        print(f"{metric_name}: {formatted}")

## Generate Visualizations

Create visualizations for model performance analysis.

In [None]:
# Inputs shared by both plots.
classification_metrics = metrics['classification']
class_names = [f'Class {i}' for i in range(10)]
predicted_labels = predictions.argmax(dim=1)

# Confusion matrix: ground truth against the argmax over dim 1 of the outputs.
confusion_matrix = classification_metrics.plot_confusion_matrix(
    ground_truth,
    predicted_labels,
    class_names=list(class_names),
)
plt.show()

# ROC curves: ground truth against the raw model outputs.
roc_curves = classification_metrics.plot_roc_curve(
    ground_truth,
    predictions,
    class_names=list(class_names),
)
plt.show()

## Performance Analysis

Break down model performance by class and inspect the patterns in its errors.

In [None]:
def _show_section(title, table):
    """Print ``title`` and then render ``table`` with the notebook's ``display``."""
    print(title)
    display(table)


# Per-class breakdown of the predictions against the ground truth.
class_performance = analyzer.analyze_by_class(predictions, ground_truth)
_show_section("\nPerformance by Class:", class_performance)

# Error-pattern analysis over the same predictions.
error_analysis = analyzer.analyze_errors(predictions, ground_truth)
_show_section("\nError Analysis:", error_analysis)

## Generate Evaluation Report

Create a comprehensive evaluation report.

In [None]:
# Build the report from the metric results and the two analyses above.
report = analyzer.generate_report(results, class_performance, error_analysis)

# Save the report. The encoding is pinned to UTF-8 so the written file does
# not depend on the locale's default encoding, which can corrupt or reject
# non-ASCII characters in the report text.
report_path = "/dbfs/path/to/evaluation_report.html"
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)
print(f"Evaluation report saved to: {report_path}")