# Model Evaluation and mAP@50 Testing

This notebook evaluates all trained models on the test dataset and reports, for each model, the per-class AP@50, the overall mAP@50, and the mean inference speed (FPS).
Where several folds exist for the same model variant, the results are averaged across those folds.

In [None]:
import os
import glob
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import YOLO and RT-DETR
from ultralytics import YOLO
try:
    from ultralytics import RTDETR
except ImportError:
    print("RT-DETR not available, will handle separately")
    RTDETR = None

# Project locations, all expressed relative to the current working directory
BASE_DIR = Path("../../")
RUNS_DIR = BASE_DIR.joinpath("runs")
TEST_DATASET = BASE_DIR.joinpath("datasets", "roboflow", "test")
DATA_YAML = BASE_DIR.joinpath("datasets", "roboflow", "data.yaml")

# Echo every location so a wrong working directory is noticed before evaluation
for label, location in (
    ("Base directory", BASE_DIR),
    ("Runs directory", RUNS_DIR),
    ("Test dataset", TEST_DATASET),
    ("Data YAML", DATA_YAML),
):
    print(f"{label}: {location}")

In [None]:
def discover_model_weights():
    """
    Discover all trained model weights in the runs directory.

    Scans every ``stage*`` entry under ``RUNS_DIR`` and, inside each one,
    every directory matching ``*fold*``. A fold directory is accepted when it
    contains ``weights/best.pt``. Its name is split on ``'_fold_'``: the text
    before the first separator is the model name, the text after the last
    separator is the fold id.

    Stage and fold directories are visited in sorted order, so the result
    (including the recorded ``'stage'`` and which entry wins when the same
    model/fold appears twice) does not depend on directory listing order.

    For model names containing ``yolo11m`` only fold ``'0'`` is kept.

    Returns:
        tuple: ``(models_info, class_names)``.
            ``models_info`` maps model name -> dict with keys ``'family'``,
            ``'size'``, ``'folds'`` and ``'stage'`` (the first stage in which
            the model was seen). ``'folds'`` maps fold id (str) -> dict with
            ``'weights_path'`` and ``'fold_dir'`` (both str).
            ``class_names`` is the hard-coded list of class labels.
    """
    models_info = {}

    # Define class names
    class_names = ['glass', 'metal', 'organic', 'paper', 'plastic']

    # glob() yields entries in arbitrary order; sorting keeps runs reproducible
    for stage_dir in sorted(RUNS_DIR.glob("stage*")):
        stage_name = stage_dir.name
        print(f"\nScanning {stage_name}:")

        for fold_dir in sorted(stage_dir.glob("*fold*")):
            # The pattern can also match plain files; only directories are runs
            if not fold_dir.is_dir():
                continue

            best_weights = fold_dir / "weights" / "best.pt"
            if not best_weights.exists():
                # Covers both a missing weights/ folder and a missing best.pt
                print(f"  Missing best.pt: {fold_dir.name}")
                continue

            print(f"  Found: {fold_dir.name} -> {best_weights}")

            # Extract model info
            name_parts = fold_dir.name.split('_fold_')
            model_name = name_parts[0]
            fold_num = name_parts[-1]

            # Special handling for yolo11m: only use fold_0 (other folds still training)
            if 'yolo11m' in model_name.lower() and fold_num != '0':
                print(f"  Skipping {fold_dir.name} - using only fold_0 for yolo11m (other folds still training)")
                continue

            # Determine model family and type
            if 'rtdetr' in model_name.lower():
                family = 'RT-DETRv2'
                size = model_name.split('-')[-1].upper()
            else:
                family = 'YOLOv11'
                size = model_name[-1].upper()  # Extract last character (n, s, m, l, x)

            # Create the model entry on first sight, reuse it afterwards
            entry = models_info.setdefault(model_name, {
                'family': family,
                'size': size,
                'folds': {},
                'stage': stage_name,
            })

            # The same model/fold can exist in several stages; the later one
            # (in sorted order) replaces the earlier one, but say so.
            if fold_num in entry['folds']:
                previous_dir = entry['folds'][fold_num]['fold_dir']
                print(f"  Warning: {model_name} fold {fold_num} already found in {previous_dir}; replacing it with {fold_dir}")

            entry['folds'][fold_num] = {
                'weights_path': str(best_weights),
                'fold_dir': str(fold_dir),
            }

    return models_info, class_names

# Scan the runs directory once and list what was found
models_info, class_names = discover_model_weights()
print(f"\nFound {len(models_info)} unique models:")
for name, details in models_info.items():
    fold_count = len(details['folds'])
    variant = f"{details['family']}-{details['size']}"
    print(f"  {name}: {fold_count} folds ({variant})")

In [None]:
import time
import cv2

def evaluate_model(weights_path, model_type='yolo'):
    """
    Evaluate a single model on the test dataset and return per-class mAP@50 and FPS.

    Args:
        weights_path: Path to the weights file; passed to the model constructor.
        model_type: ``'yolo'`` or ``'rtdetr'``; selects which loader is used.

    Returns:
        dict | None: ``None`` when the model type is unknown or unavailable,
        when the validation result exposes no ``box.map50``, or when any
        exception is raised along the way (the error is printed). Otherwise a
        dict with keys ``'overall_map50'``, ``'per_class_map50'`` (one value
        per entry of the module-level ``class_names``, in that order),
        ``'class_names'``, ``'mean_fps'`` and ``'num_fps_images'``.
    """
    try:
        model = _load_model(weights_path, model_type)
        if model is None:
            return None

        print(f"Evaluating {weights_path}...")

        # Run validation on test dataset
        results = model.val(data=str(DATA_YAML), split='test', verbose=False)

        box = getattr(results, 'box', None)
        if box is None or not hasattr(box, 'map50'):
            print(f"Error: Could not extract mAP@50 from results for {weights_path}")
            return None

        overall_map50 = box.map50

        per_class_map50 = _aligned_per_class_ap50(box, len(class_names))
        if per_class_map50 is None:
            print(f"Warning: No per-class mAP@50 available for {weights_path}")
            per_class_map50 = [0.0] * len(class_names)  # Default to zeros

        print(f"  Overall mAP@50: {overall_map50:.4f}")
        print(f"  Per-class mAP@50: {per_class_map50}")

        print("  Measuring FPS...")
        mean_fps, num_fps_images = _measure_fps(model)

        return {
            'overall_map50': overall_map50,
            'per_class_map50': per_class_map50,
            'class_names': class_names,
            'mean_fps': mean_fps,
            'num_fps_images': num_fps_images
        }

    except Exception as e:
        # Notebook-level boundary: one broken checkpoint must not abort the
        # whole sweep, so report the failure and let the caller skip the fold.
        print(f"Error evaluating {weights_path}: {str(e)}")
        return None


def _load_model(weights_path, model_type):
    """Instantiate the model for *model_type*, or print why not and return None."""
    if model_type == 'yolo':
        return YOLO(weights_path)
    if model_type == 'rtdetr':
        if RTDETR is None:
            print(f"RT-DETR not available, skipping {weights_path}")
            return None
        return RTDETR(weights_path)
    print(f"Unknown model type: {model_type}")
    return None


def _aligned_per_class_ap50(box, num_classes):
    """Return AP@50 as a list indexed by class id, or None if unavailable.

    Ultralytics' ``ap50`` holds one value per entry of ``ap_class_index``
    (the classes that were actually evaluated), not one per dataset class.
    Placing each value at its class index keeps columns aligned with the
    class list and gives every fold the same length; classes without a
    value stay at 0.0.
    """
    ap50 = getattr(box, 'ap50', None)
    if ap50 is None:
        return None

    # ap50 may be a NumPy array or (when nothing was evaluated) a plain list
    values = np.asarray(ap50, dtype=float).ravel()

    class_index = getattr(box, 'ap_class_index', None)
    if class_index is None or len(class_index) != len(values):
        # No usable index information: keep the values in the order reported
        return values.tolist()

    aligned = [0.0] * num_classes
    for class_id, value in zip(class_index, values):
        class_id = int(class_id)
        if 0 <= class_id < num_classes:
            aligned[class_id] = float(value)
    return aligned


def _measure_fps(model, max_images=50, warmup_images=3):
    """Time single-image ``predict`` calls; return ``(mean_fps, images_timed)``."""
    test_images_dir = TEST_DATASET / "images"
    # Sorted so the same subset is timed on every run
    test_images = sorted(
        list(test_images_dir.glob("*.jpg")) + list(test_images_dir.glob("*.png"))
    )

    # Use a subset of test images for FPS measurement (to save time)
    fps_test_images = test_images[:max_images]
    if not fps_test_images:
        print("  Warning: No test images found for FPS measurement")
        return 0.0, 0

    # Warm up the model (first few predictions are usually slower)
    for warmup_img in fps_test_images[:warmup_images]:
        model.predict(str(warmup_img), verbose=False)

    # perf_counter is monotonic and high-resolution, unlike wall-clock time()
    inference_times = []
    for img_path in fps_test_images:
        start_time = time.perf_counter()
        model.predict(str(img_path), verbose=False)
        inference_times.append(time.perf_counter() - start_time)

    mean_inference_time = np.mean(inference_times)
    mean_fps = 1.0 / mean_inference_time if mean_inference_time > 0 else 0.0

    print(f"  Mean FPS: {mean_fps:.2f} ({len(fps_test_images)} images)")
    return mean_fps, len(fps_test_images)

In [None]:
# Run every discovered checkpoint through evaluate_model and gather the outcomes
evaluation_results = {}
banner = '=' * 60

for model_name, model_info in models_info.items():
    print(f"\n{banner}")
    print(f"Evaluating {model_name} ({model_info['family']}-{model_info['size']})")
    print(banner)

    # Anything whose name does not mention rtdetr is evaluated as a YOLO model
    model_type = 'yolo'
    if 'rtdetr' in model_name.lower():
        model_type = 'rtdetr'

    # Keep only the folds whose evaluation produced a result
    fold_results = {}
    for fold_num, fold_info in model_info['folds'].items():
        result = evaluate_model(fold_info['weights_path'], model_type)
        if result is None:
            print(f"  Failed to evaluate fold {fold_num}")
            continue
        fold_results[fold_num] = result

    # Record the model even when no fold succeeded (fold_results may be empty)
    evaluation_results[model_name] = dict(
        family=model_info['family'],
        size=model_info['size'],
        fold_results=fold_results,
        stage=model_info['stage'],
    )

    print(f"Completed {model_name}: {len(fold_results)} successful evaluations")

print(f"\n{banner}")
print("EVALUATION COMPLETE")
print(banner)
print(f"Total models evaluated: {len(evaluation_results)}")
for model_name, results in evaluation_results.items():
    print(f"  {model_name}: {len(results['fold_results'])} folds")

In [None]:
# Aggregate the per-fold metrics of each model into mean and standard deviation
summary_results = []

for model_name, results in evaluation_results.items():
    fold_metrics = list(results['fold_results'].values())
    n_folds = len(fold_metrics)
    if n_folds == 0:
        print(f"No results for {model_name}, skipping...")
        continue

    # One entry per fold; the per-class matrix is expected to be (n_folds, n_classes)
    per_class_array = np.array([fold['per_class_map50'] for fold in fold_metrics])
    overall_array = np.array([fold['overall_map50'] for fold in fold_metrics])
    fps_array = np.array([fold.get('mean_fps', 0.0) for fold in fold_metrics])

    mean_per_class = per_class_array.mean(axis=0)
    mean_overall = overall_array.mean()
    mean_fps = fps_array.mean()

    # A spread is only meaningful with more than one fold; otherwise report zero
    if n_folds > 1:
        std_per_class = per_class_array.std(axis=0)
        std_overall = overall_array.std()
        std_fps = fps_array.std()
    else:
        std_per_class = np.zeros_like(mean_per_class)
        std_overall = 0.0
        std_fps = 0.0

    summary_results.append({
        'model_name': model_name,
        'family': results['family'],
        'size': results['size'],
        'n_folds': n_folds,
        'mean_overall_map50': mean_overall,
        'std_overall_map50': std_overall,
        'mean_per_class_map50': mean_per_class.tolist(),
        'std_per_class_map50': std_per_class.tolist(),
        'mean_fps': mean_fps,
        'std_fps': std_fps,
        'stage': results['stage'],
    })

    per_class_text = [f'{m:.4f}±{s:.4f}' for m, s in zip(mean_per_class, std_per_class)]
    print(f"{model_name} ({results['family']}-{results['size']}):")
    print(f"  Overall mAP@50: {mean_overall:.4f} ± {std_overall:.4f} ({n_folds} folds)")
    print(f"  Mean FPS: {mean_fps:.2f} ± {std_fps:.2f}")
    print(f"  Per-class mAP@50: {per_class_text}")

print(f"\nSummary complete for {len(summary_results)} models")

In [None]:
# Display performance summary table (mAP@50 and FPS)
def display_performance_summary(summary_results):
    """Print a fixed-width table of overall mAP@50 and FPS, one row per model.

    Rows are ordered by family name, then by size (N, S, M, L, X; any other
    size goes last). A standard deviation is appended to a value only when
    the model has more than one fold and the deviation exceeds a small
    threshold (0.001 for mAP@50, 0.1 for FPS). Single-fold models are
    flagged with ``*`` in the fold column.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    size_rank = {'N': 0, 'S': 1, 'M': 2, 'L': 3, 'X': 4}

    print("\n" + heavy_rule)
    print("MODEL PERFORMANCE SUMMARY")
    print(heavy_rule)
    print(f"{'Model':<20} {'Family':<12} {'Size':<6} {'mAP@50':<12} {'FPS':<12} {'Folds':<6}")
    print(light_rule)

    ordered = sorted(
        summary_results,
        key=lambda entry: (entry['family'], size_rank.get(entry['size'], 999)),
    )

    for entry in ordered:
        n_folds = entry['n_folds']
        has_several_folds = n_folds > 1

        # Accuracy column: mean, plus spread when it is worth showing
        map50_str = f"{entry['mean_overall_map50']:.3f}"
        if has_several_folds and entry['std_overall_map50'] > 0.001:
            map50_str += f"±{entry['std_overall_map50']:.3f}"

        # Speed column: same rule with a coarser threshold and precision
        fps_str = f"{entry['mean_fps']:.1f}"
        if has_several_folds and entry['std_fps'] > 0.1:
            fps_str += f"±{entry['std_fps']:.1f}"

        fold_str = f"{n_folds}*" if n_folds == 1 else f"{n_folds}"

        print(
            f"{entry['model_name']:<20} {entry['family']:<12} {entry['size']:<6} "
            f"{map50_str:<12} {fps_str:<12} {fold_str:<6}"
        )

    print(light_rule)
    print("* Models evaluated on single fold only")
    print(heavy_rule)

# Print the per-model table (overall mAP@50 and FPS) from the aggregated summary
display_performance_summary(summary_results)

In [None]:
# Generate LaTeX table
def generate_latex_table(summary_results, class_names):
    """Generate LaTeX table with per-class mAP@50 results including mean column.

    Rows are sorted by family name and then by size (n, s, m, l, x; any other
    size goes last). A midrule is emitted between consecutive rows whenever
    the family changes, so each family forms its own visual group regardless
    of which family sorts first.

    Args:
        summary_results: Iterable of dicts with keys ``'family'``, ``'size'``,
            ``'n_folds'``, ``'mean_per_class_map50'`` and
            ``'std_per_class_map50'``.
        class_names: Class labels; one column is created per label.

    Returns:
        str: The complete ``table*`` environment, lines joined by newlines.
        Single-fold models are marked with ``*``. The last column is the mean
        of the row's per-class values, or ``--`` if the row has none.
    """
    size_order = {'N': 0, 'S': 1, 'M': 2, 'L': 3, 'X': 4}

    def sort_key(result):
        return (result['family'], size_order.get(result['size'].upper(), 999))

    sorted_results = sorted(summary_results, key=sort_key)

    # Table preamble
    latex_table = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\caption{Per-class AP@50 for each model variant. Results are averaged over multiple folds. \\\\",
        "Models marked with * were evaluated only on one fold.}",
        "\\label{tab:map50_per_class}",
        "\\begin{tabular}{lll" + "c" * len(class_names) + "c}",
        "\\toprule",
    ]

    # Header
    header_cells = ["\\textbf{Model}", "\\textbf{Family}", "\\textbf{Size}"]
    header_cells += [f"\\textbf{{{class_name.capitalize()}}}" for class_name in class_names]
    header_cells.append("\\textbf{mAP@50}")
    latex_table.append(" & ".join(header_cells) + " \\\\")
    latex_table.append("\\midrule")

    # Data rows, with a separator each time the family changes
    previous_family = None
    for result in sorted_results:
        family = result['family']
        size = result['size'].lower()
        n_folds = result['n_folds']

        if previous_family is not None and family != previous_family:
            latex_table.append("\\midrule")
        previous_family = family

        model_display = f"{family}-{size}"
        if n_folds == 1:
            model_display += "*"

        cells = [model_display, family, size]

        per_class_values = []
        for mean_map, std_map in zip(result['mean_per_class_map50'], result['std_per_class_map50']):
            per_class_values.append(mean_map)
            # Show the spread only when several folds exist and it is not negligible
            if n_folds > 1 and std_map > 0.001:
                cells.append(f"{mean_map:.2f}$\\pm${std_map:.2f}")
            else:
                cells.append(f"{mean_map:.2f}")

        if per_class_values:
            row_mean = sum(per_class_values) / len(per_class_values)
            cells.append(f"\\textbf{{{row_mean:.3f}}}")
        else:
            # No per-class data: use a placeholder instead of dividing by zero
            cells.append("--")

        latex_table.append(" & ".join(cells) + " \\\\")

    latex_table.append("\\bottomrule")
    latex_table.append("\\end{tabular}%")
    latex_table.append("\\end{table*}")

    return "\n".join(latex_table)

# Build the LaTeX table and print it framed by two separator rules
print("\nGenerating LaTeX table...")
latex_table = generate_latex_table(summary_results, class_names)
print("=" * 80, latex_table, "=" * 80, sep="\n")
