# Model Evaluation and mAP@50 Testing

This notebook evaluates all trained models on the test dataset and computes mAP@50 scores for each class.
The results are averaged across folds where multiple folds exist for the same model variant.

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import YOLO and RT-DETR
from ultralytics import YOLO
try:
    from ultralytics import RTDETR
except ImportError:
    print("RT-DETR not available, will handle separately")
    RTDETR = None

# Set up paths
BASE_DIR = Path("../../")
RUNS_DIR = BASE_DIR / "runs"
TEST_DATASET = BASE_DIR / "datasets" / "roboflow" / "test"
DATA_YAML = BASE_DIR / "datasets" / "roboflow" / "data.yaml"

print(f"Base directory: {BASE_DIR}")
print(f"Runs directory: {RUNS_DIR}")
print(f"Test dataset: {TEST_DATASET}")
print(f"Data YAML: {DATA_YAML}")

Base directory: ../..
Runs directory: ../../runs
Test dataset: ../../datasets/roboflow/test
Data YAML: ../../datasets/roboflow/data.yaml


In [2]:
def discover_model_weights():
    """
    Discover all trained model weights in the runs directory
    Returns a dictionary with model information
    """
    models_info = {}
    
    # Define class names
    class_names = ['glass', 'metal', 'organic', 'paper', 'plastic']
    
    # Scan all stage directories
    for stage_dir in RUNS_DIR.glob("stage*"):
        print(f"\nScanning {stage_dir.name}:")
        
        # Extract stage info
        stage_name = stage_dir.name
        
        # Find all fold directories in this stage
        fold_dirs = list(stage_dir.glob("*fold*"))
        
        for fold_dir in fold_dirs:
            weights_dir = fold_dir / "weights"
            if weights_dir.exists():
                # Find best.pt file
                best_weights = weights_dir / "best.pt"
                if best_weights.exists():
                    print(f"  Found: {fold_dir.name} -> {best_weights}")
                    
                    # Extract model info
                    model_name = fold_dir.name.split('_fold_')[0]
                    fold_num = fold_dir.name.split('_fold_')[-1]

                    # todo: togliere queste righe quando yolo11m finisce di allenarsi
                    # Special handling for yolo11m: only use fold_0 (other folds still training)
                    if 'yolo11m' in model_name.lower() and fold_num != '0':
                        print(f"  Skipping {fold_dir.name} - using only fold_0 for yolo11m (other folds still training)")
                        continue
                    
                    # Determine model family and type
                    if 'rtdetr' in model_name.lower():
                        family = 'RT-DETRv2'
                        size = model_name.split('-')[-1].upper()
                    else:
                        family = 'YOLOv11'
                        size = model_name[-1].upper()  # Extract last character (n, s, m, l, x)
                    
                    # Store model info
                    if model_name not in models_info:
                        models_info[model_name] = {
                            'family': family,
                            'size': size,
                            'folds': {},
                            'stage': stage_name
                        }
                    
                    models_info[model_name]['folds'][fold_num] = {
                        'weights_path': str(best_weights),
                        'fold_dir': str(fold_dir)
                    }
                else:
                    print(f"  Missing best.pt: {fold_dir.name}")
    
    return models_info, class_names

# Discover all models
models_info, class_names = discover_model_weights()
print(f"\nFound {len(models_info)} unique models:")
for model_name, info in models_info.items():
    print(f"  {model_name}: {len(info['folds'])} folds ({info['family']}-{info['size']})")


Scanning stage2_yolo11m_k_fold_cv_augmented:
  Found: yolo11m_fold_0 -> ../../runs/stage2_yolo11m_k_fold_cv_augmented/yolo11m_fold_0/weights/best.pt
  Found: yolo11m_fold_1 -> ../../runs/stage2_yolo11m_k_fold_cv_augmented/yolo11m_fold_1/weights/best.pt
  Skipping yolo11m_fold_1 - using only fold_0 for yolo11m (other folds still training)
  Found: yolo11m_fold_2 -> ../../runs/stage2_yolo11m_k_fold_cv_augmented/yolo11m_fold_2/weights/best.pt
  Skipping yolo11m_fold_2 - using only fold_0 for yolo11m (other folds still training)

Scanning stage3_yolo11l_fold0_subsampled:
  Found: yolo11l_fold_0 -> ../../runs/stage3_yolo11l_fold0_subsampled/yolo11l_fold_0/weights/best.pt

Scanning stage1_yolo11n_k_fold_cv:
  Found: yolo11n_fold_3 -> ../../runs/stage1_yolo11n_k_fold_cv/yolo11n_fold_3/weights/best.pt
  Found: yolo11n_fold_2 -> ../../runs/stage1_yolo11n_k_fold_cv/yolo11n_fold_2/weights/best.pt
  Found: yolo11n_fold_1 -> ../../runs/stage1_yolo11n_k_fold_cv/yolo11n_fold_1/weights/best.pt
  Foun

In [None]:
import time
import cv2

def evaluate_model(weights_path, model_type='yolo'):
    """
    Evaluate a single model on the test dataset and return per-class mAP@50 and FPS
    """
    try:
        if model_type == 'yolo':
            # Load YOLO model
            model = YOLO(weights_path)
        elif model_type == 'rtdetr':
            # Load RT-DETR model
            if RTDETR is None:
                print(f"RT-DETR not available, skipping {weights_path}")
                return None
            model = RTDETR(weights_path)
        else:
            print(f"Unknown model type: {model_type}")
            return None
        
        print(f"Evaluating {weights_path}...")
        
        # Run validation on test dataset
        results = model.val(data=str(DATA_YAML), split='test', verbose=False)
        
        # Extract per-class mAP@50, P and R
        if hasattr(results, 'box') and hasattr(results.box, 'map50') and hasattr(results.box, 'p') and hasattr(results.box, 'r'):
            # Overall mAP@50
            overall_map50 = results.box.map50
            overall_P = results.box.p
            overall_R = results.box.r
            
            # Per-class mAP@50
            if hasattr(results.box, 'ap50') and results.box.ap50 is not None:
                per_class_map50 = results.box.ap50.tolist()
            else:
                print(f"Warning: No per-class mAP@50 available for {weights_path}")
                per_class_map50 = [0.0] * 5  # Default to zeros
            
            print(f"  Overall mAP@50: {overall_map50:.4f}")
            print(f"  Per-class mAP@50: {per_class_map50}")
            print(f"  Overall P: {overall_P:.4f}, Overall R: {overall_R:.4f}")
            
        else:
            print(f"Error: Could not extract mAP@50 from results for {weights_path}")
            return None
        
        # Measure FPS on test images
        print(f"  Measuring FPS...")
        test_images_dir = TEST_DATASET / "images"
        test_images = list(test_images_dir.glob("*.jpg")) + list(test_images_dir.glob("*.png"))
        
        # Use a subset of test images for FPS measurement (to save time)
        fps_test_images = test_images[:min(50, len(test_images))]  # Use max 50 images
        
        if len(fps_test_images) == 0:
            print(f"  Warning: No test images found for FPS measurement")
            mean_fps = 0.0
        else:
            # Warm up the model (first few predictions are usually slower)
            if len(fps_test_images) >= 3:
                for warmup_img in fps_test_images[:3]:
                    _ = model.predict(str(warmup_img), verbose=False)
            
            # Measure inference time
            inference_times = []
            
            for img_path in fps_test_images:
                start_time = time.time()
                _ = model.predict(str(img_path), verbose=False)
                end_time = time.time()
                inference_times.append(end_time - start_time)
            
            # Calculate mean FPS
            mean_inference_time = np.mean(inference_times)
            mean_fps = 1.0 / mean_inference_time if mean_inference_time > 0 else 0.0
            
            print(f"  Mean FPS: {mean_fps:.2f} ({len(fps_test_images)} images)")
        
        return {
            'overall_P': overall_P,
            'overall_R': overall_R,
            'overall_map50': overall_map50,
            'per_class_map50': per_class_map50,
            'class_names': class_names,
            'mean_fps': mean_fps,
            'num_fps_images': len(fps_test_images) if 'fps_test_images' in locals() else 0
        }
            
    except Exception as e:
        print(f"Error evaluating {weights_path}: {str(e)}")
        return None

In [6]:
# Evaluate all models and collect results
evaluation_results = {}

for model_name, model_info in models_info.items():
    print(f"\n{'='*60}")
    print(f"Evaluating {model_name} ({model_info['family']}-{model_info['size']})")
    print(f"{'='*60}")
    
    # Determine model type
    model_type = 'rtdetr' if 'rtdetr' in model_name.lower() else 'yolo'
    
    fold_results = {}
    
    # Evaluate each fold
    for fold_num, fold_info in model_info['folds'].items():
        weights_path = fold_info['weights_path']
        
        # Evaluate this fold
        result = evaluate_model(weights_path, model_type)
        
        if result is not None:
            fold_results[fold_num] = result
        else:
            print(f"  Failed to evaluate fold {fold_num}")
    
    # Store results for this model
    evaluation_results[model_name] = {
        'family': model_info['family'],
        'size': model_info['size'],
        'fold_results': fold_results,
        'stage': model_info['stage']
    }
    
    print(f"Completed {model_name}: {len(fold_results)} successful evaluations")

print(f"\n{'='*60}")
print(f"EVALUATION COMPLETE")
print(f"{'='*60}")
print(f"Total models evaluated: {len(evaluation_results)}")
for model_name, results in evaluation_results.items():
    print(f"  {model_name}: {len(results['fold_results'])} folds")


Evaluating yolo11m (YOLOv11-M)
Evaluating ../../runs/stage2_yolo11m_k_fold_cv_augmented/yolo11m_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11m summary (fused): 125 layers, 20,033,887 parameters, 0 gradients, 67.7 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 997.8±910.4 MB/s, size: 22.9 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [01:14<00:00,  1.08s/it]


                   all       1099       1622      0.839      0.773      0.825       0.62
Speed: 0.6ms preprocess, 37.6ms inference, 0.0ms loss, 6.7ms postprocess per image
Results saved to [1mruns/detect/val84[0m
  Overall mAP@50: 0.8249
  Per-class mAP@50: [0.8936417429808664, 0.9120692754764765, 0.7414633482770655, 0.9349313183815487, 0.6423306064263941]
  Measuring FPS...
  Mean FPS: 15.75 (50 images)
Completed yolo11m: 1 successful evaluations

Evaluating yolo11l (YOLOv11-L)
Evaluating ../../runs/stage3_yolo11l_fold0_subsampled/yolo11l_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11l summary (fused): 190 layers, 25,283,167 parameters, 0 gradients, 86.6 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 432.2±180.8 MB/s, size: 6.3 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [01:20<00:00,  1.16s/it]


                   all       1099       1622      0.831      0.758      0.809      0.609
Speed: 0.6ms preprocess, 45.2ms inference, 0.0ms loss, 6.4ms postprocess per image
Results saved to [1mruns/detect/val85[0m
  Overall mAP@50: 0.8086
  Per-class mAP@50: [0.841748902912257, 0.9206058394155782, 0.7249234944608487, 0.9426800391274945, 0.6128786899947374]
  Measuring FPS...
  Mean FPS: 24.23 (50 images)
Completed yolo11l: 1 successful evaluations

Evaluating yolo11n (YOLOv11-N)
Evaluating ../../runs/stage2_yolo11n_k_fold_cv_augmented/yolo11n_fold_3/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1198.6±978.3 MB/s, size: 35.7 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:21<00:00,  3.26it/s]


                   all       1099       1622      0.849      0.752      0.807      0.587
Speed: 0.5ms preprocess, 4.7ms inference, 0.0ms loss, 3.8ms postprocess per image
Results saved to [1mruns/detect/val86[0m
  Overall mAP@50: 0.8072
  Per-class mAP@50: [0.8732610430756309, 0.912945488365606, 0.6916097160320844, 0.9440998554135069, 0.6138801161620073]
  Measuring FPS...
  Mean FPS: 31.89 (50 images)
Evaluating ../../runs/stage2_yolo11n_k_fold_cv_augmented/yolo11n_fold_2/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 682.7±177.8 MB/s, size: 10.0 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:21<00:00,  3.24it/s]


                   all       1099       1622      0.836      0.754      0.811      0.596
Speed: 0.4ms preprocess, 4.8ms inference, 0.0ms loss, 5.0ms postprocess per image
Results saved to [1mruns/detect/val87[0m
  Overall mAP@50: 0.8109
  Per-class mAP@50: [0.9114827121874811, 0.9150705882819365, 0.720994443953203, 0.9165631041429649, 0.5903758005719946]
  Measuring FPS...
  Mean FPS: 32.02 (50 images)
Evaluating ../../runs/stage2_yolo11n_k_fold_cv_augmented/yolo11n_fold_1/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 586.7±342.5 MB/s, size: 7.3 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:41<00:00,  1.67it/s]


                   all       1099       1622      0.832      0.753      0.798      0.582
Speed: 0.6ms preprocess, 6.3ms inference, 0.0ms loss, 8.0ms postprocess per image
Results saved to [1mruns/detect/val88[0m
  Overall mAP@50: 0.7977
  Per-class mAP@50: [0.8654187509980696, 0.9009492954346198, 0.6852179156701024, 0.9359716378917637, 0.6011506361910643]
  Measuring FPS...
  Mean FPS: 32.87 (50 images)
Evaluating ../../runs/stage2_yolo11n_k_fold_cv_augmented/yolo11n_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 966.9±599.0 MB/s, size: 24.9 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:39<00:00,  1.75it/s]


                   all       1099       1622      0.815      0.774      0.794      0.578
Speed: 0.6ms preprocess, 6.6ms inference, 0.0ms loss, 8.5ms postprocess per image
Results saved to [1mruns/detect/val89[0m
  Overall mAP@50: 0.7939
  Per-class mAP@50: [0.8343910946579143, 0.8717068531703214, 0.7104386395151036, 0.9404710635676123, 0.6127285948747878]
  Measuring FPS...
  Mean FPS: 32.73 (50 images)
Evaluating ../../runs/stage2_yolo11n_k_fold_cv_augmented/yolo11n_fold_4/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1132.2±1039.9 MB/s, size: 29.4 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:39<00:00,  1.75it/s]


                   all       1099       1622      0.815      0.772      0.812      0.594
Speed: 0.6ms preprocess, 6.8ms inference, 0.0ms loss, 8.4ms postprocess per image
Results saved to [1mruns/detect/val90[0m
  Overall mAP@50: 0.8118
  Per-class mAP@50: [0.8576248476834922, 0.9105992750155206, 0.7122822609866117, 0.9323209463256196, 0.645978012365353]
  Measuring FPS...
  Mean FPS: 33.00 (50 images)
Completed yolo11n: 5 successful evaluations

Evaluating rtdetr-l (RT-DETRv2-L)
Evaluating ../../runs/stage4_rtdetr-l_fold0_augmented/rtdetr-l_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
rt-detr-l summary: 302 layers, 31,994,015 parameters, 0 gradients, 103.5 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1041.5±717.4 MB/s, size: 24.8 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [01:51<00:00,  1.62s/it]


                   all       1099       1622      0.829      0.699      0.754      0.549
Speed: 0.8ms preprocess, 67.4ms inference, 0.0ms loss, 1.9ms postprocess per image
Results saved to [1mruns/detect/val91[0m
  Overall mAP@50: 0.7537
  Per-class mAP@50: [0.8306480484253883, 0.8587076079492546, 0.650119978828882, 0.9142512612524027, 0.5149256486878995]
  Measuring FPS...
  Mean FPS: 9.51 (50 images)
Completed rtdetr-l: 1 successful evaluations

Evaluating rtdetr-x (RT-DETRv2-X)
Evaluating ../../runs/stage4_rtdetr-x_fold0_augmented/rtdetr-x_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
rt-detr-x summary: 373 layers, 65,477,711 parameters, 0 gradients, 222.5 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 705.6±491.2 MB/s, size: 9.9 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [02:39<00:00,  2.31s/it]


                   all       1099       1622      0.799      0.711       0.74       0.53
Speed: 0.8ms preprocess, 112.1ms inference, 0.0ms loss, 1.4ms postprocess per image
Results saved to [1mruns/detect/val92[0m
  Overall mAP@50: 0.7399
  Per-class mAP@50: [0.8469243364084952, 0.8463501917738496, 0.6534918123387711, 0.868694725134842, 0.4842382013152916]
  Measuring FPS...
  Mean FPS: 6.26 (50 images)
Completed rtdetr-x: 1 successful evaluations

Evaluating yolo11s (YOLOv11-S)
Evaluating ../../runs/stage2_yolo11s_k_fold_cv_augmented/yolo11s_fold_4/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11s summary (fused): 100 layers, 9,414,735 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 491.9±84.8 MB/s, size: 7.7 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:48<00:00,  1.41it/s]


                   all       1099       1622      0.865      0.748      0.814      0.609
Speed: 0.7ms preprocess, 15.6ms inference, 0.0ms loss, 8.0ms postprocess per image
Results saved to [1mruns/detect/val93[0m
  Overall mAP@50: 0.8136
  Per-class mAP@50: [0.8798861780450407, 0.9243683408414906, 0.717796044823451, 0.9322669995606471, 0.6136884808705939]
  Measuring FPS...
  Mean FPS: 23.86 (50 images)
Evaluating ../../runs/stage2_yolo11s_k_fold_cv_augmented/yolo11s_fold_3/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11s summary (fused): 100 layers, 9,414,735 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 788.7±254.9 MB/s, size: 10.3 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:47<00:00,  1.44it/s]


                   all       1099       1622      0.825      0.771      0.819      0.615
Speed: 0.6ms preprocess, 15.8ms inference, 0.0ms loss, 8.1ms postprocess per image
Results saved to [1mruns/detect/val94[0m
  Overall mAP@50: 0.8186
  Per-class mAP@50: [0.8954202688019743, 0.9006918101993759, 0.7415182032031533, 0.9239055729922252, 0.6314879766523191]
  Measuring FPS...
  Mean FPS: 25.53 (50 images)
Evaluating ../../runs/stage2_yolo11s_k_fold_cv_augmented/yolo11s_fold_2/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11s summary (fused): 100 layers, 9,414,735 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 692.3±225.2 MB/s, size: 11.2 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:50<00:00,  1.38it/s]


                   all       1099       1622      0.813      0.781       0.82      0.615
Speed: 0.7ms preprocess, 15.6ms inference, 0.0ms loss, 7.6ms postprocess per image
Results saved to [1mruns/detect/val95[0m
  Overall mAP@50: 0.8203
  Per-class mAP@50: [0.9062221311876116, 0.9022015544422646, 0.7267870715335883, 0.9322952006956019, 0.6339653270545093]
  Measuring FPS...
  Mean FPS: 26.07 (50 images)
Evaluating ../../runs/stage2_yolo11s_k_fold_cv_augmented/yolo11s_fold_1/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11s summary (fused): 100 layers, 9,414,735 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1919.2±1863.2 MB/s, size: 60.5 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:49<00:00,  1.38it/s]


                   all       1099       1622      0.839      0.774      0.818      0.615
Speed: 0.7ms preprocess, 15.6ms inference, 0.0ms loss, 7.0ms postprocess per image
Results saved to [1mruns/detect/val96[0m
  Overall mAP@50: 0.8181
  Per-class mAP@50: [0.8773070016038595, 0.9167832625275951, 0.7433782994646206, 0.9467486812539049, 0.6062292122120075]
  Measuring FPS...
  Mean FPS: 26.88 (50 images)
Evaluating ../../runs/stage2_yolo11s_k_fold_cv_augmented/yolo11s_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11s summary (fused): 100 layers, 9,414,735 parameters, 0 gradients, 21.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1068.0±932.0 MB/s, size: 32.4 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [00:25<00:00,  2.72it/s]


                   all       1099       1622      0.854      0.754       0.82      0.611
Speed: 0.4ms preprocess, 10.0ms inference, 0.0ms loss, 3.3ms postprocess per image
Results saved to [1mruns/detect/val97[0m
  Overall mAP@50: 0.8197
  Per-class mAP@50: [0.9022776775784258, 0.8944060805643207, 0.7323139333758623, 0.9514663915678255, 0.6179617212106178]
  Measuring FPS...
  Mean FPS: 40.46 (50 images)
Completed yolo11s: 5 successful evaluations

Evaluating yolo11x (YOLOv11-X)
Evaluating ../../runs/stage3_yolo11x_fold0_subsampled/yolo11x_fold_0/weights/best.pt...
Ultralytics 8.3.167 🚀 Python-3.11.0rc1 torch-2.7.1+cu126 CUDA:0 (Tesla T4, 14914MiB)
YOLO11x summary (fused): 190 layers, 56,832,799 parameters, 0 gradients, 194.4 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 809.0±337.1 MB/s, size: 10.2 KB)


[34m[1mval: [0mScanning /home/andrea/work/AI-waste-detection/datasets/roboflow/test/labels.cache... 1099 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1099/1099 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 69/69 [01:33<00:00,  1.36s/it]


                   all       1099       1622       0.84      0.749      0.811       0.61
Speed: 0.4ms preprocess, 70.0ms inference, 0.0ms loss, 3.5ms postprocess per image
Results saved to [1mruns/detect/val98[0m
  Overall mAP@50: 0.8110
  Per-class mAP@50: [0.8551476679619787, 0.9066016957685561, 0.7284465573052452, 0.9187055050620077, 0.6463144575577993]
  Measuring FPS...
  Mean FPS: 8.34 (50 images)
Completed yolo11x: 1 successful evaluations

EVALUATION COMPLETE
Total models evaluated: 7
  yolo11m: 1 folds
  yolo11l: 1 folds
  yolo11n: 5 folds
  rtdetr-l: 1 folds
  rtdetr-x: 1 folds
  yolo11s: 5 folds
  yolo11x: 1 folds


In [7]:
# Calculate average and standard deviation across folds
summary_results = []

for model_name, results in evaluation_results.items():
    if len(results['fold_results']) == 0:
        print(f"No results for {model_name}, skipping...")
        continue
    
    # Collect per-class mAP@50 values across all folds
    all_per_class_maps = []
    all_overall_maps = []
    all_fps = []
    all_Ps = []
    all_Rs = []
    
    for fold_num, fold_result in results['fold_results'].items():
        all_per_class_maps.append(fold_result['per_class_map50'])
        all_overall_maps.append(fold_result['overall_map50'])
        all_fps.append(fold_result.get('mean_fps', 0.0))
        all_Ps.append(fold_result.get('overall_P', 0.0))
        all_Rs.append(fold_result.get('overall_R', 0.0))
    
    # Convert to numpy array for easier calculation
    per_class_array = np.array(all_per_class_maps)  # Shape: (n_fooverall_P = lds, n_classes)
    overall_array = np.array(all_overall_maps)
    fps_array = np.array(all_fps)
    overall_P_array = np.array(all_Ps)
    overall_R_array = np.array(all_Rs)
    
    # Calculate mean and std for each class
    mean_per_class = np.mean(per_class_array, axis=0)
    std_per_class = np.std(per_class_array, axis=0) if len(all_per_class_maps) > 1 else np.zeros_like(mean_per_class)
    
    # Calculate mean and std for overall mAP@50
    mean_overall = np.mean(overall_array)
    std_overall = np.std(overall_array) if len(all_overall_maps) > 1 else 0.0
    
    # Calculate mean and std for FPS
    mean_fps = np.mean(fps_array)
    std_fps = np.std(fps_array) if len(all_fps) > 1 else 0.0

    # Calculate mean and std for overall P and R
    mean_P = np.mean(overall_P_array)
    std_P = np.std(overall_P_array) if len(all_Ps) > 1 else 0.0
    mean_R = np.mean(overall_R_array)
    std_R = np.std(overall_R_array) if len(all_Rs) > 1 else 0.0
    
    # Create summary entry
    summary_entry = {
        'model_name': model_name,
        'family': results['family'],
        'size': results['size'],
        'n_folds': len(results['fold_results']),
        'mean_P': mean_P,
        'std_P': std_P,
        'mean_R': mean_R,
        'std_R': std_R,
        'mean_overall_map50': mean_overall,
        'std_overall_map50': std_overall,
        'mean_per_class_map50': mean_per_class.tolist(),
        'std_per_class_map50': std_per_class.tolist(),
        'mean_fps': mean_fps,
        'std_fps': std_fps,
        'stage': results['stage']
    }
    
    summary_results.append(summary_entry)
    
    print(f"{model_name} ({results['family']}-{results['size']}):")
    print(f"  Overall mAP@50: {mean_overall:.4f} ± {std_overall:.4f} ({len(results['fold_results'])} folds)")
    print(f"  Mean FPS: {mean_fps:.2f} ± {std_fps:.2f}")
    print(f"  Per-class mAP@50: {[f'{m:.4f}±{s:.4f}' for m, s in zip(mean_per_class, std_per_class)]}")
    print(f"  Overall P: {mean_P:.4f} ± {std_P:.4f}")
    print(f"  Overall R: {mean_R:.4f} ± {std_R:.4f}")

print(f"\nSummary complete for {len(summary_results)} models")

yolo11m (YOLOv11-M):
  Overall mAP@50: 0.8249 ± 0.0000 (1 folds)
  Mean FPS: 15.75 ± 0.00
  Per-class mAP@50: ['0.8936±0.0000', '0.9121±0.0000', '0.7415±0.0000', '0.9349±0.0000', '0.6423±0.0000']
  Overall P: 0.8387 ± 0.0000
  Overall R: 0.7730 ± 0.0000
yolo11l (YOLOv11-L):
  Overall mAP@50: 0.8086 ± 0.0000 (1 folds)
  Mean FPS: 24.23 ± 0.00
  Per-class mAP@50: ['0.8417±0.0000', '0.9206±0.0000', '0.7249±0.0000', '0.9427±0.0000', '0.6129±0.0000']
  Overall P: 0.8313 ± 0.0000
  Overall R: 0.7577 ± 0.0000
yolo11n (YOLOv11-N):
  Overall mAP@50: 0.8043 ± 0.0072 (5 folds)
  Mean FPS: 32.50 ± 0.46
  Per-class mAP@50: ['0.8684±0.0251', '0.9023±0.0160', '0.7041±0.0135', '0.9339±0.0095', '0.6128±0.0187']
  Overall P: 0.8294 ± 0.0753
  Overall R: 0.7608 ± 0.1396
rtdetr-l (RT-DETRv2-L):
  Overall mAP@50: 0.7537 ± 0.0000 (1 folds)
  Mean FPS: 9.51 ± 0.00
  Per-class mAP@50: ['0.8306±0.0000', '0.8587±0.0000', '0.6501±0.0000', '0.9143±0.0000', '0.5149±0.0000']
  Overall P: 0.8289 ± 0.0000
  Overall R

In [8]:
# Display performance summary table (mAP@50 and FPS)
def display_performance_summary(summary_results):
    """Display a summary table with overall mAP@50 and FPS for each model"""
    
    print("\n" + "="*80)
    print("MODEL PERFORMANCE SUMMARY")
    print("="*80)
    print(f"{'Model':<20} {'Family':<12} {'Size':<6} {'mAP@50':<12} {'P':<12} {'R':<12} {'FPS':<12} {'Folds':<6}")
    print("-"*80)
    
    # Sort results by family and size
    def sort_key(result):
        family = result['family']
        size = result['size']
        size_order = {'N': 0, 'S': 1, 'M': 2, 'L': 3, 'X': 4}
        return (family, size_order.get(size, 999))
    
    sorted_results = sorted(summary_results, key=sort_key)
    
    for result in sorted_results:
        model_name = result['model_name']
        family = result['family']
        size = result['size']
        n_folds = result['n_folds']
        
        # Format mAP@50
        if result['n_folds'] > 1 and result['std_overall_map50'] > 0.001:
            map50_str = f"{result['mean_overall_map50']:.3f}±{result['std_overall_map50']:.3f}"
        else:
            map50_str = f"{result['mean_overall_map50']:.3f}"

        # Format P and R
        if result['n_folds'] > 1 and result['std_P'] > 0.001:
            p_str = f"{result['mean_P']:.3f}±{result['std_P']:.3f}"
        else:
            p_str = f"{result['mean_P']:.3f}"

        if result['n_folds'] > 1 and result['std_R'] > 0.001:
            r_str = f"{result['mean_R']:.3f}±{result['std_R']:.3f}"
        else:
            r_str = f"{result['mean_R']:.3f}"
        
        # Format FPS
        if result['n_folds'] > 1 and result['std_fps'] > 0.1:
            fps_str = f"{result['mean_fps']:.1f}±{result['std_fps']:.1f}"
        else:
            fps_str = f"{result['mean_fps']:.1f}"
        
        fold_str = f"{n_folds}"
        if n_folds == 1:
            fold_str += "*"
        
        print(f"{model_name:<20} {family:<12} {size:<6} {map50_str:<12} {p_str:<12} {r_str:<12} {fps_str:<12} {fold_str:<6}")
    
    print("-"*80)
    print("* Models evaluated on single fold only")
    print("="*80)

# Display the summary
display_performance_summary(summary_results)


MODEL PERFORMANCE SUMMARY
Model                Family       Size   mAP@50       P            R            FPS          Folds 
--------------------------------------------------------------------------------
rtdetr-l             RT-DETRv2    L      0.754        0.829        0.699        9.5          1*    
rtdetr-x             RT-DETRv2    X      0.740        0.799        0.711        6.3          1*    
yolo11n              YOLOv11      N      0.804±0.007  0.829±0.075  0.761±0.140  32.5±0.5     5     
yolo11s              YOLOv11      S      0.818±0.002  0.839±0.077  0.766±0.140  28.6±6.0     5     
yolo11m              YOLOv11      M      0.825        0.839        0.773        15.8         1*    
yolo11l              YOLOv11      L      0.809        0.831        0.758        24.2         1*    
yolo11x              YOLOv11      X      0.811        0.840        0.749        8.3          1*    
--------------------------------------------------------------------------------
* Models ev

In [11]:
# Generate LaTeX table
def generate_latex_table(summary_results, class_names):
    """Generate LaTeX table with per-class mAP@50 results including mean column"""

    # Sort results by family and size
    def sort_key(result):
        family = result['family']
        size = result['size'].upper()
        size_order = {'N': 0, 'S': 1, 'M': 2, 'L': 3, 'X': 4}
        return (family, size_order.get(size, 999))

    sorted_results = sorted(summary_results, key=sort_key)

    # Start LaTeX table
    latex_table = []
    latex_table.append("\\begin{table*}[t]")
    latex_table.append("\\centering")
    latex_table.append("\\caption{Per-class AP@50 for each model variant. Results are averaged over multiple folds. \\\\")
    latex_table.append("Models marked with * were evaluated only on one fold.}")
    latex_table.append("\\label{tab:map50_per_class}")
    latex_table.append("\\begin{tabular}{lll|" + "c" * len(class_names) + "|c}")
    latex_table.append("\\toprule")

    # Header
    header = "\\textbf{Model} & \\textbf{Family} & \\textbf{Size}"
    for class_name in class_names:
        header += f" & \\textbf{{{class_name.capitalize()}}}"
    header += " & \\textbf{mAP@50}"
    header += " & \\textbf{P} \\ & \\textbf{R} && \\textbf{F1} \\\\"
    latex_table.append(header)
    latex_table.append("\\midrule")

    # Add rows
    for result in sorted_results:
        model_display = f"{result['family']}-{result['size'].lower()}"
        if result['n_folds'] == 1:
            model_display += "*"

        family = result['family']
        size = result['size'].lower()

        row = f"{model_display} & {family} & {size}"

        per_class_values = []
        for mean_map, std_map in zip(result['mean_per_class_map50'], result['std_per_class_map50']):
            per_class_values.append(mean_map)
            if result['n_folds'] > 1 and std_map > 0.001:
                row += f" & {mean_map:.2f}$\\pm${std_map:.2f}"
            else:
                row += f" & {mean_map:.2f}"

        mean_map = sum(per_class_values) / len(per_class_values)
        row += f" & \\textbf{{{mean_map:.3f}}}"

        # add P, R and F1
        row += f" & {result['mean_P']:.3f}"
        if result['n_folds'] > 1 and result['std_P'] > 0.001:
            row += f"$\\pm${result['std_P']:.3f}"
        
        row += f" & {result['mean_R']:.3f}"
        if result['n_folds'] > 1 and result['std_R'] > 0.001:
            row += f"$\\pm${result['std_R']:.3f}"
        
        f1_score = 2 * (result['mean_P'] * result['mean_R']) / (result['mean_P'] + result['mean_R']) if (result['mean_P'] + result['mean_R']) > 0 else 0.0
        row += f" & {f1_score:.3f} \\\\"
        latex_table.append(row)

    # Add midrule between families if needed
    yolo_count = sum(1 for r in sorted_results if r['family'] == 'YOLOv11')
    rtdetr_count = sum(1 for r in sorted_results if r['family'] == 'RT-DETRv2')

    if yolo_count > 0 and rtdetr_count > 0:
        # Find the index where RT-DETR models start
        rtdetr_start_idx = None
        for i, line in enumerate(latex_table):
            if "RT-DETRv2" in line:
                rtdetr_start_idx = i
                break

        if rtdetr_start_idx is not None:
            latex_table.insert(rtdetr_start_idx, "\\midrule")

    latex_table.append("\\bottomrule")
    latex_table.append("\\end{tabular}%")
    latex_table.append("\\end{table*}")

    return "\n".join(latex_table)

print("\nGenerating LaTeX table...")
latex_table = generate_latex_table(summary_results, class_names)
print("=" * 80)
print(latex_table)
print("=" * 80)



Generating LaTeX table...
\begin{table*}[t]
\centering
\caption{Per-class AP@50 for each model variant. Results are averaged over multiple folds. \\
Models marked with * were evaluated only on one fold.}
\label{tab:map50_per_class}
\begin{tabular}{lll|ccccc|c}
\toprule
\textbf{Model} & \textbf{Family} & \textbf{Size} & \textbf{Glass} & \textbf{Metal} & \textbf{Organic} & \textbf{Paper} & \textbf{Plastic} & \textbf{mAP@50} & \textbf{P} \ & \textbf{R} && \textbf{F1} \\
\midrule
\midrule
RT-DETRv2-l* & RT-DETRv2 & l & 0.83 & 0.86 & 0.65 & 0.91 & 0.51 & \textbf{0.754} & 0.829 & 0.699 & 0.758 \\
RT-DETRv2-x* & RT-DETRv2 & x & 0.85 & 0.85 & 0.65 & 0.87 & 0.48 & \textbf{0.740} & 0.799 & 0.711 & 0.753 \\
YOLOv11-n & YOLOv11 & n & 0.87$\pm$0.03 & 0.90$\pm$0.02 & 0.70$\pm$0.01 & 0.93$\pm$0.01 & 0.61$\pm$0.02 & \textbf{0.804} & 0.829$\pm$0.075 & 0.761$\pm$0.140 & 0.794 \\
YOLOv11-s & YOLOv11 & s & 0.89$\pm$0.01 & 0.91$\pm$0.01 & 0.73$\pm$0.01 & 0.94$\pm$0.01 & 0.62$\pm$0.01 & \textbf{0.818} & 0.