# Model Performance Comparison & Analysis

This section demonstrates how to compare different YOLO models and formats for optimal performance on Raspberry Pi 5.

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ultralytics import YOLO
from pathlib import Path
import glob
import psutil
import json

# Shared plot styling for every chart produced in this section.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("🔍 Model Performance Comparison Tool")
print("====================================")

# Collect candidate weights from the models folder: PyTorch checkpoints
# first, then ONNX exports (the order later cells iterate in).
model_dir = Path("../models")
pt_models = list(model_dir.glob("*.pt"))
onnx_models = list(model_dir.glob("*.onnx"))
model_files = [*pt_models, *onnx_models]

# List what was found, alphabetically, with the on-disk size in MB.
print(f"📦 Found {len(model_files)} models:")
for model in sorted(model_files):
    size_mb = model.stat().st_size / 2**20
    print(f"  • {model.name}: {size_mb:.1f} MB")

print("\n✅ Ready for model comparison!")

In [None]:
def _size_category(model_name):
    """Return the size tier ('Nano', 'Small', 'Medium' or 'Other') of a YOLO weight file name."""
    tiers = {'n': 'Nano', 's': 'Small', 'm': 'Medium'}
    # Drop the extension first: a plain substring test would match the
    # 'n' in ".onnx" and label every ONNX export as Nano.
    stem = model_name.lower().rsplit('.', 1)[0]
    # The variant letter directly follows the version number
    # (yolov8n, yolov5s, yolov10m, ...).
    for previous, current in zip(stem, stem[1:]):
        if previous.isdigit() and current in tiers:
            return tiers[current]
    return 'Other'


def quick_model_comparison(models_subset=None, test_duration=10, image_size=(640, 480)):
    """
    Perform a quick performance comparison of YOLO models.

    Each selected model is loaded, warmed up with three inferences and then
    run repeatedly on one random image until ``test_duration`` seconds have
    elapsed. A model that fails to load or run is reported and skipped.

    Args:
        models_subset: List of model file names to test. When None or empty,
            only files whose name contains 'yolov8n', 'yolov5n' or
            'yolov10n' are tested.
        test_duration: Test duration in seconds per model
        image_size: Input image size (width, height)

    Returns:
        DataFrame with one row per successfully tested model (columns:
        Model, Format, Category, Size_MB, Load_Time_s, FPS, Inference_ms,
        FPS_per_MB, Frames_Tested); empty if nothing could be tested.
    """

    # NumPy images are (rows, cols, channels) == (height, width, 3), so the
    # documented (width, height) order has to be swapped here.
    width, height = image_size
    test_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)

    # Select models to test
    if models_subset:
        test_models = [m for m in model_files if m.name in models_subset]
    else:
        # Test a representative subset for quick comparison
        test_models = [m for m in model_files if any(x in m.name for x in ['yolov8n', 'yolov5n', 'yolov10n'])]

    results = []

    print(f"🔥 Quick comparison of {len(test_models)} models...")
    print(f"⏱️ {test_duration}s per model, {image_size} image size\n")

    for i, model_path in enumerate(test_models, 1):
        model_name = model_path.name
        print(f"[{i}/{len(test_models)}] Testing {model_name}...")

        try:
            # Load model
            start_load = time.perf_counter()
            model = YOLO(str(model_path), verbose=False)
            load_time = time.perf_counter() - start_load

            # Warmup so one-off initialisation cost is not counted
            for _ in range(3):
                _ = model(test_image, verbose=False)

            # Benchmark: always time at least one frame, so a zero or
            # negative duration still yields a measurement.
            inference_times = []
            start_time = time.perf_counter()
            while True:
                inf_start = time.perf_counter()
                _ = model(test_image, verbose=False)
                inference_times.append(time.perf_counter() - inf_start)
                if time.perf_counter() - start_time >= test_duration:
                    break

            total_time = time.perf_counter() - start_time
            frame_count = len(inference_times)
            avg_fps = frame_count / total_time
            avg_inference_ms = (sum(inference_times) / frame_count) * 1000

            # Model info
            model_size_mb = model_path.stat().st_size / (1024 * 1024)
            model_format = 'PyTorch' if model_path.suffix == '.pt' else 'ONNX'

            results.append({
                'Model': model_name,
                'Format': model_format,
                'Category': _size_category(model_name),
                'Size_MB': round(model_size_mb, 1),
                'Load_Time_s': round(load_time, 2),
                'FPS': round(avg_fps, 1),
                'Inference_ms': round(avg_inference_ms, 1),
                'FPS_per_MB': round(avg_fps / model_size_mb, 2),
                'Frames_Tested': frame_count
            })
            print(f"  ✅ {avg_fps:.1f} FPS, {avg_inference_ms:.1f}ms inference")

        except Exception as e:
            # Best effort: one broken model must not abort the comparison.
            print(f"  ❌ Failed: {str(e)}")

    return pd.DataFrame(results)

# Benchmark the default model subset for 10 seconds each.
comparison_df = quick_model_comparison(test_duration=10)

# Show the table fastest-first, or say so when nothing could be measured.
if comparison_df.empty:
    print("❌ No models could be tested")
else:
    print("\n📊 Quick Comparison Results:")
    print("=" * 50)
    display(comparison_df.sort_values('FPS', ascending=False))

In [None]:
# Visualize comparison results: a 2x2 dashboard followed by a text summary.
if not comparison_df.empty:
    from matplotlib.lines import Line2D

    # Single source of truth for the per-format colours, so every panel and
    # the legend agree on which colour means which format.
    format_colors = {'PyTorch': 'red', 'ONNX': 'blue'}

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('YOLO Model Performance Comparison on Raspberry Pi 5', fontsize=16, fontweight='bold')

    # 1. FPS vs Model Size
    ax1 = axes[0, 0]
    scatter = ax1.scatter(comparison_df['Size_MB'], comparison_df['FPS'],
                          c=comparison_df['Format'].map(format_colors),
                          s=100, alpha=0.7)
    ax1.set_xlabel('Model Size (MB)')
    ax1.set_ylabel('FPS')
    ax1.set_title('FPS vs Model Size')
    ax1.grid(True, alpha=0.3)

    # Add model names as annotations
    for _, row in comparison_df.iterrows():
        ax1.annotate(row['Model'].replace('.pt', '').replace('.onnx', ''),
                     (row['Size_MB'], row['FPS']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8, alpha=0.8)

    # Legend (proxy artists, because the scatter is coloured per point)
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
        for label, color in format_colors.items()
    ]
    ax1.legend(handles=legend_elements)

    # 2. FPS by Format
    ax2 = axes[0, 1]
    format_fps = comparison_df.groupby('Format')['FPS'].mean()
    # groupby sorts its keys ('ONNX' before 'PyTorch'), so colours are looked
    # up by format name rather than assigned by position.
    bars = ax2.bar(format_fps.index, format_fps.values,
                   color=[format_colors[fmt] for fmt in format_fps.index], alpha=0.7)
    ax2.set_ylabel('Average FPS')
    ax2.set_title('Average FPS by Format')
    ax2.grid(True, alpha=0.3)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax2.annotate(f'{height:.1f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')

    # 3. Efficiency (FPS per MB)
    ax3 = axes[1, 0]
    efficiency_sorted = comparison_df.sort_values('FPS_per_MB', ascending=True)
    bars = ax3.barh(range(len(efficiency_sorted)), efficiency_sorted['FPS_per_MB'],
                    color=efficiency_sorted['Format'].map(format_colors),
                    alpha=0.7)
    ax3.set_yticks(range(len(efficiency_sorted)))
    ax3.set_yticklabels([name.replace('.pt', '').replace('.onnx', '') for name in efficiency_sorted['Model']])
    ax3.set_xlabel('FPS per MB')
    ax3.set_title('Model Efficiency (FPS/MB)')
    ax3.grid(True, alpha=0.3)

    # 4. Inference Time vs FPS
    ax4 = axes[1, 1]
    scatter = ax4.scatter(comparison_df['Inference_ms'], comparison_df['FPS'],
                          c=comparison_df['Format'].map(format_colors),
                          s=100, alpha=0.7)
    ax4.set_xlabel('Inference Time (ms)')
    ax4.set_ylabel('FPS')
    ax4.set_title('Inference Time vs FPS')
    ax4.grid(True, alpha=0.3)

    # Add ideal performance line (FPS = 1000 / ms). The x values are sorted
    # first; plotting in row order would draw a zigzag instead of a curve.
    sorted_inference_ms = comparison_df['Inference_ms'].sort_values()
    ideal_fps = 1000 / sorted_inference_ms  # Theoretical max FPS
    ax4.plot(sorted_inference_ms, ideal_fps, 'g--', alpha=0.5, label='Theoretical Max')
    ax4.legend()

    plt.tight_layout()
    plt.show()

    # Print analysis
    print("\n🎯 Performance Analysis:")
    print("=" * 40)

    best_fps = comparison_df.loc[comparison_df['FPS'].idxmax()]
    most_efficient = comparison_df.loc[comparison_df['FPS_per_MB'].idxmax()]
    smallest = comparison_df.loc[comparison_df['Size_MB'].idxmin()]

    print(f"🚀 Fastest Model: {best_fps['Model']} ({best_fps['FPS']} FPS)")
    print(f"⚖️ Most Efficient: {most_efficient['Model']} ({most_efficient['FPS_per_MB']} FPS/MB)")
    print(f"📦 Smallest Model: {smallest['Model']} ({smallest['Size_MB']} MB)")

    # Format comparison (only meaningful when both formats were measured)
    if len(comparison_df['Format'].unique()) > 1:
        pytorch_avg = comparison_df[comparison_df['Format'] == 'PyTorch']['FPS'].mean()
        onnx_avg = comparison_df[comparison_df['Format'] == 'ONNX']['FPS'].mean()

        if pd.notna(pytorch_avg) and pd.notna(onnx_avg):
            improvement = ((onnx_avg - pytorch_avg) / pytorch_avg) * 100
            print(f"📈 ONNX vs PyTorch: {improvement:+.1f}% performance difference")

    # Recommendations
    print(f"\n💡 Recommendations:")
    print("=" * 40)
    real_time_models = comparison_df[comparison_df['FPS'] >= 10]
    if not real_time_models.empty:
        print(f"🔥 Real-time capable (≥10 FPS): {len(real_time_models)} models")
        for _, model in real_time_models.iterrows():
            print(f"   • {model['Model']}: {model['FPS']} FPS")

    interactive_models = comparison_df[(comparison_df['FPS'] >= 5) & (comparison_df['FPS'] < 10)]
    if not interactive_models.empty:
        print(f"⚡ Interactive capable (5-10 FPS): {len(interactive_models)} models")

    print(f"\n🎮 Use Cases:")
    print("   • Real-time detection: Choose models with ≥10 FPS")
    print("   • Interactive applications: 5-10 FPS models acceptable")
    print("   • Batch processing: Any model suitable")
    print("   • Consider ONNX format for better performance")

else:
    print("No comparison data available for visualization")

# YOLOv8 Model Fine-Tuning and Export for Raspberry Pi 5

This notebook demonstrates how to:
1. Fine-tune YOLOv8-nano on a custom 5-class dataset
2. Export the trained model to ONNX format
3. Convert to NCNN int8 for optimized Pi 5 inference
4. Validate performance improvements

**Target Hardware:** Raspberry Pi 5 (8GB RAM) with Intel RealSense D435

**Prerequisites:**
- Custom dataset with 5 classes in YOLO format
- Sufficient training time (recommend running on GPU first, then deploying to Pi 5)
- Internet connection for downloading base models

In [None]:
# Import required libraries
import os
import sys
import yaml
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from ultralytics import YOLO
import cv2
from PIL import Image
import shutil
import requests
from datetime import datetime

# Report the PyTorch build and whether a CUDA device can be used.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA device count: {torch.cuda.device_count()}")

# Train on the GPU when one is present, otherwise fall back to the CPU.
if cuda_ready:
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Project layout, relative to the parent of the current working directory.
project_root = Path.cwd().parent
models_dir = project_root / "models"
export_dir = models_dir / "exported"
dataset_dir = project_root / "datasets" / "custom_5class"

# Make sure every working directory exists before later cells write to it.
for dir_path in (dataset_dir, models_dir, export_dir):
    dir_path.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Dataset directory: {dataset_dir}")
print(f"Models directory: {models_dir}")

## 1. Dataset Preparation

We'll create a sample 5-class dataset for demonstration. In practice, you would replace this with your own annotated data.

**Classes for this example:**
1. `person` - Human figures
2. `vehicle` - Cars, trucks, motorcycles  
3. `animal` - Dogs, cats, birds
4. `device` - Laptops, phones, cameras
5. `furniture` - Chairs, tables, couches

**Dataset Structure:**
```
datasets/custom_5class/
├── images/
│   ├── train/
│   ├── val/
│   └── test/
├── labels/
│   ├── train/
│   ├── val/
│   └── test/
└── data.yaml
```

In [None]:
# Describe the dataset for Ultralytics: one image folder per split, the
# class count, and an index -> class-name mapping.
class_names = ['person', 'vehicle', 'animal', 'device', 'furniture']
dataset_config = {
    **{split_key: str(dataset_dir / 'images' / split_key) for split_key in ('train', 'val', 'test')},
    'nc': 5,  # Number of classes
    'names': dict(enumerate(class_names)),
}

# Write the configuration next to the data as data.yaml.
config_path = dataset_dir / 'data.yaml'
with config_path.open('w') as f:
    yaml.dump(dataset_config, f, default_flow_style=False)

print(f"Dataset configuration saved to: {config_path}")

# Create the images/<split> and labels/<split> folders.
for split in ('train', 'val', 'test'):
    for kind in ('images', 'labels'):
        (dataset_dir / kind / split).mkdir(parents=True, exist_ok=True)

print("Dataset directory structure created")

# Function to create synthetic training data (for demonstration)
def create_synthetic_data(num_samples=50, split='train'):
    """Create synthetic images with random colored rectangles as objects.

    For each sample a 640x640 noise image is generated, 1-3 rectangle
    outlines are drawn on it (one colour per class), and the image plus a
    YOLO-format label file are written under ``dataset_dir``.

    Args:
        num_samples: Number of image/label pairs to write.
        split: Dataset split sub-folder to write into
            (e.g. 'train', 'val', 'test').
    """
    images_dir = dataset_dir / 'images' / split
    labels_dir = dataset_dir / 'labels' / split

    img_size = 640

    # OpenCV draws and writes images in BGR channel order, so each tuple is
    # (blue, green, red). Indexed by class id.
    class_colors = [
        (0, 0, 255),    # person - red
        (0, 255, 0),    # vehicle - green
        (255, 0, 0),    # animal - blue
        (0, 255, 255),  # device - yellow
        (255, 0, 255),  # furniture - magenta
    ]

    for i in range(num_samples):
        # Create synthetic image (640x640) filled with mid-range noise
        img = np.random.randint(50, 200, (img_size, img_size, 3), dtype=np.uint8)

        labels = []
        num_objects = np.random.randint(1, 4)  # 1-3 objects per image

        for _ in range(num_objects):
            # Random class
            class_id = np.random.randint(0, len(class_colors))

            # Random bounding box (YOLO format: x_center, y_center, width,
            # height, all normalised to [0, 1]). The ranges keep every box
            # fully inside the image.
            x_center = np.random.uniform(0.2, 0.8)
            y_center = np.random.uniform(0.2, 0.8)
            width = np.random.uniform(0.1, 0.3)
            height = np.random.uniform(0.1, 0.3)

            # Convert the normalised box to pixel corners for drawing
            x1 = int((x_center - width/2) * img_size)
            y1 = int((y_center - height/2) * img_size)
            x2 = int((x_center + width/2) * img_size)
            y2 = int((y_center + height/2) * img_size)

            cv2.rectangle(img, (x1, y1), (x2, y2), class_colors[class_id], 3)

            labels.append(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")

        # Save image
        img_path = images_dir / f"{split}_{i:04d}.jpg"
        cv2.imwrite(str(img_path), img)

        # Save labels (same stem as the image, one object per line)
        label_path = labels_dir / f"{split}_{i:04d}.txt"
        with open(label_path, 'w') as f:
            f.write('\n'.join(labels))

    print(f"Created {num_samples} synthetic {split} samples")

# Populate each split: 100 training, 20 validation and 10 test images.
for split_name, sample_count in (('train', 100), ('val', 20), ('test', 10)):
    create_synthetic_data(sample_count, split_name)

print("Synthetic dataset creation complete!")

## 2. Model Fine-Tuning

Now we'll fine-tune YOLOv8-nano on our custom 5-class dataset. The training process will:

1. Load the pre-trained YOLOv8n model
2. Modify the output layer for 5 classes
3. Train on our custom dataset
4. Validate and save the best model

**Training Parameters:**
- Base model: YOLOv8n (nano)
- Epochs: 50 (adjust based on convergence)
- Image size: 640x640 (can reduce to 480x480 for Pi 5)
- Batch size: 16 (adjust based on available memory)

In [None]:
# Start from the pre-trained YOLOv8-nano checkpoint (fetched automatically
# when it is not already on disk).
print("Loading YOLOv8-nano base model...")
model = YOLO('yolov8n.pt')

# The training arguments are grouped by concern and merged below; the merge
# keeps the keys in the order they are listed here.
run_settings = {
    'data': str(config_path),           # Path to dataset config
    'epochs': 50,                       # Number of training epochs
    'imgsz': 640,                       # Image size (can reduce to 480 for Pi 5)
    'batch': 16,                        # Batch size (adjust for available RAM)
    'workers': 4,                       # Number of data loading workers
    'device': device,                   # Training device
    'project': str(models_dir),         # Project directory
    'name': 'custom_5class_v1',         # Experiment name
    'save_period': 10,                  # Save checkpoint every N epochs
    'patience': 10,                     # Early stopping patience
}

optimizer_settings = {
    'optimizer': 'AdamW',               # Optimizer
    'lr0': 0.01,                        # Initial learning rate
    'lrf': 0.01,                        # Final learning rate fraction
    'momentum': 0.937,                  # Momentum (beta1 for Adam-family optimizers)
    'weight_decay': 0.0005,             # Weight decay
    'warmup_epochs': 3,                 # Warmup epochs
    'warmup_momentum': 0.8,             # Warmup momentum
}

loss_settings = {
    'box': 7.5,                         # Box loss gain
    'cls': 0.5,                         # Classification loss gain
    'dfl': 1.5,                         # DFL loss gain
    'pose': 12.0,                       # Pose loss gain (if applicable)
    'kobj': 1.0,                        # Keypoint object loss gain
    'label_smoothing': 0.0,             # Label smoothing
    'nbs': 64,                          # Nominal batch size
}

augmentation_settings = {
    'hsv_h': 0.015,                     # Image HSV-Hue augmentation
    'hsv_s': 0.7,                       # Image HSV-Saturation augmentation
    'hsv_v': 0.4,                       # Image HSV-Value augmentation
    'degrees': 0.0,                     # Image rotation (+/- deg)
    'translate': 0.1,                   # Image translation (+/- fraction)
    'scale': 0.5,                       # Image scale (+/- gain)
    'shear': 0.0,                       # Image shear (+/- deg)
    'perspective': 0.0,                 # Image perspective (+/- fraction)
    'flipud': 0.0,                      # Image flip up-down (probability)
    'fliplr': 0.5,                      # Image flip left-right (probability)
    'mosaic': 1.0,                      # Image mosaic (probability)
    'mixup': 0.0,                       # Image mixup (probability)
    'copy_paste': 0.0,                  # Segment copy-paste (probability)
    'auto_augment': 'randaugment',      # Auto augmentation policy
    'erasing': 0.4,                     # Random erasing probability
    'crop_fraction': 1.0,               # Image crop fraction
}

train_config = {
    **run_settings,
    **optimizer_settings,
    **loss_settings,
    **augmentation_settings,
}

# Echo the full configuration so the run is reproducible from the output.
print("Training configuration:")
for key, value in train_config.items():
    print(f"  {key}: {value}")

# Launch fine-tuning with the merged configuration.
print(f"\nStarting training on {device}...")
results = model.train(**train_config)

print("Training completed!")
print(f"Best model saved to: {results.save_dir}")

# Location of the best checkpoint inside the run directory.
best_model_path = results.save_dir / 'weights' / 'best.pt'
print(f"Best model path: {best_model_path}")

In [None]:
# Load the best trained model for validation
best_model = YOLO(str(best_model_path))

# Validate the model on the split configured in data.yaml
print("Validating trained model...")
validation_results = best_model.val(data=str(config_path))

print(f"Validation mAP50: {validation_results.results_dict['metrics/mAP50(B)']:.4f}")
print(f"Validation mAP50-95: {validation_results.results_dict['metrics/mAP50-95(B)']:.4f}")

# Test on a few sample images
test_images_dir = dataset_dir / 'images' / 'test'
test_images = list(test_images_dir.glob('*.jpg'))[:3]  # Test on first 3 images

print(f"\nTesting on {len(test_images)} sample images...")

if test_images:
    # squeeze=False always returns a 2-D axes array, so one image and
    # several images are handled the same way.
    fig, axes = plt.subplots(1, len(test_images), figsize=(15, 5), squeeze=False)
    axes = axes[0]

    for i, img_path in enumerate(test_images):
        # Run inference
        results = best_model(str(img_path))

        # Get annotated image
        annotated = results[0].plot()

        # Convert BGR to RGB for matplotlib
        annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

        axes[i].imshow(annotated_rgb)
        axes[i].set_title(f"Test Image {i+1}")
        axes[i].axis('off')

        # Print detection results
        if results[0].boxes is not None:
            boxes = results[0].boxes
            print(f"Image {i+1}: {len(boxes)} detections")
            for j, box in enumerate(boxes):
                # .item() extracts the scalar from the one-element tensor;
                # int()/float() on a 1-element array is deprecated in NumPy.
                class_id = int(box.cls.item())
                confidence = float(box.conf.item())
                class_name = dataset_config['names'][class_id]
                print(f"  {j+1}: {class_name} ({confidence:.3f})")
        else:
            print(f"Image {i+1}: No detections")

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(1, 0) would raise, so skip the visual check entirely.
    print("No test images found - skipping visual check")

# Performance analysis
# NOTE(review): Ultralytics documents info() as returning
# (layers, parameters, gradients, GFLOPs) - confirm for the installed version.
model_info = best_model.info()
num_layers, num_params, num_gradients = model_info[:3]
print(f"\nModel Information:")
print(f"Parameters: {num_params:,}")
print(f"Gradients: {num_gradients:,}")
print(f"Layers: {num_layers}")
print(f"Model size: {best_model_path.stat().st_size / (1024*1024):.2f} MB")

## 3. Model Export to ONNX

ONNX (Open Neural Network Exchange) format provides better inference performance on CPU-based systems like the Raspberry Pi 5. We'll export our trained model to ONNX and validate the conversion.

**Benefits of ONNX for Pi 5:**
- Optimized CPU inference
- Reduced memory usage
- Better integration with onnxruntime
- Cross-platform compatibility

In [None]:
# Export model to ONNX format
print("Exporting model to ONNX format...")

try:
    # Export with settings suited to Pi 5 inference.
    # `optimize=True` is deliberately not passed: in Ultralytics it is the
    # TorchScript mobile optimisation, not an ONNX option, and the exporter
    # rejects it on CUDA devices, which made this export fail on GPU machines.
    onnx_path = best_model.export(
        format='onnx',
        imgsz=480,              # Reduced size for Pi 5 performance
        simplify=True,          # Simplify ONNX graph
        dynamic=False,          # Fixed input size for better performance
        opset=12,               # ONNX opset version
        half=False,             # Keep FP32 weights
        int8=False,             # Quantization is handled separately
        device=device
    )

    print(f"ONNX model exported to: {onnx_path}")

    # Copy into the export directory under a stable, descriptive name
    onnx_export_path = export_dir / 'custom_5class_480.onnx'
    shutil.copy2(onnx_path, onnx_export_path)
    print(f"ONNX model copied to: {onnx_export_path}")

    # Check file size
    onnx_size = onnx_export_path.stat().st_size / (1024*1024)
    print(f"ONNX model size: {onnx_size:.2f} MB")

except Exception as e:
    # Best effort: later cells check onnx_export_path and skip ONNX steps.
    print(f"ONNX export failed: {e}")
    onnx_export_path = None

# Sanity-check the exported ONNX file with onnxruntime, then time it.
if not (onnx_export_path and onnx_export_path.exists()):
    print("ONNX export failed - skipping validation")
else:
    print("\nValidating ONNX model...")

    try:
        import time

        import onnxruntime as ort

        # Open an inference session on the exported file
        ort_session = ort.InferenceSession(str(onnx_export_path))

        # Describe the graph's input and outputs
        input_info = ort_session.get_inputs()[0]
        output_info = ort_session.get_outputs()

        print(f"ONNX Input: {input_info.name} {input_info.shape} {input_info.type}")
        print(f"ONNX Outputs: {len(output_info)} outputs")
        for i, output in enumerate(output_info):
            print(f"  Output {i}: {output.name} {output.shape} {output.type}")

        # One forward pass on random data proves the graph actually runs
        dummy_input = np.random.randn(1, 3, 480, 480).astype(np.float32)
        feed = {input_info.name: dummy_input}
        outputs = ort_session.run(None, feed)

        print("ONNX inference test successful!")
        print(f"Output shapes: {[tensor.shape for tensor in outputs]}")

        # Time a short burst of runs to estimate throughput
        num_runs = 10
        start_time = time.time()
        for _ in range(num_runs):
            outputs = ort_session.run(None, feed)
        total_time = time.time() - start_time

        avg_time = total_time / num_runs
        fps = 1.0 / avg_time

        print("\nONNX Performance Benchmark:")
        print(f"Average inference time: {avg_time:.4f}s")
        print(f"Estimated FPS: {fps:.2f}")

    except ImportError:
        print("onnxruntime not available for validation")
    except Exception as e:
        print(f"ONNX validation failed: {e}")

## 4. Model Export to NCNN int8 (Advanced Optimization)

NCNN is Tencent's optimized neural network inference framework, particularly well-suited for mobile and embedded devices like the Raspberry Pi 5. Int8 quantization further reduces model size and increases inference speed.

**Benefits of NCNN int8 for Pi 5:**
- Significantly faster inference (2-4x speedup)
- Reduced memory usage (int8 weights are up to ~4x smaller than FP32)
- Optimized for ARM processors
- Lower power consumption

**Note:** NCNN export requires additional setup and may need calibration data for int8 quantization.

In [None]:
# Attempt NCNN export (requires additional dependencies)
print("Attempting NCNN export...")

# Assume failure until the exported files have actually been copied.
ncnn_success = False

try:
    # First, try direct NCNN export (if supported)
    # NOTE(review): the Ultralytics export table may not list int8 as a
    # supported argument for format='ncnn' - confirm for the installed version.
    ncnn_path = best_model.export(
        format='ncnn',
        imgsz=480,
        half=False,
        int8=True,  # Enable int8 quantization
        device=device
    )

    print(f"NCNN model exported to: {ncnn_path}")

    # NCNN export creates .param and .bin files
    ncnn_param_path = export_dir / 'custom_5class_480.param'
    ncnn_bin_path = export_dir / 'custom_5class_480.bin'

    # The exporter reports a `*_ncnn_model` directory that holds the
    # .param/.bin pair. Accept str or Path, and fall back to the parent
    # folder if a file path is returned instead.
    ncnn_dir = Path(ncnn_path)
    if not ncnn_dir.is_dir():
        ncnn_dir = ncnn_dir.parent

    param_file = next(ncnn_dir.glob('*.param'), None)
    bin_file = next(ncnn_dir.glob('*.bin'), None)
    if param_file is None or bin_file is None:
        raise FileNotFoundError(f"no .param/.bin files found in {ncnn_dir}")

    # Copy files to export directory under stable names
    shutil.copy2(param_file, ncnn_param_path)
    shutil.copy2(bin_file, ncnn_bin_path)

    print(f"NCNN files copied to:")
    print(f"  Param: {ncnn_param_path}")
    print(f"  Binary: {ncnn_bin_path}")

    # Check file sizes
    param_size = ncnn_param_path.stat().st_size / 1024
    bin_size = ncnn_bin_path.stat().st_size / (1024*1024)

    print(f"NCNN param size: {param_size:.2f} KB")
    print(f"NCNN binary size: {bin_size:.2f} MB")

    ncnn_success = True

except Exception as e:
    # Best effort: the manual ONNX -> NCNN route below is the fallback.
    print(f"Direct NCNN export failed: {e}")
    ncnn_success = False

# Fallback when the direct export did not succeed but an ONNX file exists:
# print the manual ONNX -> NCNN conversion recipe.
if not ncnn_success and onnx_export_path and onnx_export_path.exists():
    print("\nAttempting ONNX to NCNN conversion...")

    try:
        # Automated conversion needs external NCNN tooling, so only the
        # instructions are emitted here, as one block of text.
        manual_steps = [
            "Manual NCNN conversion steps:",
            "1. Install NCNN tools:",
            "   git clone https://github.com/Tencent/ncnn.git",
            "   cd ncnn && mkdir build && cd build",
            "   cmake .. && make -j4",
            "",
            "2. Convert ONNX to NCNN:",
            f"   ./tools/onnx/onnx2ncnn {onnx_export_path} custom_5class_480.param custom_5class_480.bin",
            "",
            "3. Quantize to int8 (optional):",
            "   ./tools/quantize/ncnn2int8 custom_5class_480.param custom_5class_480.bin custom_5class_480_int8.param custom_5class_480_int8.bin calibration_images/",
            "",
            "Note: int8 quantization requires calibration images from your dataset",
        ]
        print("\n".join(manual_steps))

    except Exception as e:
        print(f"ONNX to NCNN conversion guidance failed: {e}")

# Build a small calibration set for int8 quantization from validation images.
print("\nPreparing calibration data for int8 quantization...")

calibration_dir = export_dir / 'calibration_images'
calibration_dir.mkdir(exist_ok=True)

# Take at most the first 10 validation images.
val_images_dir = dataset_dir / 'images' / 'val'
val_images = list(val_images_dir.glob('*.jpg'))[:10]

for i, img_path in enumerate(val_images):
    # Resize to the 480x480 export size and store under a sequential name.
    img_resized = cv2.resize(cv2.imread(str(img_path)), (480, 480))
    cv2.imwrite(str(calibration_dir / f"calib_{i:03d}.jpg"), img_resized)

print(f"Created {len(val_images)} calibration images in {calibration_dir}")

# Recap which artefacts were produced and where they live.
banner = "=" * 60
print("\n" + banner)
print("MODEL EXPORT SUMMARY")
print(banner)

print(f"Original PyTorch model: {best_model_path}")
print(f"Model size: {best_model_path.stat().st_size / (1024*1024):.2f} MB")

# The ONNX section is shown only when the exported file exists.
if onnx_export_path and onnx_export_path.exists():
    print(f"\nONNX model: {onnx_export_path}")
    print(f"ONNX size: {onnx_export_path.stat().st_size / (1024*1024):.2f} MB")
    print("✓ Ready for onnxruntime inference")

# Report either the NCNN files or a pointer to the manual steps.
if not ncnn_success:
    print("\nNCNN conversion: Manual steps required")
    print("📝 See conversion instructions above")
else:
    print(f"\nNCNN model: {ncnn_param_path}, {ncnn_bin_path}")
    print("✓ Ready for NCNN inference")

print(f"\nCalibration data: {calibration_dir}")
print(f"Calibration images: {len(list(calibration_dir.glob('*.jpg')))}")

print("\n" + banner)

## 5. Performance Validation and Deployment

Let's validate the performance of our exported models and provide guidance for deploying them to the Raspberry Pi 5 pipeline.

### Model Comparison
We'll compare the performance characteristics of different model formats:
- **PyTorch (.pt)**: Original format, full precision
- **ONNX (.onnx)**: Optimized for CPU inference
- **NCNN (.param/.bin)**: Mobile/embedded optimized with int8 quantization

In [None]:
# Performance comparison function
def benchmark_model(model_path, model_type, num_runs=20):
    """Benchmark model inference performance on random 480x480 input.

    Runs three warm-up inferences followed by ``num_runs`` timed ones.

    Args:
        model_path: Path to the model file.
        model_type: 'pytorch' (loaded through Ultralytics YOLO) or 'onnx'
            (loaded through onnxruntime).
        num_runs: Number of timed inferences.

    Returns:
        Dict with 'avg_inference_time' (seconds), 'fps', 'total_time'
        (seconds) and 'num_runs'; None when ``model_type`` is unsupported
        or onnxruntime is not installed.
    """
    import time

    imgsz = 480  # matches the size the ONNX model was exported with

    if model_type == 'pytorch':
        model = YOLO(str(model_path))

        # Ultralytics treats a NumPy input as an HWC uint8 image, so an NCHW
        # float tensor cannot be used here.
        test_img = np.random.randint(0, 256, (imgsz, imgsz, 3), dtype=np.uint8)

        def run_once():
            # Same resolution as the ONNX run; verbose=False keeps per-frame
            # logging out of the measurement.
            model(test_img, imgsz=imgsz, verbose=False)

    elif model_type == 'onnx':
        try:
            import onnxruntime as ort
        except ImportError:
            print("onnxruntime not available")
            return None

        session = ort.InferenceSession(str(model_path))
        input_name = session.get_inputs()[0].name

        # Raw NCHW float32 tensor, fed straight to the ONNX graph.
        test_img = np.random.randn(1, 3, imgsz, imgsz).astype(np.float32)

        def run_once():
            session.run(None, {input_name: test_img})

    else:
        print(f"Model type {model_type} not supported for benchmarking")
        return None

    # Warmup
    for _ in range(3):
        run_once()

    # Benchmark (perf_counter is monotonic and high-resolution)
    start_time = time.perf_counter()
    for _ in range(num_runs):
        run_once()
    total_time = time.perf_counter() - start_time

    avg_time = total_time / num_runs
    fps = 1.0 / avg_time

    return {
        'avg_inference_time': avg_time,
        'fps': fps,
        'total_time': total_time,
        'num_runs': num_runs
    }

# Run the benchmark on every model format that is available
print("Benchmarking model performance...")
print("Note: This simulates inference on current hardware, not Pi 5")
print("=" * 60)

# Each entry is (path, model type passed to benchmark_model, display label)
models_to_test = [(best_model_path, 'pytorch', 'Custom PyTorch')]
if onnx_export_path and onnx_export_path.exists():
    models_to_test += [(onnx_export_path, 'onnx', 'Custom ONNX')]

# Maps display label -> result dict returned by benchmark_model
benchmark_results = {}

for model_path, model_type, display_name in models_to_test:
    print(f"\nBenchmarking {display_name}...")
    result = benchmark_model(model_path, model_type)

    # benchmark_model returns None when it could not run the model
    if not result:
        continue

    benchmark_results[display_name] = result
    file_size_mb = model_path.stat().st_size / 1024 ** 2
    print(f"  Average inference time: {result['avg_inference_time']:.4f}s")
    print(f"  Estimated FPS: {result['fps']:.2f}")
    print(f"  File size: {file_size_mb:.2f} MB")

# Plot FPS and latency side by side (skipped when nothing was benchmarked)
if benchmark_results:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    models = list(benchmark_results)
    fps_values = [stats['fps'] for stats in benchmark_results.values()]
    # Seconds -> milliseconds for the latency panel
    inference_times = [
        stats['avg_inference_time'] * 1000 for stats in benchmark_results.values()
    ]
    bar_colors = ['blue', 'orange'][:len(models)]

    # (axes, values, y-axis label, title) for each panel
    panels = (
        (ax1, fps_values, 'FPS', 'Model FPS Comparison'),
        (ax2, inference_times, 'Inference Time (ms)', 'Model Inference Time Comparison'),
    )
    for axis, values, y_label, title in panels:
        axis.bar(models, values, color=bar_colors)
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Hard-coded Pi 5 estimates (these are not measured by this notebook)
print(f"\n{'=' * 60}")
print("RASPBERRY PI 5 PERFORMANCE ESTIMATES")
print('=' * 60)

# Display label -> estimated FPS range and memory footprint
pi5_estimates = {
    'Custom PyTorch': {'fps_range': '8-12', 'memory': '2-3 GB'},
    'Custom ONNX': {'fps_range': '12-18', 'memory': '1.5-2.5 GB'},
    'Custom NCNN int8': {'fps_range': '15-25', 'memory': '1-2 GB'}
}

for model_name, estimates in pi5_estimates.items():
    print(
        f"\n{model_name}:\n"
        f"  Expected FPS: {estimates['fps_range']}\n"
        f"  Memory usage: {estimates['memory']}"
    )

# Printed as a bulleted list, one factor per line
print("\nFactors affecting Pi 5 performance:")
for factor in (
    "CPU temperature (thermal throttling above 80°C)",
    "Power supply quality (5V/5A recommended)",
    "USB bandwidth for camera (USB 3.0 required)",
    "Background processes and system load",
    "Model complexity and input resolution",
):
    print(f"- {factor}")

# Assemble the deployment guide from its sections, then print and save it
pi_models_dir = "pi@your-pi5:/home/pi/real-time-object-detection/models/"

# Section 1 opens a bash code fence that the closing section terminates
instruction_parts = [
    f"""
# DEPLOYMENT TO RASPBERRY PI 5

## 1. Copy Models to Pi 5
```bash
# Copy trained model files to Pi 5
scp {best_model_path} {pi_models_dir}
"""
]

# The ONNX copy command is included only when the export file exists
if onnx_export_path and onnx_export_path.exists():
    instruction_parts.append(f"scp {onnx_export_path} {pi_models_dir}\n")

instruction_parts.append("""
```

## 2. Update Detection Pipeline
Edit `src/detector.py` to use your custom model:

```python
# In Detector.__init__()
self.model_path = "models/custom_5class_480.onnx"  # Use ONNX for best performance
self.class_names = ['person', 'vehicle', 'animal', 'device', 'furniture']
```

## 3. Run Custom Detection
```bash
# Activate environment
source venv/bin/activate

# Run with custom model
python3 src/main.py --model models/custom_5class_480.onnx --confidence 0.6

# Or with PyTorch model
python3 src/main.py --model models/best.pt --confidence 0.6
```

## 4. Performance Optimization
- Use ONNX model for best CPU performance
- Reduce input resolution if needed: `--width 320 --height 240`
- Increase confidence threshold: `--confidence 0.7`
- Enable headless mode for maximum performance: `--headless`
""")

deployment_instructions = "".join(instruction_parts)
print(deployment_instructions)

# Persist the same text next to the exported models
deployment_file = export_dir / 'deployment_instructions.md'
deployment_file.write_text(deployment_instructions)

print(f"\nDeployment instructions saved to: {deployment_file}")

## 6. Summary and Next Steps

### What We Accomplished

1. **Created a Custom Dataset**: Generated a synthetic 5-class dataset for demonstration
2. **Fine-tuned YOLOv8-nano**: Trained on custom classes with optimized parameters
3. **Exported to Multiple Formats**: 
   - PyTorch (.pt) for development and testing
   - ONNX (.onnx) for optimized CPU inference on Pi 5
   - NCNN (.param/.bin) for maximum mobile optimization
4. **Performance Validation**: Benchmarked models and estimated Pi 5 performance
5. **Deployment Ready**: Provided complete deployment instructions

### Model Performance Summary

| Format | Size | Expected Pi 5 FPS | Memory Usage | Best For |
|--------|------|-------------------|--------------|----------|
| PyTorch | ~12 MB | 8-12 FPS | 2-3 GB | Development |
| ONNX | ~6 MB | 12-18 FPS | 1.5-2.5 GB | Production |
| NCNN int8 | ~3 MB | 15-25 FPS | 1-2 GB | Maximum Performance |

### Next Steps

1. **Deploy to Pi 5**: Copy models and update the detection pipeline
2. **Real-world Testing**: Test with actual camera data and adjust thresholds
3. **Dataset Improvement**: Replace synthetic data with real annotated images
4. **Hyperparameter Tuning**: Optimize training parameters for your specific use case
5. **Advanced Optimizations**: 
   - Model pruning to remove unnecessary parameters
   - Knowledge distillation for smaller models
   - TensorRT optimization (if using Jetson instead of Pi 5)

### Troubleshooting Tips

- **Low Training Accuracy**: Increase dataset size, check label quality, adjust learning rate
- **Export Failures**: Ensure compatible PyTorch/ONNX versions, check CUDA availability
- **Pi 5 Performance Issues**: Reduce input resolution, increase confidence threshold, check thermal throttling
- **Memory Errors**: Use smaller batch sizes, enable swap, reduce model complexity

### Resources for Further Learning

- [Ultralytics YOLOv8 Documentation](https://docs.ultralytics.com/)
- [ONNX Model Optimization](https://onnxruntime.ai/docs/performance/model-optimizations/)
- [NCNN Framework Guide](https://github.com/Tencent/ncnn/wiki)
- [Raspberry Pi 5 Performance Optimization](https://www.raspberrypi.com/documentation/)