In [None]:
# Install all required packages for this notebook
!pip install torch torchvision timm scikit-learn pandas seaborn matplotlib tqdm

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    cohen_kappa_score
)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os
from tqdm import tqdm
import random
import time
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add missing imports
from torchvision import transforms, datasets
import timm

# --- Configuration ---
# Input / output locations (relative to the working directory).
DATA_DIR = "dataset"
TRAINED_MODELS_DIR = "trained_models"
OUTPUT_DIR = "evaluation_results"

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# DataLoader settings used for every evaluated model.
BATCH_SIZE = 64
NUM_WORKERS = 8

print(f"Using device: {DEVICE}")
print(f"Available CPU cores: {os.cpu_count()}")

# Make sure the output tree exists before anything tries to write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for _subdir in ("confusion_matrices", "metrics", "visualizations"):
    os.makedirs(f"{OUTPUT_DIR}/{_subdir}", exist_ok=True)

# --- Helper Functions ---
def extract_image_size_from_model_name(model_name):
    """Guess the input resolution a model expects from its name.

    Resolution order:
      1. A three-digit number preceded by ``_`` and followed by ``_`` or ``.``
         (e.g. ``foo_256_bar`` -> 256).
      2. The substring ``224`` anywhere in the name -> 224.
      3. Otherwise the default of 128.

    Note that a three-digit suffix at the very end of the name is NOT matched
    by step 1 because the pattern requires a trailing ``_`` or ``.``.
    """
    embedded_size = re.search(r'_(\d{3})(?:_|\.)', model_name)
    if embedded_size is not None:
        return int(embedded_size.group(1))

    # No delimited size token: fall back to the 224 heuristic, then the default.
    return 224 if '224' in model_name else 128

def get_model_transforms(img_size):
    """Build the evaluation preprocessing pipeline for a square input size.

    The image is resized to ``int(img_size * 1.14)``, center-cropped to
    ``img_size``, converted to a tensor and normalized per channel.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    # Resize slightly larger than the target so the center crop trims borders.
    enlarged = int(img_size * 1.14)

    steps = [
        transforms.Resize(enlarged),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    return transforms.Compose(steps)

def get_available_models():
    """Return the sorted model names that have a checkpoint in TRAINED_MODELS_DIR.

    A checkpoint is any entry whose name ends with ``_best.pth``; the model
    name is the file name with exactly that suffix removed. Returns an empty
    list when TRAINED_MODELS_DIR is missing or is not a directory.
    """
    suffix = '_best.pth'

    # isdir (rather than exists) so a stray file with that name cannot make
    # os.listdir raise NotADirectoryError.
    if not os.path.isdir(TRAINED_MODELS_DIR):
        return []

    # Slice the suffix off instead of str.replace, which would also delete
    # occurrences of '_best.pth' in the middle of a file name.
    return sorted(
        entry[:-len(suffix)]
        for entry in os.listdir(TRAINED_MODELS_DIR)
        if entry.endswith(suffix)
    )

def calculate_model_size(model):
    """Return ``(size_mb, param_count)`` for a model.

    ``size_mb`` is the byte footprint of all parameters plus all buffers
    (``numel() * element_size()`` each), expressed in MiB. ``param_count`` is
    the total number of parameter elements; buffers are not counted in it.
    """
    parameters = list(model.parameters())

    param_count = sum(p.numel() for p in parameters)
    param_bytes = sum(p.numel() * p.element_size() for p in parameters)
    buffer_bytes = sum(b.numel() * b.element_size() for b in model.buffers())

    size_mb = (param_bytes + buffer_bytes) / 1024 / 1024
    return size_mb, param_count

def calculate_flops_estimate(model, img_size):
    """Return a very rough FLOP estimate in GFLOPs.

    This is a heuristic, not a measurement: it evaluates
    ``2 * parameter_count * img_size * img_size`` and scales the result by 1e9.
    """
    total_params = 0
    for tensor in model.parameters():
        total_params += tensor.numel()

    raw_flops = 2 * total_params * img_size * img_size
    return raw_flops / 1e9

def measure_inference_time(model, dataloader, device, num_batches=10):
    """Measure the average forward-pass time per batch, in milliseconds.

    Args:
        model: Module to time; it is switched to eval mode.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        device: ``torch.device`` the model lives on; batches are moved to it.
        num_batches: Maximum number of batches to time.

    Returns:
        Mean wall-clock time of one forward pass in milliseconds, or ``0.0``
        when the dataloader yields no batches (callers treat a non-positive
        value as "no measurement").

    The first batch is run once untimed as a warm-up and then timed like every
    other batch, so a loader with a single batch still produces a measurement.
    """
    model.eval()
    on_cuda = device.type == 'cuda'
    times = []

    def _sync():
        # CUDA kernels and non_blocking copies are asynchronous; without a
        # barrier the timer would measure only the kernel launch, or would
        # absorb work that was queued before the timer started.
        if on_cuda:
            torch.cuda.synchronize()

    with torch.no_grad():
        for i, (images, _) in enumerate(dataloader):
            if i >= num_batches:
                break

            images = images.to(device, non_blocking=True)

            # Warm up once so one-time initialisation costs are not timed.
            if i == 0:
                _ = model(images)

            _sync()  # make sure the transfer / warm-up has finished
            start_time = time.perf_counter()  # monotonic, high resolution
            _ = model(images)
            _sync()
            times.append(time.perf_counter() - start_time)

    if not times:
        # np.mean([]) would return NaN (with a RuntimeWarning).
        return 0.0

    return float(np.mean(times)) * 1000  # seconds -> milliseconds

# --- Evaluation Functions ---
def evaluate_model_comprehensive(model, val_loader, model_name, img_size):
    """Evaluate one model on the validation loader and collect all metrics.

    Args:
        model: Classifier already placed on ``DEVICE``.
        val_loader: DataLoader yielding ``(images, labels)`` validation batches.
        model_name: Name used in progress output and in the result dict.
        img_size: Input resolution, used for the FLOP estimate and recorded
            in the result.

    Returns:
        A JSON-serialisable dict with aggregate metrics (accuracy, weighted
        precision/recall/F1, Cohen's kappa, ROC-AUC or ``None``), deployment
        metrics (size, parameter count, FLOP estimate, inference time,
        throughput, efficiency score), the confusion matrix, the
        classification report, per-class metrics and the raw per-sample
        predictions, labels and class probabilities.

    Relies on the module-level ``full_dataset`` (set by ``main``) for the
    class-name / class-index mapping.
    """
    print(f"Evaluating {model_name}...")

    model.eval()

    # Deployment-oriented measurements.
    inference_time = measure_inference_time(model, val_loader, DEVICE)
    if not np.isfinite(inference_time):
        # A NaN/inf timing would poison the derived scores and the JSON output.
        inference_time = 0.0
    model_size_mb, param_count = calculate_model_size(model)
    flops_estimate = calculate_flops_estimate(model, img_size)

    # Evaluation loop
    all_preds = []
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Evaluating {model_name}"):
            images = images.to(DEVICE, non_blocking=True)

            outputs = model(images)
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    # Aggregate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="weighted", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="weighted", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=0)
    kappa = cohen_kappa_score(all_labels, all_preds)

    # Pin every per-class output to the full label set. Otherwise a class that
    # happens to be absent from this split shrinks the arrays / matrix and no
    # longer lines up with the class names.
    class_to_idx = full_dataset.class_to_idx
    class_names = list(class_to_idx.keys())
    label_ids = list(class_to_idx.values())

    precision_per_class = precision_score(all_labels, all_preds, labels=label_ids,
                                          average=None, zero_division=0)
    recall_per_class = recall_score(all_labels, all_preds, labels=label_ids,
                                    average=None, zero_division=0)
    f1_per_class = f1_score(all_labels, all_preds, labels=label_ids,
                            average=None, zero_division=0)

    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    cls_report = classification_report(all_labels, all_preds,
                                       labels=label_ids,
                                       target_names=class_names,
                                       output_dict=True, zero_division=0)

    # ROC-AUC is best-effort: scikit-learn raises ValueError when it is
    # undefined for the data (e.g. a class missing from the labels).
    try:
        roc_auc = roc_auc_score(all_labels, all_probs, multi_class="ovo", average="weighted")
    except ValueError as exc:
        print(f"ROC-AUC could not be computed for {model_name}: {exc}")
        roc_auc = None

    # Efficiency score (higher is better): accuracy / (model_size * inference_time)
    cost = model_size_mb * inference_time
    efficiency_score = accuracy / cost if cost > 0 else 0

    # Throughput (images per second), based on the loader's own batch size
    # when it exposes one, otherwise on the global default.
    batch_size = getattr(val_loader, "batch_size", None) or BATCH_SIZE
    throughput = (batch_size * 1000) / inference_time if inference_time > 0 else 0

    return {
        "model_name": model_name,
        "image_size": img_size,
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "cohen_kappa": float(kappa),
        # `is not None` so a legitimate score of 0.0 is not reported as missing.
        "roc_auc": float(roc_auc) if roc_auc is not None else None,
        "model_size_mb": float(model_size_mb),
        "parameter_count": int(param_count),
        "estimated_flops_gflops": float(flops_estimate),
        "inference_time_ms": float(inference_time),
        "throughput_fps": float(throughput),
        "efficiency_score": float(efficiency_score),
        "confusion_matrix": cm.tolist(),
        "classification_report": cls_report,
        "per_class_precision": precision_per_class.tolist(),
        "per_class_recall": recall_per_class.tolist(),
        "per_class_f1": f1_per_class.tolist(),
        "predictions": [int(x) for x in all_preds],
        "true_labels": [int(x) for x in all_labels],
        "class_probabilities": [[float(y) for y in x] for x in all_probs]
    }

# --- Visualization Functions ---
def plot_confusion_matrix(cm, class_names, model_name):
    """Render a confusion matrix as an annotated heatmap and save it as PNG.

    The image is written to
    ``{OUTPUT_DIR}/confusion_matrices/{model_name}_confusion_matrix.png``;
    the figure is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(12, 10))

    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={'label': 'Count'},
        ax=ax,
    )

    ax.set_title(f"Confusion Matrix - {model_name}", fontsize=16, fontweight='bold')
    ax.set_xlabel("Predicted Class", fontsize=12)
    ax.set_ylabel("True Class", fontsize=12)

    # Slant the x labels so long class names stay readable; keep y labels level.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)

    fig.tight_layout()
    fig.savefig(f"{OUTPUT_DIR}/confusion_matrices/{model_name}_confusion_matrix.png",
                dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_model_comparison(results_df):
    """Write the four cross-model comparison figures.

    Saved under ``{OUTPUT_DIR}/visualizations``:
      * ``model_comparison_metrics.png`` - 2x2 bar charts (accuracy, F1,
        model size, inference time), each sorted ascending by its own metric.
      * ``accuracy_vs_size_scatter.png`` - accuracy vs size, coloured by
        inference time.
      * ``efficiency_ranking.png`` - efficiency score ranking.
      * ``throughput_vs_accuracy.png`` - accuracy vs throughput, coloured by
        model size.
    """

    def _annotated_accuracy_scatter(x_column, colour_column, cmap_name,
                                    colour_label, x_label, title, filename):
        # Scatter of accuracy against x_column with every point labelled.
        plt.figure(figsize=(12, 8))
        points = plt.scatter(results_df[x_column], results_df['accuracy'],
                             c=results_df[colour_column], cmap=cmap_name,
                             s=100, alpha=0.7)

        labelled = zip(results_df['model_name'],
                       results_df[x_column],
                       results_df['accuracy'])
        for name, x_value, y_value in labelled:
            plt.annotate(name, (x_value, y_value),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        plt.colorbar(points, label=colour_label)
        plt.xlabel(x_label)
        plt.ylabel('Accuracy')
        plt.title(title, fontweight='bold')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{OUTPUT_DIR}/visualizations/{filename}",
                    dpi=300, bbox_inches='tight')
        plt.close()

    # 1. Performance metrics: one horizontal bar chart per metric.
    bar_panels = [
        ((0, 0), 'accuracy', 'Model Accuracy Comparison', 'Accuracy'),
        ((0, 1), 'f1_score', 'Model F1-Score Comparison', 'F1-Score'),
        ((1, 0), 'model_size_mb', 'Model Size Comparison', 'Model Size (MB)'),
        ((1, 1), 'inference_time_ms', 'Inference Time Comparison', 'Inference Time (ms)'),
    ]
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    for (row, col), column, title, x_label in bar_panels:
        ordered = results_df.sort_values(column, ascending=True)
        panel = axes[row, col]
        panel.barh(ordered['model_name'], ordered[column])
        panel.set_title(title, fontweight='bold')
        panel.set_xlabel(x_label)

    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/visualizations/model_comparison_metrics.png",
                dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Accuracy vs model size.
    _annotated_accuracy_scatter(
        'model_size_mb', 'inference_time_ms', 'viridis',
        'Inference Time (ms)', 'Model Size (MB)',
        'Model Accuracy vs Size (Color = Inference Time)',
        'accuracy_vs_size_scatter.png',
    )

    # 3. Efficiency score ranking, bars shaded along a red-to-green ramp.
    plt.figure(figsize=(12, 8))
    ranked = results_df.sort_values('efficiency_score', ascending=True)
    bars = plt.barh(ranked['model_name'], ranked['efficiency_score'])
    plt.title('Model Efficiency Score Ranking\n(Accuracy / (Model Size × Inference Time))',
              fontweight='bold')
    plt.xlabel('Efficiency Score')
    plt.grid(True, alpha=0.3)

    shades = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(bars)))
    for bar, shade in zip(bars, shades):
        bar.set_color(shade)

    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/visualizations/efficiency_ranking.png",
                dpi=300, bbox_inches='tight')
    plt.close()

    # 4. Throughput vs accuracy.
    _annotated_accuracy_scatter(
        'throughput_fps', 'model_size_mb', 'plasma',
        'Model Size (MB)', 'Throughput (FPS)',
        'Model Throughput vs Accuracy (Color = Model Size)',
        'throughput_vs_accuracy.png',
    )

def create_detailed_report(results_df, class_names):
    """Write the markdown evaluation report to ``{OUTPUT_DIR}/evaluation_report.md``.

    Args:
        results_df: One row per evaluated model, holding the metric columns
            produced by the evaluation step. Must contain at least one row.
        class_names: Class labels listed in the dataset section.

    The "Complete Results" table contains only the scalar summary columns;
    nested columns (confusion matrix, classification report, per-class lists,
    per-sample predictions and probabilities) are left out because they do
    not fit in a table cell.
    """

    def _table(frame):
        # DataFrame.to_markdown needs the optional `tabulate` package; fall
        # back to a fenced plain-text table rather than losing the report.
        try:
            return frame.to_markdown(index=False, floatfmt='.4f')
        except ImportError:
            plain = frame.to_string(index=False, float_format=lambda v: f"{v:.4f}")
            return "```\n" + plain + "\n```"

    nested_columns = [
        'confusion_matrix', 'classification_report',
        'per_class_precision', 'per_class_recall', 'per_class_f1',
        'predictions', 'true_labels', 'class_probabilities',
    ]
    summary_df = results_df.drop(columns=nested_columns, errors='ignore')

    top_accuracy = _table(results_df.nlargest(5, 'accuracy')[
        ['model_name', 'accuracy', 'f1_score', 'model_size_mb', 'inference_time_ms']])
    top_efficiency = _table(results_df.nlargest(5, 'efficiency_score')[
        ['model_name', 'efficiency_score', 'accuracy', 'model_size_mb', 'inference_time_ms']])
    fastest = _table(results_df.nsmallest(5, 'inference_time_ms')[
        ['model_name', 'inference_time_ms', 'throughput_fps', 'accuracy', 'model_size_mb']])
    smallest = _table(results_df.nsmallest(5, 'model_size_mb')[
        ['model_name', 'model_size_mb', 'parameter_count', 'accuracy', 'inference_time_ms']])
    complete = _table(summary_df)

    report = f"""# Comparative Analysis of Lightweight Vision Models for Tomato Disease Classification

## Executive Summary
This report presents a comprehensive evaluation of {len(results_df)} lightweight vision models for tomato disease classification, focusing on edge deployment capabilities.

## Dataset Information
- **Classes**: {len(class_names)} disease categories
- **Class Names**: {', '.join(class_names)}
- **Evaluation Split**: 20% validation set (consistent across all models)

## Model Performance Summary

### Top 5 Models by Accuracy
{top_accuracy}

### Top 5 Models by Efficiency Score
{top_efficiency}

### Top 5 Fastest Models (Lowest Inference Time)
{fastest}

### Top 5 Smallest Models
{smallest}

## Complete Results
{complete}

## Key Findings

1. **Best Overall Performance**: {results_df.loc[results_df['accuracy'].idxmax(), 'model_name']} achieved the highest accuracy of {results_df['accuracy'].max():.4f}

2. **Most Efficient**: {results_df.loc[results_df['efficiency_score'].idxmax(), 'model_name']} offers the best efficiency score of {results_df['efficiency_score'].max():.4f}

3. **Fastest Inference**: {results_df.loc[results_df['inference_time_ms'].idxmin(), 'model_name']} has the lowest inference time of {results_df['inference_time_ms'].min():.2f}ms

4. **Smallest Model**: {results_df.loc[results_df['model_size_mb'].idxmin(), 'model_name']} is the most compact at {results_df['model_size_mb'].min():.2f}MB

## Recommendations for Edge Deployment

Based on the analysis, the following models are recommended for different deployment scenarios:

- **High Accuracy Priority**: {results_df.nlargest(1, 'accuracy')['model_name'].iloc[0]}
- **Balanced Performance**: {results_df.nlargest(1, 'efficiency_score')['model_name'].iloc[0]}
- **Speed Priority**: {results_df.nsmallest(1, 'inference_time_ms')['model_name'].iloc[0]}
- **Memory Constrained**: {results_df.nsmallest(1, 'model_size_mb')['model_name'].iloc[0]}

## Methodology

All models were evaluated using:
- Consistent train/validation splits (80/20)
- Same preprocessing pipeline
- Identical evaluation metrics
- Hardware: {DEVICE}
- Batch size: {BATCH_SIZE}

Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}
"""

    # Explicit encoding: class names may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(f"{OUTPUT_DIR}/evaluation_report.md", "w", encoding="utf-8") as f:
        f.write(report)

# --- Main Execution ---
def main():
    """Evaluate every trained checkpoint and write metrics, plots and a report.

    Steps:
      1. Load the dataset index and rebuild the seeded 80/20 split; the last
         20% of the shuffled indices form the validation set.
      2. For each ``*_best.pth`` checkpoint: reuse cached JSON metrics when
         present, otherwise build the timm model, load the weights, evaluate
         and save the metrics JSON and confusion-matrix plot.
      3. Export the combined results (CSV, Excel), comparison plots and the
         markdown report.

    Side effects: sets the module-level ``full_dataset`` (read by the
    evaluation function), seeds the global ``random`` generator, and writes
    files under ``OUTPUT_DIR``. A failure on one model is reported and does
    not stop the remaining models.
    """
    # Load dataset and class info
    global full_dataset
    full_dataset = datasets.ImageFolder(root=DATA_DIR)
    class_names = list(full_dataset.class_to_idx.keys())
    num_classes = len(class_names)

    print(f"Found {num_classes} classes: {class_names}")

    # Create validation split (same as training)
    random.seed(42)
    indices = list(range(len(full_dataset)))
    random.shuffle(indices)
    split_point = int(0.8 * len(indices))
    val_indices = indices[split_point:]

    print(f"Validation set: {len(val_indices)} samples")

    # Get all available models
    model_names = get_available_models()
    print(f"Found {len(model_names)} trained models: {model_names}")

    if not model_names:
        print("No trained models found! Please check the trained_models directory.")
        return

    all_results = []

    # Evaluate each model
    for model_name in model_names:
        model = None
        try:
            print(f"\n{'='*60}")
            print(f"Processing: {model_name}")
            print(f"{'='*60}")

            # Reuse a previous evaluation of this model when one exists.
            json_output_path = f"{OUTPUT_DIR}/metrics/{model_name}_metrics.json"
            if os.path.exists(json_output_path):
                print(f"Results for {model_name} already exist. Skipping evaluation, loading from file.")
                with open(json_output_path, 'r') as f:
                    results = json.load(f)
                all_results.append(results)
                # Still generate the confusion matrix plot if it's missing
                cm_path = f"{OUTPUT_DIR}/confusion_matrices/{model_name}_confusion_matrix.png"
                if not os.path.exists(cm_path):
                    plot_confusion_matrix(np.array(results["confusion_matrix"]),
                                          class_names, model_name)
                continue

            # Check for the weights before doing any expensive setup.
            model_path = f"{TRAINED_MODELS_DIR}/{model_name}_best.pth"
            if not os.path.exists(model_path):
                print(f"Model weights not found: {model_path}")
                continue

            # Extract image size from model name
            img_size = extract_image_size_from_model_name(model_name)
            print(f"Using image size: {img_size}")

            # Create transforms and dataset for this model
            model_transforms = get_model_transforms(img_size)
            model_dataset = datasets.ImageFolder(root=DATA_DIR, transform=model_transforms)

            # Iterate the validation subset in a fixed order so the saved
            # per-sample predictions line up with val_indices on every run
            # (a random sampler would reorder them each time).
            val_subset = torch.utils.data.Subset(model_dataset, val_indices)
            val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE,
                                    shuffle=False, num_workers=NUM_WORKERS,
                                    pin_memory=DEVICE.type == "cuda")

            # Load model
            model = timm.create_model(model_name, pretrained=False, num_classes=num_classes)
            # NOTE(review): torch.load unpickles the file; only load checkpoints
            # from a trusted source (consider weights_only=True where supported).
            model.load_state_dict(torch.load(model_path, map_location=DEVICE))
            model = model.to(DEVICE)

            # Evaluate model
            results = evaluate_model_comprehensive(model, val_loader, model_name, img_size)
            all_results.append(results)

            # Save individual results
            with open(json_output_path, "w") as f:
                json.dump(results, f, indent=4)

            # Create confusion matrix
            plot_confusion_matrix(np.array(results["confusion_matrix"]),
                                  class_names, model_name)

            # Print summary
            print(f"\nResults Summary for {model_name}:")
            print(f"  Accuracy: {results['accuracy']:.4f}")
            print(f"  F1-Score: {results['f1_score']:.4f}")
            print(f"  Model Size: {results['model_size_mb']:.2f} MB")
            print(f"  Inference Time: {results['inference_time_ms']:.2f} ms")
            print(f"  Throughput: {results['throughput_fps']:.1f} FPS")
            print(f"  Efficiency Score: {results['efficiency_score']:.4f}")

        except Exception as e:
            # Per-model boundary: report and move on to the next checkpoint.
            print(f"Error processing {model_name}: {str(e)}")
            continue
        finally:
            # Release the model even when evaluation failed, so one bad
            # checkpoint does not keep memory pinned for the rest of the run.
            model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not all_results:
        print("No models were successfully evaluated!")
        return

    # Create comprehensive results DataFrame
    results_df = pd.DataFrame(all_results)

    # Save complete results
    results_df.to_csv(f"{OUTPUT_DIR}/complete_results.csv", index=False)

    # The workbook gets the scalar metrics only: list/dict cells (confusion
    # matrix, report, per-sample data) are not valid spreadsheet cell values.
    # The export is best-effort so a missing Excel engine cannot abort the
    # plots and the report that follow.
    spreadsheet_df = results_df.drop(
        columns=[
            "confusion_matrix", "classification_report",
            "per_class_precision", "per_class_recall", "per_class_f1",
            "predictions", "true_labels", "class_probabilities",
        ],
        errors="ignore",
    )
    try:
        spreadsheet_df.to_excel(f"{OUTPUT_DIR}/complete_results.xlsx", index=False)
    except ImportError as e:
        print(f"Skipping Excel export (missing optional dependency): {e}")

    # Create visualizations
    print("\nGenerating visualizations...")
    plot_model_comparison(results_df)

    # Create detailed report
    print("Creating detailed report...")
    create_detailed_report(results_df, class_names)

    # Print final summary
    print(f"\n{'='*80}")
    print("EVALUATION COMPLETE!")
    print(f"{'='*80}")
    print(f"Total models analyzed: {len(all_results)}")
    print(f"Results saved to: {OUTPUT_DIR}/")
    print(f"Best accuracy: {results_df['accuracy'].max():.4f} ({results_df.loc[results_df['accuracy'].idxmax(), 'model_name']})")
    print(f"Most efficient: {results_df['efficiency_score'].max():.4f} ({results_df.loc[results_df['efficiency_score'].idxmax(), 'model_name']})")
    print(f"Fastest inference: {results_df['inference_time_ms'].min():.2f}ms ({results_df.loc[results_df['inference_time_ms'].idxmin(), 'model_name']})")
    print(f"Smallest model: {results_df['model_size_mb'].min():.2f}MB ({results_df.loc[results_df['model_size_mb'].idxmin(), 'model_name']})")

    print(f"\nCheck the '{OUTPUT_DIR}' folder for:")
    print("- complete_results.csv/xlsx: All metrics in spreadsheet format")
    print("- evaluation_report.md: Detailed markdown report")
    print("- confusion_matrices/: Individual confusion matrices")
    print("- visualizations/: Comparison charts and plots")
    print("- metrics/: Individual model JSON results")

# Run the full evaluation pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()