In [None]:
# ===================================================================
# 🚀 QUICK TEST: verify configuration and dependencies
# ===================================================================
import os

print("🔍 TEST CONFIGURAZIONE NOTEBOOK 05")
print("="*50)

# Names of the checks that failed, so the closing message reflects reality.
failed_checks = []

# 1. Core import test. The outcome is remembered so that step 4 can be
#    skipped instead of raising NameError on an undefined `tf`.
print("1️⃣ Import TensorFlow...")
try:
    import tensorflow as tf
    print(f"   ✅ TensorFlow {tf.__version__}")
    tf_available = True
except Exception as e:
    print(f"   ❌ Errore TensorFlow: {e}")
    tf_available = False
    failed_checks.append("TensorFlow")

# 2. Remaining third-party libraries used by the analysis cells
print("2️⃣ Import altre librerie...")
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import classification_report, confusion_matrix
    print("   ✅ Tutte le librerie caricate")
except Exception as e:
    print(f"   ❌ Errore import: {e}")
    failed_checks.append("librerie")

# 3. Project layout check (paths are relative to the current working directory)
print("3️⃣ Verifica struttura progetto...")
current_dir = os.getcwd()
print(f"   Directory corrente: {current_dir}")

# Folders the notebook expects next to its own directory; they may be
# missing on Colab before the repository has been cloned.
expected_paths = [
    "../data/processed",
    "../models",
    "../src"
]

for path in expected_paths:
    if os.path.exists(path):
        print(f"   ✅ {path}")
    else:
        print(f"   ⚠️ {path} (potrebbe essere normale se in Colab)")

# 4. Compute devices (only meaningful when TensorFlow was imported)
print("4️⃣ Verifica dispositivi di calcolo...")
if tf_available:
    print(f"   GPU disponibili: {len(tf.config.list_physical_devices('GPU'))}")
    print(f"   CPU disponibili: {len(tf.config.list_physical_devices('CPU'))}")
else:
    print("   ⚠️ TensorFlow non disponibile: verifica dispositivi saltata")

if failed_checks:
    print(f"\n⚠️ Setup incompleto ({', '.join(failed_checks)}): risolvi gli errori prima di procedere.")
else:
    print("\n🎯 Setup completato! Procedi con le celle successive.")
print("="*50)

# Analisi e Valutazione Modello CNN

In questo notebook analizziamo in dettaglio le performance del modello CNN addestrato per la classificazione DeepWeeds.

## Obiettivi:
- Valutare le performance sui dati di test
- Analizzare le curve di training e overfitting
- Generare confusion matrix e classification report
- Analisi degli errori e confidence scores
- Visualizzazioni avanzate delle performance

# 1. Setup e Configurazione Ambiente

Importiamo le librerie e configuriamo i percorsi del progetto (ambiente locale o Google Colab). Il modello addestrato più recente viene caricato nella sezione 2.

In [1]:
# Import essenziali
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import os
import json
from datetime import datetime
import glob

# Plot styling: apply the matplotlib style sheet first, then the seaborn
# palette (the order matters because the style sheet resets the colour cycle).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report the library versions used for this run
for lib_name, lib_version in (("TensorFlow", tf.__version__),
                              ("Matplotlib", plt.matplotlib.__version__)):
    print(f"{lib_name} version: {lib_version}")

TensorFlow version: 2.19.0
Matplotlib version: 3.10.3


In [None]:
# GOOGLE COLAB COMPATIBILITY - SIMPLIFIED VERSION
# Defines PROJECT_PATH, MODELS_DIR and DATA_DIR for both Colab and local runs.
import os
import subprocess
import sys
from pathlib import Path

# Detect whether we are running on Google Colab
try:
    import google.colab  # type: ignore
    IN_COLAB = True
    print("🔍 Ambiente rilevato: Google Colab")
except ImportError:
    IN_COLAB = False
    print("🔍 Ambiente rilevato: Locale")

if IN_COLAB:
    print("📱 Configurazione Google Colab...")

    # Simple setup without Google Drive
    PROJECT_PATH = '/content/Progetto-Deep-Learning'

    # Clone the repository if it is not there yet. check=True makes a failed
    # clone raise CalledProcessError instead of being reported as a success.
    if not os.path.exists(PROJECT_PATH):
        print("📥 Clonazione repository...")
        subprocess.run(
            ['git', 'clone', 'https://github.com/Ame-76/Progetto-Deep-Learning.git', PROJECT_PATH],
            check=True,
        )
        print("✅ Repository clonato!")

    # Work from the project root and make the project's src/ importable
    # (guarded so re-running the cell does not duplicate the sys.path entry).
    os.chdir(PROJECT_PATH)
    src_path = f"{PROJECT_PATH}/src"
    if src_path not in sys.path:
        sys.path.append(src_path)

    # NOTE(review): nothing is downloaded here; the messages only remind the
    # user that models must already exist (e.g. trained with notebook 04).
    print("📥 Tentativo download modelli esistenti...")
    print("💡 Se non ci sono modelli, addestra prima con notebook 04")
else:
    # Local setup: the project root is taken to be the parent of the current
    # working directory — assumes the kernel was started in the notebook folder.
    PROJECT_PATH = str(Path.cwd().parent)

# Same layout in both environments
MODELS_DIR = f"{PROJECT_PATH}/models"
DATA_DIR = f"{PROJECT_PATH}/data"

if IN_COLAB:
    print("✅ Setup Colab completato!")
else:
    print("✅ Setup locale completato!")

print(f"📁 Progetto: {PROJECT_PATH}")
print(f"📁 Models: {MODELS_DIR}")
print(f"📁 Data: {DATA_DIR}")

# 2. Caricamento Dataset e Modello

Carichiamo i dataset preprocessati e il modello addestrato più recente.

In [None]:
print("=== CARICAMENTO DATASET E MODELLO ===")

# Resolve the processed-data directory. DATA_DIR exists only when the
# environment setup cell has been run; otherwise use the relative path.
if 'DATA_DIR' in globals():
    processed_dir = f"{DATA_DIR}/processed"
else:
    processed_dir = "../data/processed"

split_paths = [f"{processed_dir}/{split_name}"
               for split_name in ("train_dataset", "val_dataset", "test_dataset")]

if all(os.path.exists(split_path) for split_path in split_paths):

    print("✅ Caricamento dataset preprocessati...")
    # tf.data.Dataset.load supersedes the deprecated tf.data.experimental.load
    train_dataset, val_dataset, test_dataset = (
        tf.data.Dataset.load(split_path) for split_path in split_paths
    )

    # Input shape comes from the first training batch (batch axis dropped)
    first_batch = next(iter(train_dataset.take(1)), None)
    if first_batch is None:
        raise ValueError(f"Dataset di training vuoto: {split_paths[0]}")
    first_images, _ = first_batch
    INPUT_SHAPE = first_images.shape[1:]

    # Collect every label value that appears in any split
    all_labels = set()
    for split in (train_dataset, val_dataset, test_dataset):
        for _, batch_labels in split:
            all_labels.update(np.unique(batch_labels.numpy()).tolist())

    # Labels are used as class indices downstream (argmax comparison), so the
    # class count is the highest index + 1; this stays correct even when an
    # intermediate class never appears in the data.
    num_classes = int(max(all_labels)) + 1
    class_names = [f"Class_{i}" for i in range(num_classes)]

    print(f"   Dataset caricati: {num_classes} classi, shape {INPUT_SHAPE}")

else:
    print("⚠️ Dataset preprocessati non trovati!")
    print("💡 Esegui prima 03-preprocessing.ipynb")

    # Build a random stand-in dataset so the analysis cells can still be exercised
    print("🔄 Creazione dataset demo per test...")

    def create_demo_dataset(num_samples=100, batch_size=16,
                            image_shape=(256, 256, 3), num_classes=9, seed=42):
        """Build a batched tf.data.Dataset of random images and integer labels.

        Args:
            num_samples: number of random images to generate.
            batch_size: batch size of the returned dataset.
            image_shape: (height, width, channels) of each image.
            num_classes: labels are drawn uniformly from [0, num_classes).
            seed: seed of the NumPy generator, so the demo data is reproducible.

        Returns:
            Tuple ``(dataset, image_shape, num_classes)``.
        """
        rng = np.random.default_rng(seed)

        # Generate float32 directly to avoid a float64 intermediate copy
        images = rng.random((num_samples, *image_shape), dtype=np.float32)
        labels = rng.integers(0, num_classes, size=num_samples)

        dataset = tf.data.Dataset.from_tensor_slices((images, labels)).batch(batch_size)

        return dataset, tuple(image_shape), num_classes

    # The same random data stands in for all three splits
    train_dataset, INPUT_SHAPE, num_classes = create_demo_dataset()
    val_dataset = train_dataset
    test_dataset = train_dataset
    class_names = [f"Demo_Class_{i}" for i in range(num_classes)]

    print(f"   Dataset demo creato: {num_classes} classi, shape {INPUT_SHAPE}")

# Locate the most recent trained model.
# MODELS_DIR exists only when the environment setup cell has been run.
if 'MODELS_DIR' in globals():
    models_dir = MODELS_DIR
else:
    models_dir = "../models"

# Keep directories only: the pattern also matches plain files (for example a
# downloaded cnn_from_scratch_*.zip archive) that cannot contain checkpoints.
model_dirs = [candidate for candidate in glob.glob(f"{models_dir}/cnn_from_scratch_*")
              if os.path.isdir(candidate)]

if not model_dirs:
    print("⚠️ Nessun modello CNN trovato!")
    print("💡 Opzioni disponibili:")
    print("   1. Esegui 04-cnn-from-scratch.ipynb per addestrare un modello")
    print("   2. Scarica un modello da Colab e estrailo in models/")
    print("   3. Continua con un modello demo per testare il notebook")

    # Build an untrained stand-in model so the analysis cells can still run
    print("\n🔄 Creazione modello demo per test del notebook...")

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense

    # Small CNN, only meant to exercise the analysis code
    demo_model = Sequential([
        Conv2D(32, 3, activation='relu', input_shape=INPUT_SHAPE),
        MaxPooling2D(),
        Conv2D(64, 3, activation='relu'),
        MaxPooling2D(),
        GlobalAveragePooling2D(),
        Dense(num_classes, activation='softmax')
    ])

    demo_model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

    model = demo_model
    latest_model_dir = f"{models_dir}/demo_model"
    # Later cells write plots and reports into latest_model_dir, so it must exist
    os.makedirs(latest_model_dir, exist_ok=True)
    training_history = None
    training_info = None

    print("✅ Modello demo creato per test del notebook!")
    print("⚠️ Questo è solo per testare il codice di analisi")

else:
    # Load the existing trained model from the most recently created run directory
    latest_model_dir = max(model_dirs, key=os.path.getctime)
    model_path = f"{latest_model_dir}/checkpoints/best_model.h5"

    print(f"📥 Caricamento modello da: {latest_model_dir}")

    if os.path.exists(model_path):
        model = tf.keras.models.load_model(model_path)
        print("✅ Modello caricato con successo!")
    else:
        print(f"❌ Modello non trovato in: {model_path}")
        print("💡 Possibili cause:")
        print("   - Training non completato")
        print("   - File corrotti")
        print("   - Directory vuota")

        # Dump the run directory tree to help diagnose what is missing
        print(f"\n🔍 Contenuto directory {latest_model_dir}:")
        for root, dirs, files in os.walk(latest_model_dir):
            level = root.replace(latest_model_dir, '').count(os.sep)
            indent = ' ' * 2 * level
            print(f"{indent}{os.path.basename(root)}/")
            subindent = ' ' * 2 * (level + 1)
            for file in files:
                print(f"{subindent}{file}")

        raise FileNotFoundError(
            f"Modello non trovato in: {model_path}. Esegui prima 04-cnn-from-scratch.ipynb"
        )

    # Optional training metadata saved next to the checkpoints
    training_info_path = f"{latest_model_dir}/training_info.json"
    training_log_path = f"{latest_model_dir}/training_log.csv"

    training_info = None
    training_history = None

    if os.path.exists(training_info_path):
        with open(training_info_path, 'r', encoding='utf-8') as f:
            training_info = json.load(f)
        # The ':.4f' format spec only works on numbers; fall back to 'N/A'
        # when the key is missing instead of raising ValueError.
        best_val_logged = training_info.get('best_val_accuracy')
        if isinstance(best_val_logged, (int, float)):
            best_val_text = f"{best_val_logged:.4f}"
        else:
            best_val_text = 'N/A'
        print(f"   Training completato in: {training_info.get('duration_seconds', 0):.0f} secondi")
        print(f"   Epoch totali: {training_info.get('total_epochs', 'N/A')}")
        print(f"   Migliore val_accuracy: {best_val_text}")

    if os.path.exists(training_log_path):
        training_history = pd.read_csv(training_log_path)
        print(f"   Log training disponibile: {len(training_history)} epoch")
    else:
        print("   Log training non disponibile")

# The preprocessed datasets are used only when all three splits exist,
# so the status line checks the same condition.
datasets_preprocessed = all(
    os.path.exists(f"{processed_dir}/{split_name}")
    for split_name in ("train_dataset", "val_dataset", "test_dataset")
)

print("\n📋 SETUP COMPLETATO:")
print(f"   Modello: {'✅ Addestrato' if model_dirs else '🔄 Demo'}")
print(f"   Dataset: {'✅ Preprocessato' if datasets_preprocessed else '🔄 Demo'}")
print(f"   Classi: {num_classes}")
print(f"   Input shape: {INPUT_SHAPE}")

=== CARICAMENTO DATASET E MODELLO ===
✅ Caricamento dataset preprocessati...
Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.
   Dataset caricati: 9 classi, shape (256, 256, 3)
📥 Caricamento modello da: ../models\cnn_from_scratch_20250729_174718
   Dataset caricati: 9 classi, shape (256, 256, 3)
📥 Caricamento modello da: ../models\cnn_from_scratch_20250729_174718


FileNotFoundError: Modello non trovato in: ../models\cnn_from_scratch_20250729_174718/checkpoints/best_model.h5

# 3. Analisi Curve di Training

Analizziamo l'andamento del training attraverso le curve di loss e accuracy.

In [None]:
if training_history is not None:
    print("=== ANALISI CURVE DI TRAINING ===")

    # Per-epoch series read from the training log (epochs shown 1-based)
    epochs_range = range(1, len(training_history) + 1)
    train_accuracy = training_history['accuracy']
    val_accuracy = training_history['val_accuracy']
    train_loss = training_history['loss']
    val_loss = training_history['val_loss']

    # Top-3 curves are plotted only when BOTH the train and validation columns
    # exist; checking just one of them could raise KeyError on the other.
    has_top3 = {'top3_accuracy', 'val_top3_accuracy'}.issubset(training_history.columns)
    if has_top3:
        train_top3 = training_history['top3_accuracy']
        val_top3 = training_history['val_top3_accuracy']

    # Always a 2x2 grid; only the figure size depends on the fourth panel's content
    fig_size = (16, 12) if has_top3 else (12, 8)
    fig, axes = plt.subplots(2, 2, figsize=fig_size)

    # 1. Accuracy curves
    axes[0, 0].plot(epochs_range, train_accuracy, 'b-', label='Training Accuracy', linewidth=2)
    axes[0, 0].plot(epochs_range, val_accuracy, 'r-', label='Validation Accuracy', linewidth=2)
    axes[0, 0].set_title('Model Accuracy', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Epoch')
    axes[0, 0].set_ylabel('Accuracy')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].set_ylim([0, 1])

    # 2. Loss curves
    axes[0, 1].plot(epochs_range, train_loss, 'b-', label='Training Loss', linewidth=2)
    axes[0, 1].plot(epochs_range, val_loss, 'r-', label='Validation Loss', linewidth=2)
    axes[0, 1].set_title('Model Loss', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Epoch')
    axes[0, 1].set_ylabel('Loss')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Overfitting analysis: train/validation accuracy gap against a 0.1 reference line
    accuracy_gap = train_accuracy - val_accuracy
    axes[1, 0].plot(epochs_range, accuracy_gap, 'red', linewidth=2, label='Train - Val Accuracy')
    axes[1, 0].axhline(y=0, color='black', linestyle='--', alpha=0.5)
    axes[1, 0].axhline(y=0.1, color='orange', linestyle='--', alpha=0.5, label='Overfitting Threshold')
    axes[1, 0].set_title('Overfitting Analysis', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Epoch')
    axes[1, 0].set_ylabel('Accuracy Gap')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Top-3 accuracy when logged, otherwise smoothed accuracy trends
    if has_top3:
        axes[1, 1].plot(epochs_range, train_top3, 'g-', label='Training Top-3', linewidth=2)
        axes[1, 1].plot(epochs_range, val_top3, 'orange', label='Validation Top-3', linewidth=2)
        axes[1, 1].set_title('Top-3 Accuracy', fontsize=14, fontweight='bold')
        axes[1, 1].set_ylabel('Top-3 Accuracy')
        axes[1, 1].set_ylim([0, 1])
    else:
        # Centred rolling mean over ~10% of the epochs. min_periods=1 keeps the
        # first/last epochs on the curve instead of turning them into NaN gaps.
        window = max(1, len(train_accuracy) // 10)
        smooth_train_acc = train_accuracy.rolling(window=window, center=True, min_periods=1).mean()
        smooth_val_acc = val_accuracy.rolling(window=window, center=True, min_periods=1).mean()

        axes[1, 1].plot(epochs_range, smooth_train_acc, 'b-', label='Smooth Train Acc', linewidth=2)
        axes[1, 1].plot(epochs_range, smooth_val_acc, 'r-', label='Smooth Val Acc', linewidth=2)
        axes[1, 1].set_title('Smoothed Accuracy Trends', fontsize=14, fontweight='bold')
        axes[1, 1].set_ylabel('Smoothed Accuracy')

    axes[1, 1].set_xlabel('Epoch')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f"{latest_model_dir}/detailed_training_analysis.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Summary statistics (also consumed by the final summary cell)
    final_train_acc = train_accuracy.iloc[-1]
    final_val_acc = val_accuracy.iloc[-1]
    best_val_acc = val_accuracy.max()
    # idxmax is a 0-based row label of the log read from CSV; +1 gives the epoch number
    best_val_epoch = int(val_accuracy.idxmax()) + 1
    final_overfitting_gap = final_train_acc - final_val_acc

    print("📈 STATISTICHE TRAINING:")
    print(f"   Accuracy finale - Train: {final_train_acc:.4f}, Val: {final_val_acc:.4f}")
    print(f"   Migliore Val Accuracy: {best_val_acc:.4f} (epoch {best_val_epoch})")
    print(f"   Gap finale Train-Val: {final_overfitting_gap:.4f}")

    if final_overfitting_gap > 0.1:
        print("⚠️  Possibile overfitting (gap > 0.1)")
    else:
        print("✅ Overfitting sotto controllo")

else:
    print("⚠️ Storia del training non disponibile - saltando analisi curve")

# 4. Valutazione sul Test Set

Valutiamo le performance finali del modello sui dati di test mai visti durante il training.

In [None]:
print("=== VALUTAZIONE SUL TEST SET ===")

# Quantitative evaluation
print("🔍 Valutazione in corso...")
test_results = model.evaluate(test_dataset, verbose=1)

# Metric order follows the model's compile() call.
# NOTE(review): assumes [loss, accuracy, (top-3 accuracy)] — confirm against
# the training notebook.
test_loss = test_results[0]
test_accuracy = test_results[1]
test_top3_accuracy = test_results[2] if len(test_results) > 2 else None

print("\n📊 RISULTATI FINALI SUL TEST SET:")
print(f"   Test Loss: {test_loss:.4f}")
print(f"   Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
# Compare against None: a legitimate value of 0.0 must still be reported
if test_top3_accuracy is not None:
    print(f"   Test Top-3 Accuracy: {test_top3_accuracy:.4f} ({test_top3_accuracy*100:.2f}%)")

# Per-sample predictions for the detailed analysis. Labels and predictions are
# collected in the same pass so they stay aligned batch by batch.
print("\n🎯 Generazione predizioni dettagliate...")
true_batches = []
proba_batches = []

for batch_images, batch_labels in test_dataset:
    proba_batches.append(model.predict(batch_images, verbose=0))
    # Flatten so labels shaped (batch, 1) do not broadcast against the
    # 1-D predictions in the comparisons below
    true_batches.append(batch_labels.numpy().ravel())

if not true_batches:
    raise ValueError("Test set vuoto: impossibile generare le predizioni")

y_true = np.concatenate(true_batches)
y_pred_proba = np.concatenate(proba_batches, axis=0)
y_pred = np.argmax(y_pred_proba, axis=1)

print(f"✅ Analisi completata su {len(y_true)} campioni di test")

# Manual accuracy as a sanity check against model.evaluate
manual_accuracy = np.mean(y_true == y_pred)
print(f"   Verifica accuracy: {manual_accuracy:.4f}")

# 5. Classification Report e Confusion Matrix

Generiamo analisi dettagliate per classe delle performance del modello.

In [None]:
print("=== CLASSIFICATION REPORT ===")

# Fix the label set explicitly so the report and the confusion matrix always
# have one row per entry of class_names, even when a class never occurs in
# the test set or in the predictions.
label_ids = list(range(len(class_names)))

class_report = classification_report(
    y_true, y_pred,
    labels=label_ids,
    target_names=class_names,
    digits=4,
    zero_division=0
)
print(class_report)

# Save the report
with open(f"{latest_model_dir}/classification_report.txt", 'w', encoding='utf-8') as f:
    f.write(class_report)

print("\n=== CONFUSION MATRIX ===")
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Confusion matrix heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(14, 12))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Numero di Campioni'}
)
plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold')
plt.xlabel('Predetto', fontsize=12)
plt.ylabel('Reale', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(f"{latest_model_dir}/confusion_matrix.png", dpi=300, bbox_inches='tight')
plt.show()

# Per-class accuracy = correct / support. Classes with no test samples get
# 0.0 instead of a NaN produced by dividing by zero.
class_support = cm.sum(axis=1)
class_accuracies = np.divide(
    cm.diagonal(), class_support,
    out=np.zeros(len(class_support), dtype=float),
    where=class_support > 0
)

class_df = pd.DataFrame({
    'Classe': class_names,
    'Accuracy': class_accuracies,
    'Support': class_support,
    'Correct': cm.diagonal(),
    'Total': class_support
}).sort_values('Accuracy', ascending=False)

print("\n📊 PERFORMANCE PER CLASSE:")
print(class_df.to_string(index=False, float_format='%.4f'))

# Bar chart of per-class accuracy
plt.figure(figsize=(14, 8))
bars = plt.bar(range(len(class_names)), class_accuracies,
               color='lightcoral', alpha=0.7, edgecolor='black')

plt.xlabel('Classi', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Accuracy per Classe sul Test Set', fontsize=14, fontweight='bold')
plt.xticks(range(len(class_names)), class_names, rotation=45, ha='right')
plt.ylim(0, 1.1)
plt.grid(axis='y', alpha=0.3)

# Annotate each bar with its accuracy and support
for bar, acc, support in zip(bars, class_accuracies, class_support):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.02,
             f'{acc:.3f}\n({support})', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(f"{latest_model_dir}/accuracy_per_class.png", dpi=300, bbox_inches='tight')
plt.show()

# 6. Analisi degli Errori e Confidence

Analizziamo gli errori del modello e la distribuzione dei confidence scores.

In [None]:
print("=== ANALISI CONFIDENCE E ERRORI ===")

# Confidence = highest predicted probability of each sample
max_confidences = np.max(y_pred_proba, axis=1)
correct_predictions = (y_true == y_pred)

# Top-3 accuracy recomputed from the probabilities (needs at least 3 classes)
if y_pred_proba.shape[1] >= 3:
    top3_predictions = np.argsort(y_pred_proba, axis=1)[:, -3:]
    top3_correct = (top3_predictions == np.reshape(y_true, (-1, 1))).any(axis=1)
    top3_accuracy_manual = np.mean(top3_correct)
    print(f"Top-3 Accuracy (calcolata): {top3_accuracy_manual:.4f}")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Confidence distribution, split by prediction outcome. An empty subset is
#    skipped because a density histogram of no samples is undefined.
for subset_mask, subset_label, subset_color in (
    (correct_predictions, 'Predizioni Corrette', 'green'),
    (~correct_predictions, 'Predizioni Errate', 'red'),
):
    if subset_mask.any():
        axes[0, 0].hist(max_confidences[subset_mask], bins=30, alpha=0.7,
                        label=subset_label, color=subset_color, density=True)
axes[0, 0].set_xlabel('Confidence Score')
axes[0, 0].set_ylabel('Densità')
axes[0, 0].set_title('Distribuzione Confidence Scores')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# 2. Accuracy within each confidence bin
confidence_bins = np.linspace(0, 1, 11)
bin_accuracies = []
bin_counts = []

n_bins = len(confidence_bins) - 1
for i in range(n_bins):
    lower, upper = confidence_bins[i], confidence_bins[i+1]
    if i == n_bins - 1:
        # The last bin is closed on the right so that predictions with
        # confidence exactly 1.0 are counted rather than dropped.
        mask = (max_confidences >= lower) & (max_confidences <= upper)
    else:
        mask = (max_confidences >= lower) & (max_confidences < upper)
    bin_count = int(np.sum(mask))
    bin_accuracy = np.mean(correct_predictions[mask]) if bin_count > 0 else 0
    bin_accuracies.append(bin_accuracy)
    bin_counts.append(bin_count)

bin_centers = (confidence_bins[:-1] + confidence_bins[1:]) / 2
bars = axes[0, 1].bar(bin_centers, bin_accuracies, width=0.08, alpha=0.7, color='orange')
axes[0, 1].set_xlabel('Confidence Range')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].set_title('Accuracy vs Confidence')
axes[0, 1].set_ylim(0, 1)
axes[0, 1].grid(alpha=0.3)

# Sample count on top of every non-empty bin
for bar, count in zip(bars, bin_counts):
    if count > 0:
        height = bar.get_height()
        axes[0, 1].text(bar.get_x() + bar.get_width()/2., height + 0.02,
                        f'{count}', ha='center', va='bottom', fontsize=8)

# 3. Accuracy and coverage when only predictions above a threshold are kept
thresholds = np.linspace(0.5, 1.0, 11)
accuracies_at_threshold = []
coverage_at_threshold = []

for threshold in thresholds:
    mask = max_confidences >= threshold
    if np.sum(mask) > 0:
        acc = np.mean(correct_predictions[mask])
        coverage = np.sum(mask) / len(mask)
    else:
        acc = 0
        coverage = 0
    accuracies_at_threshold.append(acc)
    coverage_at_threshold.append(coverage)

axes[1, 0].plot(thresholds, accuracies_at_threshold, 'b-o', label='Accuracy', linewidth=2)
axes[1, 0].plot(thresholds, coverage_at_threshold, 'r-s', label='Coverage', linewidth=2)
axes[1, 0].set_xlabel('Confidence Threshold')
axes[1, 0].set_ylabel('Metric Value')
axes[1, 0].set_title('Accuracy-Coverage Trade-off')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)
axes[1, 0].set_ylim(0, 1)

# 4. Most confused class pairs (off-diagonal entries of the confusion matrix).
#    Rows are normalised by true-class support; rows with no samples stay at 0
#    instead of producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(cm, row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals > 0)
np.fill_diagonal(cm_normalized, 0)  # drop the diagonal (correct predictions)

off_diagonal_counts = cm.copy()
np.fill_diagonal(off_diagonal_counts, 0)

# (true index, predicted index, count, error rate), most frequent first
confused_pairs = [
    (int(i), int(j), int(cm[i, j]), float(cm_normalized[i, j]))
    for i, j in np.argwhere(off_diagonal_counts > 0)
]
confused_pairs.sort(key=lambda pair: pair[2], reverse=True)
top_confusions = confused_pairs[:10]

if top_confusions:
    confusion_data = [
        {
            'True Class': class_names[true_idx],
            'Predicted Class': class_names[pred_idx],
            'Count': count,
            'Error Rate': rate
        }
        for true_idx, pred_idx, count, rate in top_confusions
    ]

    confusion_df = pd.DataFrame(confusion_data)

    # Horizontal bar chart of the main confusions
    y_pos = np.arange(len(confusion_df))
    axes[1, 1].barh(y_pos, confusion_df['Count'], alpha=0.7, color='salmon')
    axes[1, 1].set_yticks(y_pos)
    axes[1, 1].set_yticklabels([f"{entry['True Class']} → {entry['Predicted Class']}"
                                for entry in confusion_data], fontsize=8)
    axes[1, 1].set_xlabel('Numero di Errori')
    axes[1, 1].set_title('Top 10 Confusioni tra Classi')
    axes[1, 1].grid(axis='x', alpha=0.3)
else:
    axes[1, 1].text(0.5, 0.5, 'Nessuna confusione\nsignificativa rilevata',
                    ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=12)
    axes[1, 1].set_title('Analisi Confusioni')

plt.tight_layout()
plt.savefig(f"{latest_model_dir}/error_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# Confidence statistics. The mean of an empty subset is reported as NaN
# explicitly, without triggering NumPy's empty-slice warning.
if correct_predictions.any():
    avg_confidence_correct = float(np.mean(max_confidences[correct_predictions]))
else:
    avg_confidence_correct = float('nan')
if (~correct_predictions).any():
    avg_confidence_wrong = float(np.mean(max_confidences[~correct_predictions]))
else:
    avg_confidence_wrong = float('nan')

print("\n📈 STATISTICHE CONFIDENCE:")
print(f"   Confidence media (corrette): {avg_confidence_correct:.4f}")
print(f"   Confidence media (errate): {avg_confidence_wrong:.4f}")
print(f"   Differenza: {avg_confidence_correct - avg_confidence_wrong:.4f}")

if top_confusions:
    print("\n🎯 TOP 5 CONFUSIONI:")
    for i, (true_idx, pred_idx, count, rate) in enumerate(top_confusions[:5]):
        print(f"   {i+1}. {class_names[true_idx]} → {class_names[pred_idx]}: "
              f"{count} errori ({rate:.2%})")
# 7. Riepilogo Finale e Salvataggio Risultati

Generiamo un riepilogo completo delle performance e salviamo tutti i risultati.

In [None]:
print("=== RIEPILOGO FINALE ===")


def _json_float(value):
    """Return ``value`` as a float, or None when it is NaN/inf.

    json.dump would otherwise write the non-standard tokens NaN/Infinity,
    which strict JSON parsers reject.
    """
    value = float(value)
    return value if np.isfinite(value) else None


# One timestamp shared by the JSON results and the text summary
evaluation_time = datetime.now()

# Collect all metrics
final_results = {
    'model_path': latest_model_dir,
    'evaluation_date': evaluation_time.isoformat(),
    'test_samples': int(len(y_true)),
    'num_classes': int(num_classes),
    'test_accuracy': _json_float(test_accuracy),
    'test_loss': _json_float(test_loss),
    'avg_confidence_correct': _json_float(avg_confidence_correct),
    'avg_confidence_wrong': _json_float(avg_confidence_wrong),
    'confidence_gap': _json_float(avg_confidence_correct - avg_confidence_wrong),
}

# Compare against None: a legitimate value of 0.0 must still be recorded
if test_top3_accuracy is not None:
    final_results['test_top3_accuracy'] = _json_float(test_top3_accuracy)

# Training statistics exist only when the training-curve cell had a log to analyse
if training_history is not None:
    final_results.update({
        'final_train_accuracy': _json_float(final_train_acc),
        'final_val_accuracy': _json_float(final_val_acc),
        'best_val_accuracy': _json_float(best_val_acc),
        'best_val_epoch': int(best_val_epoch),
        'overfitting_gap': _json_float(final_overfitting_gap),
        'total_training_epochs': len(training_history)
    })

# Per-class performance
class_metrics = {}
for i, class_name in enumerate(class_names):
    class_metrics[class_name] = {
        'accuracy': _json_float(class_accuracies[i]),
        'support': int(class_support[i]),
        'correct_predictions': int(cm.diagonal()[i])
    }

final_results['class_performance'] = class_metrics

# Save the complete results
results_path = f"{latest_model_dir}/complete_evaluation_results.json"
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(final_results, f, indent=4)

# Build the plain-text summary
summary_text = f"""
RIEPILOGO VALUTAZIONE MODELLO CNN
================================

Data Valutazione: {evaluation_time.strftime('%Y-%m-%d %H:%M:%S')}
Modello: {latest_model_dir}

PERFORMANCE TEST SET:
- Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)
- Loss: {test_loss:.4f}
- Campioni testati: {len(y_true):,}
"""

if test_top3_accuracy is not None:
    summary_text += f"- Top-3 Accuracy: {test_top3_accuracy:.4f} ({test_top3_accuracy*100:.2f}%)\n"

if training_history is not None:
    summary_text += f"""
TRAINING INFO:
- Epoch totali: {len(training_history)}
- Migliore Val Accuracy: {best_val_acc:.4f} (epoch {best_val_epoch})
- Overfitting gap finale: {final_overfitting_gap:.4f}
"""

summary_text += f"""
CONFIDENCE ANALYSIS:
- Confidence media (corrette): {avg_confidence_correct:.4f}
- Confidence media (errate): {avg_confidence_wrong:.4f}
- Gap confidence: {avg_confidence_correct - avg_confidence_wrong:.4f}

PERFORMANCE PER CLASSE:
"""

for class_name, acc in zip(class_names, class_accuracies):
    summary_text += f"- {class_name}: {acc:.4f}\n"

summary_text += """
FILES GENERATI:
- Risultati completi: complete_evaluation_results.json
- Classification report: classification_report.txt
- Confusion matrix: confusion_matrix.png
"""
# The training plot is produced only when a training log was available
if training_history is not None:
    summary_text += "- Analisi training: detailed_training_analysis.png\n"
summary_text += "- Analisi errori: error_analysis.png\n"
summary_text += "- Accuracy per classe: accuracy_per_class.png\n"

# Save the summary
summary_path = f"{latest_model_dir}/evaluation_summary.txt"
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write(summary_text)

print(summary_text)

print("\n✅ VALUTAZIONE COMPLETATA!")
print(f"📁 Tutti i file salvati in: {latest_model_dir}")
print("📊 Risultati principali:")
print(f"   - Test Accuracy: {test_accuracy:.4f}")
print(f"   - Confidence Gap: {avg_confidence_correct - avg_confidence_wrong:.4f}")
if training_history is not None:
    print(f"   - Overfitting: {'⚠️ Presente' if final_overfitting_gap > 0.1 else '✅ Controllato'}")