1. Import Necessary Libraries

In [None]:
# ## 1. Import Necessary Libraries

# %%
# Suppress all warnings for the whole session (this also hides library
# warnings such as scikit-learn's undefined-metric notices).
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from typing import List, Tuple, Dict, Any, Optional
import os
import json
import pickle
from datetime import datetime

# Machine-learning metrics
from sklearn.metrics import (classification_report, confusion_matrix, 
                           precision_recall_curve, roc_curve, auc, 
                           precision_score, recall_score, f1_score, 
                           accuracy_score)
from sklearn.preprocessing import label_binarize

# TensorFlow
# NOTE: an ImportError is only printed here; `models` and `to_categorical`
# stay undefined in that case, so later code that uses them raises NameError.
try:
    import tensorflow as tf
    from tensorflow.keras import models
    from tensorflow.keras.utils import to_categorical
except ImportError as e:
    print(f"TensorFlow import error: {e}")

# Plotting defaults
plt.style.use('ggplot')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
# Seed NumPy's global random generator.
np.random.seed(42)

print("Libraries imported successfully!")

2. Load Data and Models

In [None]:
def load_evaluation_data(data_path: str = '../data/augmented') -> Tuple:
    """Load the test arrays and class names used for evaluation.

    Reads ``x_test_aug.npy``, ``y_test_aug.npy`` and ``class_names.txt`` from
    *data_path* and one-hot encodes the labels with ``to_categorical``.

    Args:
        data_path: Directory that contains the three files listed above.

    Returns:
        A tuple ``(x_test, y_test, y_test_cat, class_names, num_classes)``.

    Raises:
        ValueError: If ``class_names.txt`` contains no class names.
        OSError: Propagated from ``np.load`` / ``open`` when a file cannot be
            read (for example ``FileNotFoundError``).
    """
    print("Loading evaluation data...")

    x_test = np.load(os.path.join(data_path, 'x_test_aug.npy'))
    y_test = np.load(os.path.join(data_path, 'y_test_aug.npy'))

    with open(os.path.join(data_path, 'class_names.txt'), 'r', encoding='utf-8') as f:
        # Skip blank lines: an empty (e.g. trailing) line would otherwise be
        # counted as an extra class and inflate num_classes.
        class_names = [name for name in (line.strip() for line in f) if name]

    if not class_names:
        raise ValueError(
            f"No class names found in {os.path.join(data_path, 'class_names.txt')}"
        )

    num_classes = len(class_names)
    y_test_cat = to_categorical(y_test, num_classes)

    print(f"Test data: {x_test.shape} -> {y_test_cat.shape}")
    print(f"Number of classes: {num_classes}")

    return x_test, y_test, y_test_cat, class_names, num_classes

def load_trained_models(results_path: str = '../results') -> Dict[str, Any]:
    """Read the saved training artefacts from *results_path*.

    Loads ``best_model_info.json``, ``training_results.csv``,
    ``validation_results.csv`` and ``training_history.json``.

    Args:
        results_path: Directory that holds the result files.

    Returns:
        A dict with the keys ``best_model``, ``training_results``,
        ``validation_results`` and ``training_history``. If any file is
        missing, a message is printed and an empty dict is returned.
    """
    print("Loading trained models and results...")

    def _read_json(filename: str) -> Any:
        with open(os.path.join(results_path, filename), 'r') as handle:
            return json.load(handle)

    def _read_csv(filename: str) -> pd.DataFrame:
        return pd.read_csv(os.path.join(results_path, filename))

    loaded: Dict[str, Any] = {}

    try:
        best_info = _read_json('best_model_info.json')
        train_table = _read_csv('training_results.csv')
        val_table = _read_csv('validation_results.csv')
        history = _read_json('training_history.json')
    except FileNotFoundError as e:
        print(f"Results files not found: {e}")
    else:
        # Populate only after every file was read, so a missing file never
        # leaves a partially filled result.
        loaded['best_model'] = best_info
        loaded['training_results'] = train_table
        loaded['validation_results'] = val_table
        loaded['training_history'] = history
        print("Model results loaded successfully!")

    return loaded

def load_model_predictions(model_path: str, model_name: str,
                          x_test: np.ndarray, class_names: List[str]) -> Optional[Dict[str, Any]]:
    """Load ``<model_name>_best.h5`` from *model_path* and predict on *x_test*.

    Args:
        model_path: Directory containing the saved model file.
        model_name: Model identifier; the file ``f"{model_name}_best.h5"`` is loaded.
        x_test: Input array passed to ``model.predict``.
        class_names: Class names (not used by this function; kept for
            interface compatibility).

    Returns:
        A dict with the keys ``model``, ``y_pred`` (argmax over axis 1),
        ``y_pred_proba`` (raw ``predict`` output), ``confidence_scores``
        (max over axis 1) and ``model_name``; or ``None`` when loading or
        predicting fails.
    """
    model_file = os.path.join(model_path, f"{model_name}_best.h5")

    try:
        model = models.load_model(model_file)

        # Run inference on the test inputs
        y_pred_proba = model.predict(x_test, verbose=0)
        y_pred = np.argmax(y_pred_proba, axis=1)

        # Confidence = highest score per sample
        confidence_scores = np.max(y_pred_proba, axis=1)

        predictions = {
            'model': model,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba,
            'confidence_scores': confidence_scores,
            'model_name': model_name
        }

        print(f"Predictions completed for {model_name}")
        return predictions

    except Exception as e:
        # Deliberately best-effort: a model that cannot be loaded or run is
        # reported and skipped so the remaining models can still be evaluated.
        print(f"Error loading model {model_name}: {e}")
        return None

# Load the evaluation data (uses the default data path of load_evaluation_data)
x_test, y_test, y_test_cat, CLASS_NAMES, NUM_CLASSES = load_evaluation_data()

# Load the saved model information and training results
models_info = load_trained_models()

3. Comprehensive Model Evaluation

In [None]:
def evaluate_model_performance(y_true: np.ndarray, y_pred: np.ndarray,
                             y_pred_proba: np.ndarray, class_names: List[str]) -> Dict[str, Any]:
    """Compute overall metrics, a confusion matrix and one-vs-rest ROC data.

    The label set is pinned to ``0 .. len(class_names) - 1`` so that the
    confusion matrix, the classification report and the per-class metric
    arrays always have one entry per class name, even when a class does not
    occur in ``y_true`` / ``y_pred``.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        y_pred_proba: Per-class scores, indexed as ``y_pred_proba[:, i]`` for
            class ``i``.
        class_names: Display name of every class, in label order.

    Returns:
        A dict with ``accuracy``, weighted ``precision`` / ``recall`` /
        ``f1_score``, ``classification_report`` (dict form),
        ``confusion_matrix``, ``roc_curves`` (``fpr`` / ``tpr`` / ``roc_auc``
        keyed by class index plus ``"micro"`` and ``"macro"``) and
        ``per_class_metrics`` (unaveraged precision / recall / f1 arrays).
    """
    n_classes = len(class_names)
    labels = list(range(n_classes))

    # Overall metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=labels, average='weighted')
    recall = recall_score(y_true, y_pred, labels=labels, average='weighted')
    f1 = f1_score(y_true, y_pred, labels=labels, average='weighted')

    # Classification report
    clf_report = classification_report(y_true, y_pred, labels=labels,
                                       target_names=class_names, output_dict=True)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Multi-class ROC curves (one-vs-rest)
    y_true_bin = label_binarize(y_true, classes=labels)
    if n_classes == 2 and y_true_bin.shape[1] == 1:
        # label_binarize returns a single column for two classes; expand it
        # to one indicator column per class so that [:, i] works below.
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    # Per-class ROC curve and AUC
    # NOTE(review): a class without positive samples presumably yields an
    # undefined (NaN) curve from roc_curve — confirm if such classes can occur.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on a common FPR grid and
    # take the mean of the TPR values
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': clf_report,
        'confusion_matrix': cm,
        'roc_curves': {
            'fpr': fpr,
            'tpr': tpr,
            'roc_auc': roc_auc
        },
        'per_class_metrics': {
            'precision': precision_score(y_true, y_pred, labels=labels, average=None),
            'recall': recall_score(y_true, y_pred, labels=labels, average=None),
            'f1': f1_score(y_true, y_pred, labels=labels, average=None)
        }
    }

    return results

4. Performance Visualization

In [None]:
def plot_confusion_matrix(cm: np.ndarray, class_names: List[str], model_name: str, normalize: bool = True) -> None:
    """Draw the confusion matrix as an annotated heatmap.

    Args:
        cm: Square confusion matrix (rows = true labels, columns = predictions).
        class_names: Tick labels for both axes.
        model_name: Used in the figure title.
        normalize: When True each row is divided by its sum, so cells show the
            fraction of that true class; otherwise raw counts are shown.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any true samples sum to zero; leave them at 0 instead
        # of dividing by zero and filling the row with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        title = f'{model_name} - Normalized Confusion Matrix'
    else:
        title = f'{model_name} - Confusion Matrix'

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(title, fontsize=16, pad=20)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def plot_roc_curves(roc_data: Dict[str, Any], class_names: List[str], 
                   model_name: str) -> None:
    """Plot per-class, micro-average and macro-average ROC curves.

    Args:
        roc_data: Dict with ``fpr``, ``tpr`` and ``roc_auc`` entries, each
            keyed by class index plus ``"micro"`` and ``"macro"``.
        class_names: Legend label for every class, in index order.
        model_name: Used in the figure title.
    """
    fpr, tpr, roc_auc = roc_data['fpr'], roc_data['tpr'], roc_data['roc_auc']

    plt.figure(figsize=(12, 8))

    # One curve per class, coloured from the Set1 colormap
    palette = plt.cm.Set1(np.linspace(0, 1, len(class_names)))
    for class_idx, (class_label, curve_color) in enumerate(zip(class_names, palette)):
        plt.plot(fpr[class_idx], tpr[class_idx], color=curve_color, lw=2,
                 label=f'{class_label} (AUC = {roc_auc[class_idx]:.3f})')

    # Averaged curves, drawn as thick dotted lines
    averaged_curves = (
        ("micro", 'Micro-average', 'deeppink'),
        ("macro", 'Macro-average', 'navy'),
    )
    for key, legend_name, line_color in averaged_curves:
        plt.plot(fpr[key], tpr[key],
                 label=f'{legend_name} (AUC = {roc_auc[key]:.3f})',
                 color=line_color, linestyle=':', linewidth=4)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'{model_name} - ROC Curves', fontsize=16)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_precision_recall_curves(y_true: np.ndarray, y_pred_proba: np.ndarray,
                               class_names: List[str], model_name: str) -> None:
    """Plot one-vs-rest precision-recall curves for every class.

    Args:
        y_true: Ground-truth class indices.
        y_pred_proba: Per-class scores, indexed as ``y_pred_proba[:, i]`` for
            class ``i``.
        class_names: Legend label for every class, in index order.
        model_name: Used in the figure title.
    """
    n_classes = len(class_names)
    y_true_bin = label_binarize(y_true, classes=list(range(n_classes)))
    if n_classes == 2 and y_true_bin.shape[1] == 1:
        # label_binarize returns a single column for two classes; expand it
        # to one indicator column per class so that [:, i] works below.
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    plt.figure(figsize=(12, 8))

    colors = plt.cm.Set1(np.linspace(0, 1, n_classes))
    for i, color in enumerate(colors):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], y_pred_proba[:, i])
        # The value shown as "AP" is the trapezoidal area under the PR curve.
        ap = auc(recall, precision)
        plt.plot(recall, precision, color=color, lw=2,
                 label=f'{class_names[i]} (AP = {ap:.3f})')

    plt.xlabel('Recall', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title(f'{model_name} - Precision-Recall Curves', fontsize=16)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_per_class_metrics(metrics: Dict[str, np.ndarray], 
                          class_names: List[str], model_name: str) -> None:
    """Draw side-by-side bar charts of per-class precision, recall and F1.

    Args:
        metrics: Dict with ``precision``, ``recall`` and ``f1`` entries, each
            holding one value per class.
        class_names: Tick label for every class, in index order.
        model_name: Used in the figure title.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    n_classes = len(class_names)
    positions = range(n_classes)
    panels = (
        ('Precision', metrics['precision']),
        ('Recall', metrics['recall']),
        ('F1-Score', metrics['f1']),
    )

    for ax, (panel_title, scores) in zip(axes, panels):
        rects = ax.bar(positions, scores, color=plt.cm.Set1(range(n_classes)))
        ax.set_title(f'{panel_title} by Class', fontsize=14)
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel(panel_title, fontsize=12)
        ax.set_xticks(range(n_classes))
        ax.set_xticklabels(class_names, rotation=45)
        ax.set_ylim(0, 1.1)
        ax.grid(True, alpha=0.3, axis='y')

        # Write the numeric value just above each bar
        for rect, score in zip(rects, scores):
            x_center = rect.get_x() + rect.get_width()/2
            y_top = rect.get_height() + 0.01
            ax.text(x_center, y_top, f'{score:.3f}',
                    ha='center', va='bottom', fontsize=10)

    plt.suptitle(f'{model_name} - Per-Class Performance Metrics', fontsize=16)
    plt.tight_layout()
    plt.show()

5. Error Analysis

In [None]:
def analyze_errors(y_true: np.ndarray, y_pred: np.ndarray, 
                  confidence_scores: np.ndarray, x_test: np.ndarray,
                  class_names: List[str]) -> Dict[str, Any]:
    """Summarise where the predictions disagree with the ground truth.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        confidence_scores: One confidence value per sample.
        x_test: Test inputs (not used by this function; kept for interface
            compatibility).
        class_names: Class names, looked up by class index.

    Returns:
        A dict with:
          * ``total_errors`` (built-in ``int``) and ``error_rate`` (built-in
            ``float``), so both can be written with ``json.dump``,
          * ``error_indices`` / ``correct_indices`` (index arrays),
          * ``error_confidences`` / ``correct_confidences``,
          * ``misclassification_pairs``: list of dicts with ``true_class``,
            ``predicted_class``, ``count`` and ``percentage`` (share of all
            errors), sorted by ``count`` in descending order.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    confidence_scores = np.asarray(confidence_scores)

    errors_mask = (y_true != y_pred)
    correct_mask = (y_true == y_pred)

    # Error statistics
    error_indices = np.where(errors_mask)[0]
    correct_indices = np.where(correct_mask)[0]

    # NumPy integer scalars are not JSON serialisable, so convert the
    # aggregate numbers to built-in types here.
    total_errors = int(np.sum(errors_mask))

    error_analysis = {
        'total_errors': total_errors,
        'error_rate': float(np.mean(errors_mask)),
        'error_indices': error_indices,
        'correct_indices': correct_indices,
        'error_confidences': confidence_scores[errors_mask],
        'correct_confidences': confidence_scores[correct_mask],
        'misclassification_pairs': []
    }

    # Count every (true, predicted) combination among the errors
    error_pairs: Dict[Tuple[int, int], int] = {}
    for true_label, pred_label in zip(y_true[errors_mask], y_pred[errors_mask]):
        # int() makes the labels usable as list indices regardless of the
        # array dtype.
        pair = (int(true_label), int(pred_label))
        error_pairs[pair] = error_pairs.get(pair, 0) + 1

    # Convert to a readable list; this loop only runs when total_errors > 0,
    # so the division below is safe.
    for (true_idx, pred_idx), count in error_pairs.items():
        error_analysis['misclassification_pairs'].append({
            'true_class': class_names[true_idx],
            'predicted_class': class_names[pred_idx],
            'count': count,
            'percentage': count / total_errors * 100
        })

    # Most frequent confusion first (stable sort keeps first-seen order on ties)
    error_analysis['misclassification_pairs'].sort(key=lambda x: x['count'], reverse=True)

    return error_analysis

def plot_error_analysis(error_analysis: Dict[str, Any], class_names: List[str],
                       model_name: str) -> None:
    """Plot confidence histograms and the most frequent confusions.

    The left panel overlays the confidence histograms of correct and wrong
    predictions; the right panel shows the ten most frequent
    (true → predicted) confusions. A text summary is printed afterwards.

    Args:
        error_analysis: Dict with ``correct_confidences``,
            ``error_confidences``, ``misclassification_pairs``,
            ``total_errors`` and ``error_rate`` entries.
        class_names: Class names (not used by this function).
        model_name: Used in the figure title.
    """
    fig, (hist_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: confidence distribution of correct vs. wrong predictions
    histogram_specs = (
        ('correct_confidences', 'Correct Predictions', 'green'),
        ('error_confidences', 'Wrong Predictions', 'red'),
    )
    for data_key, legend_label, fill_color in histogram_specs:
        hist_ax.hist(error_analysis[data_key], bins=30, alpha=0.7,
                     label=legend_label, color=fill_color)
    hist_ax.set_xlabel('Confidence Score', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Confidence Distribution: Correct vs Wrong', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Right panel: the most frequent confusions
    top_errors = error_analysis['misclassification_pairs'][:10]
    error_labels = []
    error_counts = []
    for entry in top_errors:
        error_labels.append(f"{entry['true_class']}→{entry['predicted_class']}")
        error_counts.append(entry['count'])

    bar_ax.barh(error_labels, error_counts, color='coral')
    bar_ax.set_xlabel('Error Count', fontsize=12)
    bar_ax.set_title('Top 10 Misclassification Patterns', fontsize=14)
    bar_ax.grid(True, alpha=0.3, axis='x')

    plt.suptitle(f'{model_name} - Error Analysis', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Text summary
    print(f"Total errors: {error_analysis['total_errors']}")
    print(f"Error rate: {error_analysis['error_rate']:.3f}")
    print(f"Average confidence (correct): {np.mean(error_analysis['correct_confidences']):.3f}")
    print(f"Average confidence (errors): {np.mean(error_analysis['error_confidences']):.3f}")

    print("\nTop misclassification patterns:")
    for rank, entry in enumerate(top_errors, 1):
        print(f"{rank:2d}. {entry['true_class']:>12} → {entry['predicted_class']:<12} "
              f"({entry['count']:3d} times, {entry['percentage']:5.1f}%)")

def visualize_misclassified_examples(x_test: np.ndarray, y_true: np.ndarray,
                                   y_pred: np.ndarray, confidence_scores: np.ndarray,
                                   class_names: List[str], model_name: str,
                                   num_examples: int = 10) -> None:
    """Show a grid with the first misclassified test images.

    Each tile is titled with the true class, the predicted class and the
    prediction confidence.

    Args:
        x_test: Test images, indexed by sample along the first axis.
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        confidence_scores: One confidence value per sample.
        class_names: Class names, looked up by class index.
        model_name: Used in the figure title.
        num_examples: Maximum number of images to show.
    """
    errors_mask = (y_true != y_pred)
    error_indices = np.where(errors_mask)[0]

    if len(error_indices) == 0:
        print("No misclassified examples found!")
        return

    # Pick the first few misclassified samples
    selected_indices = error_indices[:min(num_examples, len(error_indices))]
    if len(selected_indices) == 0:
        # Nothing to draw (e.g. num_examples == 0); avoid a zero-height figure.
        return

    # Float images whose values do not exceed 1.0 are treated as scaled to
    # [0, 1]: casting those to uint8 would turn every pixel into 0 (black).
    # NOTE(review): the pixel range of x_test is not visible here — confirm.
    sample = x_test[selected_indices]
    unit_range_float = bool(np.issubdtype(sample.dtype, np.floating)
                            and sample.max() <= 1.0)

    # Grid layout
    n_cols = 5
    n_rows = (len(selected_indices) + n_cols - 1) // n_cols

    plt.figure(figsize=(15, 3 * n_rows))

    for i, idx in enumerate(selected_indices):
        image = x_test[idx]
        if unit_range_float:
            image = np.clip(image, 0.0, 1.0)
        else:
            image = image.astype('uint8')
        if image.ndim == 3 and image.shape[-1] == 1:
            # imshow does not accept (H, W, 1); drop the single channel axis.
            image = image[..., 0]

        plt.subplot(n_rows, n_cols, i + 1)
        # Use a gray colormap for single-channel images (ignored for RGB data).
        plt.imshow(image, cmap='gray' if image.ndim == 2 else None)
        plt.title(f'True: {class_names[y_true[idx]]}\nPred: {class_names[y_pred[idx]]}\nConf: {confidence_scores[idx]:.3f}', 
                 fontsize=10)
        plt.axis('off')

    plt.suptitle(f'{model_name} - Misclassified Examples', fontsize=16)
    plt.tight_layout()
    plt.show()

6. Model Comparison

In [None]:
def compare_models_performance(models_performance: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
    """Collect the headline metrics of several models into one table.

    Args:
        models_performance: Maps a model name to its performance dict, which
            must provide ``accuracy``, ``precision``, ``recall``,
            ``f1_score`` and ``roc_curves['roc_auc']['micro' / 'macro']``.

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score``, ``Micro AUC``
        and ``Macro AUC``.
    """
    def _as_row(name: str, perf: Dict[str, Any]) -> Dict[str, Any]:
        auc_values = perf['roc_curves']['roc_auc']
        return {
            'Model': name,
            'Accuracy': perf['accuracy'],
            'Precision': perf['precision'],
            'Recall': perf['recall'],
            'F1-Score': perf['f1_score'],
            'Micro AUC': auc_values['micro'],
            'Macro AUC': auc_values['macro'],
        }

    rows = [_as_row(name, perf) for name, perf in models_performance.items()]
    return pd.DataFrame(rows)

def plot_model_comparison(comparison_df: pd.DataFrame) -> None:
    """Draw one bar chart per metric comparing all models, then print the table.

    Args:
        comparison_df: Table with a ``Model`` column and the metric columns
            ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score``,
            ``Micro AUC`` and ``Macro AUC``.
    """
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Micro AUC', 'Macro AUC']

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Walk the 2x3 grid row by row, one metric per panel
    for ax, metric in zip(axes.flatten(), metrics):
        scores = comparison_df[metric]
        rects = ax.bar(comparison_df['Model'], scores,
                       color=plt.cm.Set1(range(len(comparison_df))))
        ax.set_title(f'{metric} Comparison', fontsize=14)
        ax.set_ylabel(metric, fontsize=12)
        ax.tick_params(axis='x', rotation=45)
        ax.set_ylim(0, 1.1)
        ax.grid(True, alpha=0.3, axis='y')

        # Write the numeric value just above each bar
        for rect, score in zip(rects, scores):
            x_center = rect.get_x() + rect.get_width()/2
            y_top = rect.get_height() + 0.01
            ax.text(x_center, y_top, f'{score:.3f}',
                    ha='center', va='bottom', fontsize=10)

    plt.suptitle('Model Performance Comparison', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Print the comparison table
    print("Model Performance Comparison:")
    print("=" * 80)
    print(comparison_df.round(4))

7. Comprehensive Evaluation Pipeline

In [None]:
def evaluate_single_model(model_name: str, model_path: str, 
                         x_test: np.ndarray, y_test: np.ndarray,
                         class_names: List[str]) -> Dict[str, Any]:
    """Run the full evaluation workflow for one model.

    Loads the model and its predictions, computes metrics and the error
    analysis, shows every plot and prints the classification report.

    Args:
        model_name: Model identifier passed to ``load_model_predictions``.
        model_path: Directory containing the saved model file.
        x_test: Test inputs.
        y_test: Ground-truth class indices.
        class_names: Class names, in label order.

    Returns:
        A dict with ``model_name``, ``predictions``, ``performance`` and
        ``error_analysis``; ``None`` when the predictions could not be
        obtained.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating {model_name}")
    print(f"{banner}")

    # Load the model and predict
    predictions = load_model_predictions(model_path, model_name, x_test, class_names)
    if predictions is None:
        return None

    predicted_labels = predictions['y_pred']
    predicted_scores = predictions['y_pred_proba']

    # Metrics
    performance = evaluate_model_performance(y_test, predicted_labels,
                                             predicted_scores, class_names)

    # Error analysis
    confidences = predictions['confidence_scores']
    error_analysis = analyze_errors(y_test, predicted_labels, confidences,
                                    x_test, class_names)

    # Plots
    plot_confusion_matrix(performance['confusion_matrix'], class_names, model_name)
    plot_roc_curves(performance['roc_curves'], class_names, model_name)
    plot_precision_recall_curves(y_test, predicted_scores, class_names, model_name)
    plot_per_class_metrics(performance['per_class_metrics'], class_names, model_name)
    plot_error_analysis(error_analysis, class_names, model_name)
    visualize_misclassified_examples(x_test, y_test, predicted_labels,
                                     confidences, class_names, model_name)

    # Text report
    print("\nDetailed Classification Report:")
    print("-" * 50)
    print(classification_report(y_test, predicted_labels, target_names=class_names))

    # Bundle everything for the caller
    return {
        'model_name': model_name,
        'predictions': predictions,
        'performance': performance,
        'error_analysis': error_analysis
    }

# Evaluate every available model; models whose evaluation returns None
# are left out of models_evaluation.
available_models = ['simple_cnn', 'advanced_cnn', 'lightweight_cnn', 'vgg16_transfer']
models_evaluation = {}

for model_name in available_models:
    results = evaluate_single_model(model_name, '../models', x_test, y_test, CLASS_NAMES)
    if results is not None:
        models_evaluation[model_name] = results


8. Final Results Comparison

In [None]:
# Compare all evaluated models.
# NOTE: comparison_df is only defined when at least one model was evaluated.
if models_evaluation:
    performance_comparison = {}
    for model_name, evaluation in models_evaluation.items():
        performance_comparison[model_name] = evaluation['performance']
    
    comparison_df = compare_models_performance(performance_comparison)
    plot_model_comparison(comparison_df)
    
    # Pick the model with the highest accuracy
    best_model = comparison_df.loc[comparison_df['Accuracy'].idxmax()]
    print(f"   BEST PERFORMING MODEL: {best_model['Model']}")
    print(f"   Accuracy: {best_model['Accuracy']:.4f}")
    print(f"   F1-Score: {best_model['F1-Score']:.4f}")
    print(f"   Macro AUC: {best_model['Macro AUC']:.4f}")

9. Save Evaluation Results

In [None]:
def _json_default(value: Any) -> Any:
    """Convert NumPy scalars and arrays to built-in types for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_evaluation_results(models_evaluation: Dict[str, Any], 
                           comparison_df: pd.DataFrame,
                           output_dir: str = '../results') -> None:
    """Write the evaluation results to *output_dir*.

    Files written:
      * ``model_comparison.csv`` — the comparison table,
      * ``evaluation_summary.json`` — key metrics and the five most frequent
        misclassifications per model,
      * ``<model_name>_classification_report.csv`` — one report per model.

    Args:
        models_evaluation: Maps a model name to its evaluation dict, which
            must provide ``performance`` and ``error_analysis`` entries.
        comparison_df: Model comparison table.
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Comparison table
    comparison_df.to_csv(os.path.join(output_dir, 'model_comparison.csv'), index=False)

    # Per-model summary: key figures only, no large arrays
    evaluation_summary = {}
    for model_name, evaluation in models_evaluation.items():
        performance = evaluation['performance']
        roc_auc = performance['roc_curves']['roc_auc']
        error_analysis = evaluation['error_analysis']
        evaluation_summary[model_name] = {
            'accuracy': performance['accuracy'],
            'precision': performance['precision'],
            'recall': performance['recall'],
            'f1_score': performance['f1_score'],
            'micro_auc': roc_auc['micro'],
            'macro_auc': roc_auc['macro'],
            'error_rate': error_analysis['error_rate'],
            'total_errors': error_analysis['total_errors'],
            'top_misclassifications': error_analysis['misclassification_pairs'][:5]
        }

    with open(os.path.join(output_dir, 'evaluation_summary.json'), 'w', encoding='utf-8') as f:
        # The metrics may be NumPy scalars (e.g. an int64 error count), which
        # the json module cannot encode on its own.
        json.dump(evaluation_summary, f, indent=2, default=_json_default)

    # Classification reports
    for model_name, evaluation in models_evaluation.items():
        clf_report = evaluation['performance']['classification_report']
        report_df = pd.DataFrame(clf_report).transpose()
        report_df.to_csv(os.path.join(output_dir, f'{model_name}_classification_report.csv'))

    print(f"Evaluation results saved to {output_dir}")

# Save the results (only when at least one model was evaluated)
if models_evaluation:
    save_evaluation_results(models_evaluation, comparison_df)


10. Final Summary Report

In [None]:
def generate_final_summary(models_evaluation: Dict[str, Any],
                          comparison_df: pd.DataFrame) -> None:
    """Print the final evaluation summary.

    Reports the number of evaluated models, the test-set size, the number of
    classes, the best model by accuracy, the accuracy / F1 ranges and, per
    model, the error rate with its most frequent confusion.

    Reads the module-level names ``x_test`` and ``NUM_CLASSES``.

    Args:
        models_evaluation: Maps a model name to its evaluation dict, which
            must provide an ``error_analysis`` entry.
        comparison_df: Model comparison table; the model-specific sections
            are skipped when it is empty.
    """
    print("=" * 70)
    print("FINAL MODEL EVALUATION SUMMARY REPORT")
    print("=" * 70)

    print(f"Number of models evaluated: {len(models_evaluation)}")
    print(f"Test dataset size: {len(x_test)} samples")
    print(f"Number of classes: {NUM_CLASSES}")

    if not comparison_df.empty:
        best_model = comparison_df.loc[comparison_df['Accuracy'].idxmax()]

        print(f"\n🏆 BEST MODEL: {best_model['Model']}")
        print(f"   Accuracy: {best_model['Accuracy']:.4f}")
        print(f"   F1-Score: {best_model['F1-Score']:.4f}")
        print(f"   Macro AUC: {best_model['Macro AUC']:.4f}")

        print(f"\n📊 Performance Range:")
        print(f"   Accuracy: {comparison_df['Accuracy'].min():.4f} - {comparison_df['Accuracy'].max():.4f}")
        print(f"   F1-Score: {comparison_df['F1-Score'].min():.4f} - {comparison_df['F1-Score'].max():.4f}")

        print(f"\n🔍 Key Insights:")
        for model_name, evaluation in models_evaluation.items():
            error_rate = evaluation['error_analysis']['error_rate']
            pairs = evaluation['error_analysis']['misclassification_pairs']
            if pairs:
                top_error = pairs[0]
                print(f"   {model_name}: Error rate {error_rate:.3f}, "
                      f"Most common error: {top_error['true_class']}→{top_error['predicted_class']}")
            else:
                # A model without any misclassification has no "top error";
                # indexing [0] here would raise IndexError.
                print(f"   {model_name}: Error rate {error_rate:.3f}, "
                      f"no misclassifications")

    print(f"\n💾 Results saved to: ../results/")
    print("=" * 70)

# Generate the final summary. `comparison_df` is only created when at least
# one model was evaluated, so guard the call to avoid a NameError (and a
# misleading success message) when no model could be evaluated.
if models_evaluation:
    generate_final_summary(models_evaluation, comparison_df)

    print("Model evaluation completed successfully!")
else:
    print("No models were evaluated; skipping the final summary.")