In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from tensorflow.keras.models import load_model

from model_maker import load_data_from_folder  # For PCP data
from robust_model_maker import load_data_from_folder as load_robust_data  # For robust features

In [None]:
def load_and_preprocess_pcp_data(folder):
    """
    Load the PCP dataset stored under ``folder`` and return it as ``(X, y)``.

    ``load_data_from_folder`` yields its two arrays in the opposite order
    (targets first, inputs second), so the pair is flipped before being
    returned. No other preprocessing happens here.

    Expected, but not checked by this function: X carries 12 PCP features
    per sample and y covers 24 chord classes.
    """
    targets, features = load_data_from_folder(folder)
    return features, targets

def load_and_preprocess_robust_data(folder):
    """
    Load the robust-feature dataset stored under ``folder`` and return it as ``(X, y)``.

    ``load_robust_data`` yields its two arrays in the opposite order to the
    one this notebook uses, so the pair is flipped before being returned.
    No other preprocessing happens here.

    Expected, but not checked by this function: X carries 45 robust features
    per sample and y covers 24 chord classes.
    """
    targets, features = load_robust_data(folder)
    return features, targets

In [None]:
# The 24 chord names: each of the 12 roots (C upwards, sharps only) as major
# then minor. Position i in this list names column i of the one-hot targets
# and of the model outputs used by calculate_metrics.
chord_list = [
    root + quality
    for root in 'C C# D D# E F F# G G# A A# B'.split()
    for quality in ('maj', 'min')
]

# PCP test set: feature matrix plus the targets shared by the PCP evaluation.
print("\nLoading PCP test data...")
X_test_pcp, y_test = load_and_preprocess_pcp_data(folder="extracted_pcp_annotations_12_bin")
print(f"PCP data shapes - X: {X_test_pcp.shape}, y: {y_test.shape}")

In [None]:
# Robust-feature test set; kept separate from the PCP arrays because the
# feature matrices come from a different extraction folder.
print("\nLoading Robust test data...")
X_test_robust, y_test_robust = load_and_preprocess_robust_data(
    folder="extracted_robust_45_annotations"
)
print(f"Robust data shapes - X: {X_test_robust.shape}, y: {y_test_robust.shape}")

In [None]:
# Trained PCP classifier, read from a Keras HDF5 checkpoint in the working directory.
pcp_model_path = "pcpmodel_1000.h5"
print("\nLoading PCP model...")
pcp_model = load_model(pcp_model_path)

In [None]:
def calculate_metrics(y_true, y_pred, chord_list):
    """
    Calculate comprehensive metrics including AUC for model evaluation.

    Parameters
    ----------
    y_true : array-like, shape (n_samples, n_classes)
        Indicator (one-hot) ground truth; column i corresponds to
        ``chord_list[i]``.
    y_pred : array-like, shape (n_samples, n_classes)
        Per-class scores produced by the model, same column order.
    chord_list : sequence of str
        Chord names, indexed by class id.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall``, ``f1`` (support-weighted),
        ``auc_micro``, ``auc_macro``, ``per_chord_auc`` (chord -> AUC),
        ``confusion_matrix`` (n_classes x n_classes, rows = true class) and
        ``chord_performance`` (chord -> precision/recall/f1/auc).

    Notes
    -----
    Per-class results are always reported for every class id in
    ``range(n_classes)``, even when a chord never occurs in this test set.
    AUC is undefined for a chord whose ground-truth column contains a single
    value; such chords get ``nan`` and are left out of the macro average.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Hard class decisions, computed once and reused by every label-based metric.
    true_idx = y_true.argmax(axis=1)
    pred_idx = y_pred.argmax(axis=1)
    class_ids = np.arange(y_true.shape[1])

    # Aggregate metrics. Classes with no true samples have zero support and
    # therefore zero weight, so no explicit label list is needed here.
    accuracy = accuracy_score(true_idx, pred_idx)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_idx,
        pred_idx,
        average='weighted'
    )

    # Micro-average AUC: every element of the indicator matrix is treated as
    # one binary decision.
    auc_micro = roc_auc_score(y_true, y_pred, average='micro')

    # One-vs-rest AUC per class. A column needs both positives and negatives
    # for ROC AUC to be defined; otherwise record nan instead of failing.
    has_both = np.array(
        [np.unique(y_true[:, j]).size > 1 for j in class_ids], dtype=bool
    )
    per_class_auc = np.full(len(class_ids), np.nan)
    for j in np.flatnonzero(has_both):
        per_class_auc[j] = roc_auc_score(y_true[:, j], y_pred[:, j])

    # Macro-average AUC: unweighted mean over classes. Use sklearn directly
    # when every class is defined; otherwise average only the defined ones.
    if has_both.all():
        auc_macro = roc_auc_score(y_true, y_pred, average='macro')
    elif has_both.any():
        auc_macro = float(np.mean(per_class_auc[has_both]))
    else:
        auc_macro = float('nan')

    per_chord_auc = {
        chord: float(per_class_auc[i]) for i, chord in enumerate(chord_list)
    }

    # Per-class metrics and confusion matrix. Without ``labels`` sklearn only
    # reports classes that actually occur, which would shift every index after
    # a missing chord; pinning the label set keeps position i == chord i.
    class_precision, class_recall, class_f1, _ = precision_recall_fscore_support(
        true_idx,
        pred_idx,
        labels=class_ids
    )
    conf_matrix = confusion_matrix(true_idx, pred_idx, labels=class_ids)

    # Per-chord performance dictionary
    chord_performance = {}
    for i, chord in enumerate(chord_list):
        chord_performance[chord] = {
            'precision': class_precision[i],
            'recall': class_recall[i],
            'f1': class_f1[i],
            'auc': per_chord_auc[chord]
        }

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc_micro': auc_micro,
        'auc_macro': auc_macro,
        'per_chord_auc': per_chord_auc,
        'confusion_matrix': conf_matrix,
        'chord_performance': chord_performance
    }

def plot_auc_comparison(pcp_metrics, robust_metrics, chord_list, save_path):
    """
    Draw a grouped bar chart of per-chord AUC for the two models and save it.

    Both metrics dicts must provide ``'per_chord_auc'`` with an entry for
    every name in ``chord_list``. The figure is written to ``save_path`` and
    closed afterwards; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(15, 6))

    positions = np.arange(len(chord_list))
    bar_width = 0.35

    pcp_heights = [pcp_metrics['per_chord_auc'][name] for name in chord_list]
    robust_heights = [robust_metrics['per_chord_auc'][name] for name in chord_list]

    # Two bars per chord, centred on the tick: PCP on the left, Robust on the right.
    ax.bar(positions - bar_width/2, pcp_heights, bar_width, label='PCP Model')
    ax.bar(positions + bar_width/2, robust_heights, bar_width, label='Robust Model')

    ax.set_ylabel('AUC Score')
    ax.set_title('Per-Chord AUC Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(chord_list, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

def measure_inference_time(model, X_test, batch_sizes=(1, 8, 16, 32)):
    """
    Measure per-call inference latency of ``model.predict`` for several batch sizes.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch, verbose=0)``.
    X_test : sliceable sequence / array
        Samples to draw consecutive, non-overlapping batches from.
    batch_sizes : iterable of int
        Batch sizes to time. Any iterable is accepted (list or tuple).

    Returns
    -------
    dict
        ``{batch_size: {'mean_time', 'std_time', 'min_time', 'max_time'}}``
        in seconds per ``predict`` call. If ``X_test`` holds fewer samples
        than a batch size, no full batch can be formed and all four
        statistics for that size are ``nan``.
    """
    stat_names = ('mean_time', 'std_time', 'min_time', 'max_time')
    timing_results = {}

    for batch_size in batch_sizes:
        # At most 100 timed calls, and only full batches.
        num_batches = min(100, len(X_test) // batch_size)

        if num_batches == 0:
            # Not enough data for a single batch: report "no measurement"
            # instead of letting np.min/np.max fail on an empty list.
            timing_results[batch_size] = dict.fromkeys(stat_names, float('nan'))
            continue

        times = []
        for i in range(num_batches):
            batch = X_test[i*batch_size:(i+1)*batch_size]
            # perf_counter is the monotonic, high-resolution clock meant for
            # timing short intervals; time.time() can jump with clock updates.
            start_time = time.perf_counter()
            model.predict(batch, verbose=0)
            times.append(time.perf_counter() - start_time)

        timing_results[batch_size] = {
            'mean_time': np.mean(times),
            'std_time': np.std(times),
            'min_time': np.min(times),
            'max_time': np.max(times)
        }

    return timing_results

def plot_confusion_matrix(conf_matrix, chord_list, title, save_path):
    """
    Render a confusion matrix as an annotated heatmap and save it.

    ``chord_list`` labels both axes (columns = predicted, rows = true),
    ``title`` is appended to the figure title, and the image is written to
    ``save_path``. The figure is closed afterwards; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(20, 20))

    # Integer cell annotations on a blue colour scale.
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=chord_list,
        yticklabels=chord_list,
        ax=ax,
    )

    ax.set_title(f'Confusion Matrix - {title}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    # Tilt the existing tick labels on both axes.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    for tick_label in ax.get_yticklabels():
        tick_label.set_rotation(45)

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

def plot_performance_comparison(pcp_metrics, robust_metrics, save_path):
    """
    Draw a grouped bar chart of the headline scores for both models and save it.

    Reads ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'`` from each
    metrics dict, writes the figure to ``save_path`` and closes it. Nothing is
    returned.
    """
    score_names = ['accuracy', 'precision', 'recall', 'f1']
    pcp_scores = [pcp_metrics[name] for name in score_names]
    robust_scores = [robust_metrics[name] for name in score_names]

    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(score_names))
    bar_width = 0.35

    # Two bars per metric, centred on the tick: PCP on the left, Robust on the right.
    ax.bar(positions - bar_width/2, pcp_scores, bar_width, label='PCP Model')
    ax.bar(positions + bar_width/2, robust_scores, bar_width, label='Robust Model')

    ax.set_ylabel('Score')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(score_names)
    ax.legend()

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

In [None]:
# Score the PCP model on its test set, then time its predict() calls.
print("\nEvaluating PCP Model...")
pcp_predictions = pcp_model.predict(X_test_pcp, verbose=0)
pcp_metrics = calculate_metrics(
    y_true=y_test,
    y_pred=pcp_predictions,
    chord_list=chord_list,
)
pcp_timing = measure_inference_time(model=pcp_model, X_test=X_test_pcp)

In [None]:
# Trained robust-feature classifier, read from a Keras HDF5 checkpoint in the working directory.
robust_model_path = "robust_extraction_1000files.h5"
print("\nLoading Robust model...")
robust_model = load_model(robust_model_path)

In [None]:
# Score the robust-feature model on its test set, then time its predict() calls.
print("\nEvaluating Robust Model...")
robust_predictions = robust_model.predict(X_test_robust, verbose=0)
robust_metrics = calculate_metrics(
    y_true=y_test_robust,
    y_pred=robust_predictions,
    chord_list=chord_list,
)
robust_timing = measure_inference_time(model=robust_model, X_test=X_test_robust)

In [None]:
# Every figure below is written under evals/. savefig does not create missing
# directories, so make sure the folder exists before the first save; this
# keeps a Restart & Run All from failing on a fresh checkout.
os.makedirs('evals', exist_ok=True)

plot_confusion_matrix(pcp_metrics['confusion_matrix'],
                      chord_list, 'PCP Model', 'evals/pcp_confusion.png')
plot_confusion_matrix(robust_metrics['confusion_matrix'],
                      chord_list, 'Robust Model', 'evals/robust_confusion.png')
plot_performance_comparison(pcp_metrics, robust_metrics, 'evals/model_comparison.png')
plot_auc_comparison(pcp_metrics, robust_metrics, chord_list, 'evals/auc_comparison.png')

In [None]:
print("\n=== Model Comparison Results ===")

# Side-by-side printout of the headline metrics, four decimals each.
print("\nBasic Metrics:")
metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc_micro', 'auc_macro']
for metric in metrics:
    print(f"\n{metric.title()}:")
    print(f"PCP Model: {pcp_metrics[metric]:.4f}")
    print(f"Robust Model: {robust_metrics[metric]:.4f}")

# Summary table: one row per model, one column per metric (AUC included).
# Each pair is (CSV column header, key in the metrics dicts); order fixes the
# column order of the CSV.
csv_columns = [
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1'),
    ('AUC (Micro)', 'auc_micro'),
    ('AUC (Macro)', 'auc_macro'),
]
results = {'Model': ['PCP Model', 'Robust Model']}
for column, key in csv_columns:
    results[column] = [pcp_metrics[key], robust_metrics[key]]

# Mean latency per batch size, converted from seconds to milliseconds.
# Batch sizes are taken from the PCP timings; the robust timings are looked
# up under the same keys.
for batch_size, pcp_stats in pcp_timing.items():
    results[f'Inference Time (ms) - Batch {batch_size}'] = [
        pcp_stats['mean_time']*1000,
        robust_timing[batch_size]['mean_time']*1000
    ]

# Save to CSV
pd.DataFrame(results).to_csv('evals/model_comparison_results.csv', index=False)
    

In [None]:

# Per-chord table: one row per chord with precision, recall, F1 and AUC for
# the PCP model followed by the same four columns for the Robust model.
chord_results = []
for chord in chord_list:
    row = {'Chord': chord}
    for prefix, model_metrics in (('PCP', pcp_metrics), ('Robust', robust_metrics)):
        chord_scores = model_metrics['chord_performance'][chord]
        row[f'{prefix}_Precision'] = chord_scores['precision']
        row[f'{prefix}_Recall'] = chord_scores['recall']
        row[f'{prefix}_F1'] = chord_scores['f1']
        row[f'{prefix}_AUC'] = chord_scores['auc']
    chord_results.append(row)

pd.DataFrame(chord_results).to_csv('evals/per_chord_performance.csv', index=False)