In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
from model_maker import load_data_from_folder  # For PCP data
from robust_model_maker import load_data_from_folder as load_robust_data  # For robust features
import tensorflow as tf
import pandas as pd
import time
from sklearn.metrics import roc_auc_score
import os
from datetime import datetime

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

2024-11-24 20:38:28.179172: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-24 20:38:28.179266: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-24 20:38:28.180388: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 20:38:28.187618: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define model configurations
MODEL_CONFIGS = {
    'pcpmodel_70_30_split.h5': {'type': 'pcp', 'split': '70_30'},
    'pcpmodel_80_20_split.h5': {'type': 'pcp', 'split': '80_20'},
    'pcpmodel_90_10_split.h5': {'type': 'pcp', 'split': '90_10'},
    'robust_model_70_30_split.h5': {'type': 'robust', 'split': '70_30'},
    'robust_model_80_20_split.h5': {'type': 'robust', 'split': '80_20'},
    'robust_model_90_10_split.h5': {'type': 'robust', 'split': '90_10'}
}

# Chord list for reference
CHORD_LIST = ['Cmaj', 'Cmin', 'C#maj', 'C#min', 'Dmaj', 'Dmin', 'D#maj', 'D#min', 
              'Emaj', 'Emin', 'Fmaj', 'Fmin', 'F#maj', 'F#min', 'Gmaj', 'Gmin', 
              'G#maj', 'G#min', 'Amaj', 'Amin', 'A#maj', 'A#min', 'Bmaj', 'Bmin']

# Create output directory for evaluation results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f'evaluation_results_{timestamp}'
os.makedirs(output_dir, exist_ok=True)

In [3]:
def load_and_preprocess_data(model_type, split):
    """Load appropriate test data based on model type and split"""
    
    if model_type == 'pcp':
        folder = f"extracted_pcp_annotations_12_bin"
        X, y = load_data_from_folder(folder)
        return y, X  # Swap to match expected format
    else:  # robust
        folder = f"extracted_robust_45_annotations"
        X, y = load_robust_data(folder)
        return y,X

def calculate_metrics(y_true, y_pred, model_name):
    """Calculate comprehensive metrics for model evaluation"""
    accuracy = accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true.argmax(axis=1), 
        y_pred.argmax(axis=1), 
        average='weighted'
    )
    
    # ROC AUC scores
    auc_micro = roc_auc_score(y_true, y_pred, average='micro')
    auc_macro = roc_auc_score(y_true, y_pred, average='macro')
    
    # Per-chord AUC
    per_chord_auc = {}
    for i, chord in enumerate(CHORD_LIST):
        per_chord_auc[chord] = roc_auc_score(y_true[:, i], y_pred[:, i])
    
    # Confusion matrix
    conf_matrix = confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    
    return {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc_micro': auc_micro,
        'auc_macro': auc_macro,
        'per_chord_auc': per_chord_auc,
        'confusion_matrix': conf_matrix
    }

def measure_inference_time(model, X_test, batch_sizes=[1, 8, 16, 32]):
    """Measure inference time across different batch sizes"""
    timing_results = {}
    
    for batch_size in batch_sizes:
        times = []
        num_batches = min(100, len(X_test) // batch_size)
        
        for i in range(num_batches):
            batch = X_test[i*batch_size:(i+1)*batch_size]
            start_time = time.time()
            model.predict(batch, verbose=0)
            end_time = time.time()
            times.append(end_time - start_time)
        
        timing_results[batch_size] = {
            'mean_time': np.mean(times),
            'std_time': np.std(times),
            'min_time': np.min(times),
            'max_time': np.max(times)
        }
    
    return timing_results

In [4]:
def plot_confusion_matrix(conf_matrix, model_name, output_dir):
    """Plot and save confusion matrix heatmap"""
    plt.figure(figsize=(20, 20))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=CHORD_LIST, yticklabels=CHORD_LIST)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/{model_name}_confusion_matrix.png')
    plt.close()

def plot_auc_comparison(metrics_dict, output_dir):
    """Plot AUC comparison across all models"""
    plt.figure(figsize=(15, 8))
    
    models = list(metrics_dict.keys())
    x = np.arange(len(CHORD_LIST))
    width = 0.8 / len(models)
    
    for i, (model_name, metrics) in enumerate(metrics_dict.items()):
        aucs = [metrics['per_chord_auc'][chord] for chord in CHORD_LIST]
        plt.bar(x + i*width - width*len(models)/2, aucs, width, label=model_name)
    
    plt.ylabel('AUC Score')
    plt.title('Per-Chord AUC Comparison Across Models')
    plt.xticks(x, CHORD_LIST, rotation=45, ha='right')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/auc_comparison.png')
    plt.close()

In [5]:
# Debug cell - Run this first to verify data loading
for model_type in ['pcp', 'robust']:
    for split in ['70_30', '80_20', '90_10']:
        print(f"\nTesting {model_type} data loading for split {split}:")
        X, y = load_and_preprocess_data(model_type, split)
        if X is not None:
            print(f"X shape: {X.shape}")
            print(f"y shape: {y.shape}")


Testing pcp data loading for split 70_30:
X shape: (272279, 12)
y shape: (272279, 24)

Testing pcp data loading for split 80_20:
X shape: (272279, 12)
y shape: (272279, 24)

Testing pcp data loading for split 90_10:
X shape: (272279, 12)
y shape: (272279, 24)

Testing robust data loading for split 70_30:
X shape: (286913, 46)
y shape: (286913, 24)

Testing robust data loading for split 80_20:
X shape: (286913, 46)
y shape: (286913, 24)

Testing robust data loading for split 90_10:
X shape: (286913, 46)
y shape: (286913, 24)


In [6]:
# Main evaluation loop
all_metrics = {}
all_timing = {}

# Helper function to create model architecture
def create_model_architecture(model_type):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, InputLayer
    
    if model_type == 'pcp':
        model = Sequential([
            InputLayer(input_shape=(12,)),
            Dense(24, activation='relu'),
            BatchNormalization(),
            Dense(48, activation='relu'),
            BatchNormalization(),
            Dropout(0.2),
            Dense(48, activation='relu'),
            BatchNormalization(),
            Dropout(0.2),
            Dense(24, activation='sigmoid')
        ])
    else:  # robust model
        model = Sequential([
        InputLayer(input_shape=(46,)),
        Dense(64, activation='elu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(96, activation='elu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='elu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='elu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(24, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Evaluate each model
for model_name, config in MODEL_CONFIGS.items():
    print(f"\nEvaluating {model_name}...")
    
    try:
        # First try loading the model directly
        model_path = f"models/{model_name}"
        try:
            model = load_model(model_path)
        except:
            # If direct loading fails, create model and load weights
            print(f"Direct model loading failed for {model_name}, trying weights loading...")
            model = create_model_architecture(config['type'])
            model.load_weights(model_path)
    
        # Load appropriate test data
        X_test, y_test = load_and_preprocess_data(config['type'], config['split'])
        
        # Make predictions
        predictions = model.predict(X_test, verbose=0)
        
        # Calculate metrics
        metrics = calculate_metrics(y_test, predictions, model_name)
        all_metrics[model_name] = metrics
        
        # Measure inference time
        timing = measure_inference_time(model, X_test)
        all_timing[model_name] = timing
        
        # Plot confusion matrix
        plot_confusion_matrix(metrics['confusion_matrix'], model_name, output_dir)
        
        print(f"Completed evaluation for {model_name}")
        
    except Exception as e:
        print(f"Error evaluating {model_name}: {str(e)}")
        continue


Evaluating pcpmodel_70_30_split.h5...


2024-11-24 20:39:14.174427: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-24 20:39:14.200110: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-24 20:39:14.200179: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-24 20:39:14.203829: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-24 20:39:14.203943: I external/local_xla/xla/stream_executor

Completed evaluation for pcpmodel_70_30_split.h5

Evaluating pcpmodel_80_20_split.h5...
Direct model loading failed for pcpmodel_80_20_split.h5, trying weights loading...
Completed evaluation for pcpmodel_80_20_split.h5

Evaluating pcpmodel_90_10_split.h5...
Completed evaluation for pcpmodel_90_10_split.h5

Evaluating robust_model_70_30_split.h5...
Direct model loading failed for robust_model_70_30_split.h5, trying weights loading...
Completed evaluation for robust_model_70_30_split.h5

Evaluating robust_model_80_20_split.h5...
Direct model loading failed for robust_model_80_20_split.h5, trying weights loading...
Completed evaluation for robust_model_80_20_split.h5

Evaluating robust_model_90_10_split.h5...
Direct model loading failed for robust_model_90_10_split.h5, trying weights loading...
Completed evaluation for robust_model_90_10_split.h5


In [7]:
#print(metrics)
#print(all_metrics)

In [8]:
# Prepare results DataFrame
results = []
for model_name, metrics in all_metrics.items():
    result = {
        'Model': model_name,
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1 Score': metrics['f1'],
        'AUC (Micro)': metrics['auc_micro'],
        'AUC (Macro)': metrics['auc_macro']
    }
    
    # Add timing results
    for batch_size, timing in all_timing[model_name].items():
        result[f'Inference Time (ms) - Batch {batch_size}'] = timing['mean_time'] * 1000
    
    results.append(result)

# Save overall results
pd.DataFrame(results).to_csv(f'{output_dir}/model_comparison_results.csv', index=False)

# Save per-chord performance
chord_results = []
for model_name, metrics in all_metrics.items():
    for chord in CHORD_LIST:
        chord_results.append({
            'Model': model_name,
            'Chord': chord,
            'AUC': metrics['per_chord_auc'][chord]
        })

pd.DataFrame(chord_results).to_csv(f'{output_dir}/per_chord_performance.csv', index=False)

print(f"\nEvaluation complete. Results saved in {output_dir}/")


Evaluation complete. Results saved in evaluation_results_20241124_203829/
