# Phase 2: Expressive Techniques Classification 

## A) "Initial experiments"

### Recreate our baseline architecture (credit: Stefani et al.)

In [None]:
# Step 1: Gather Dependencies (ensure these are installed)
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from sklearn.model_selection import KFold

# ==============================================================================
# Step 2: Prepare Your Data Pipeline
# ==============================================================================

def extract_features_from_audio(file_path, n_mfcc=20, n_chroma=12, n_spectral_contrast=7):
    """
    Build a 180-dimensional summary feature vector for one audio file.

    The vector is the concatenation, in this order, of:
    - MFCC, 40 coefficients:      per-coefficient mean, then std -> 80 dims
    - Mel spectrogram, 40 bands:  per-band mean, then std        -> 80 dims
    - Chroma STFT, 10 bins:       per-bin mean, then std         -> 20 dims

    NOTE: `n_mfcc`, `n_chroma` and `n_spectral_contrast` are accepted but are
    not used by the body; the sizes listed above are hard-coded.

    Returns the 1-D NumPy feature vector, or None (after printing the error)
    when loading or analysing the file raises.
    """
    try:
        signal, sample_rate = librosa.load(file_path, mono=True)

        # Frame-level feature matrices; their order fixes the layout of the
        # final vector.
        frame_matrices = (
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40),
            librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=40),
            librosa.feature.chroma_stft(y=signal, sr=sample_rate, n_chroma=10),
        )

        # Collapse the time axis of every matrix into (mean, std) pairs.
        pieces = []
        for frames in frame_matrices:
            pieces.append(np.mean(frames, axis=1))
            pieces.append(np.std(frames, axis=1))

        return np.concatenate(pieces)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# ==============================================================================
# Step 3: Implement Model A in TensorFlow/Keras
# ==============================================================================

def create_model_A(input_shape=(180,), num_classes=8):
    """
    Build and compile the baseline fully-connected classifier ("Model A").

    Layout: an input layer of `input_shape`, four hidden blocks of
    Dense(800, relu) followed by BatchNormalization, and a softmax output
    with `num_classes` units. The model is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    """
    stack = [InputLayer(input_shape=input_shape)]

    # Four identical hidden blocks
    for _ in range(4):
        stack.extend([Dense(800, activation='relu'), BatchNormalization()])

    # Classification head
    stack.append(Dense(num_classes, activation='softmax'))

    network = Sequential(stack, name="Model_A_Stefani_et_al")
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Create an instance of the model with the function's default arguments
# (180 input features, 8 output classes) and print its layer summary.
model = create_model_A()
model.summary()

In [None]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see
gpu_devices = tf.config.list_physical_devices('GPU')

if not gpu_devices:
    # No accelerator detected: warn that training falls back to the CPU.
    print("‼️ TensorFlow did NOT find any GPUs.")
    print("Training will run on the CPU, which will be much slower.")
else:
    print(f"TensorFlow has found {len(gpu_devices)} GPU(s):")
    for device in gpu_devices:
        print(f" - {device}")


In [None]:
# ==============================================================================
# Step 4: Data Loading, Splitting, and Training (Full Implementation)
# ==============================================================================
import os
import json
from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint
# Make sure the checkpoint directory used by the training loop exists.
os.makedirs('models', exist_ok=True)

def load_and_split_data(data_dir, folds_json_path):
    """
    Extract features for every .wav under `data_dir` and assemble per-fold
    train/test arrays as described by the JSON file at `folds_json_path`.

    Layout read by this function:
    - `data_dir` holds one sub-directory per class; the sorted directory
      names define the integer labels (0..num_classes-1).
    - The JSON maps each fold name to {"train": [...], "test": [...]}, where
      every entry is looked up by bare filename (no directory part).

    Only feature vectors of length 180 are kept. Files listed in a split but
    without extracted features are skipped, and the number skipped is
    reported per fold. Because the lookup key is the bare filename, a name
    that appears in more than one class folder is reported and the later
    folder wins.

    Returns:
        (folded_data, num_classes) where `folded_data` is a list of dicts
        with keys 'fold_name', 'X_train', 'y_train', 'X_test', 'y_test'
        (NumPy arrays), ordered by sorted fold name.

    Raises:
        FileNotFoundError: propagated when `data_dir` or `folds_json_path`
        does not exist.
    """
    def _gather(file_names, split_label, fold_name):
        """Look up (features, label) pairs for one split, counting misses."""
        features, labels, missing = [], [], 0
        for name in file_names:
            entry = all_data.get(name)
            if entry is None:
                missing += 1
                continue
            features.append(entry[0])
            labels.append(entry[1])
        if missing:
            print(f" !! {fold_name} [{split_label}]: {missing} listed file(s) "
                  "have no extracted features and were skipped.")
        return features, labels

    # 1. Create a mapping from class name (folder name) to integer index
    label_names = sorted(d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d)))
    label_map = {name: i for i, name in enumerate(label_names)}
    print(f"Found {len(label_map)} classes: {label_names}")

    # 2. Pre-compute features for ALL wav files, keyed by bare filename so the
    #    fold definitions can be resolved with a simple lookup.
    all_data = {}
    print("\nProcessing all audio files... this may take a while.")
    for label_name, label_idx in label_map.items():
        class_dir = os.path.join(data_dir, label_name)
        # sorted() makes the processing order (and the winner of a name clash)
        # reproducible across file systems.
        for fname in tqdm(sorted(os.listdir(class_dir)), desc=f"Processing '{label_name}'"):
            if not fname.endswith('.wav'):
                continue
            feature_vec = extract_features_from_audio(os.path.join(class_dir, fname))
            if feature_vec is None or feature_vec.shape[0] != 180:
                continue
            if fname in all_data:
                print(f"Warning: filename '{fname}' also exists in another class folder; "
                      f"the entry from '{label_name}' replaces it.")
            all_data[fname] = (feature_vec, label_idx)

    print(f"\nSuccessfully processed {len(all_data)} audio files.")

    # 3. Load the JSON file that defines the train/test splits
    with open(folds_json_path, 'r') as f:
        splits = json.load(f)

    # 4. Create the dataset for each fold using the pre-computed features
    folded_data = []
    print("Building datasets for each fold based on JSON splits...")
    for fold_name in sorted(splits.keys()):
        X_train, y_train = _gather(splits[fold_name]['train'], 'train', fold_name)
        X_test, y_test = _gather(splits[fold_name]['test'], 'test', fold_name)

        folded_data.append({
            'fold_name': fold_name,
            'X_train': np.array(X_train), 'y_train': np.array(y_train),
            'X_test': np.array(X_test), 'y_test': np.array(y_test)
        })
        print(f" -> Loaded {fold_name}: {len(X_train)} train samples, {len(X_test)} test samples.")

    return folded_data, len(label_map)

# --- Main Execution ---
# Paths are blank placeholders: DATA_ROOT is the class-per-folder dataset
# directory and FOLDS_JSON is the JSON file describing the train/test splits.
DATA_ROOT = ""
FOLDS_JSON = ""

try:
    all_folded_data, num_classes = load_and_split_data(DATA_ROOT, FOLDS_JSON)

    # --- Training Loop ---
    histories = []
    for i, fold in enumerate(all_folded_data):
        print(f"\n----------- TRAINING FOLD {i+1}/{len(all_folded_data)} ({fold['fold_name']}) -----------")

        X_train, y_train = fold['X_train'], fold['y_train']
        X_test, y_test = fold['X_test'], fold['y_test']

        # A fold whose split resolved to no samples cannot be fitted or
        # validated; skip it instead of failing inside model.fit.
        if len(X_train) == 0 or len(X_test) == 0:
            print(f"‼️ Skipping {fold['fold_name']}: empty train or test split "
                  f"({len(X_train)} train / {len(X_test)} test samples).")
            continue

        # One-hot encode the labels for the current fold
        y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
        y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

        # Create a new, fresh model for each fold to avoid information leakage
        model = create_model_A(num_classes=num_classes)

        # Save the best model of this fold to a uniquely named file.
        # NOTE: the fold's test split doubles as the validation set below, so
        # the checkpoint chosen by val_accuracy is selected on test data.
        checkpoint_filepath = f'models/best_model_fold_{i+1}.h5'
        model_checkpoint_callback = ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,  # Save the full model (architecture + weights)
            monitor='val_accuracy',   # The metric to monitor
            mode='max',               # We want to MAXIMIZE accuracy
            save_best_only=True,      # Only save when the metric improves
            verbose=1)

        # Fit data to model, including the checkpoint callback
        history = model.fit(X_train, y_train_cat,
                            batch_size=64,
                            epochs=50,
                            validation_data=(X_test, y_test_cat),
                            verbose=1,
                            callbacks=[model_checkpoint_callback])

        histories.append(history)

    print("\n✅ Completed cross-validation training.")

except FileNotFoundError:
    print("="*60)
    print("‼️ ERROR: Data directory or folds.json not found.")
    print(f"Please ensure your data is in a folder named '{DATA_ROOT}'")
    print(f"and your splits file is at '{FOLDS_JSON}'.")
    print("Cannot proceed with training.")
    print("="*60)

### Baseline + regularization techniques

In [None]:
# ==============================================================================
# Step 1: All Imports
# ==============================================================================
import os
import json
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import KFold
from tqdm import tqdm

# ==============================================================================
# Step 2: Feature Extraction Function
# ==============================================================================
def extract_features_from_audio(file_path, n_mfcc=40, n_mels=40, n_chroma=10):
    """
    Summarise one audio file as a fixed-length feature vector.

    Three frame-level representations are computed (MFCC with `n_mfcc`
    coefficients, Mel spectrogram with `n_mels` bands, Chroma STFT with
    `n_chroma` bins). Each is reduced over time to its per-row mean and
    standard deviation, and the six resulting vectors are concatenated in the
    order mfcc-mean, mfcc-std, mel-mean, mel-std, chroma-mean, chroma-std.
    The length is 2 * (n_mfcc + n_mels + n_chroma), i.e. 180 with the defaults.

    Returns the 1-D NumPy vector, or None (after printing the error) when
    loading or analysing the file raises.
    """
    try:
        audio, rate = librosa.load(file_path, mono=True)

        frame_level = [
            librosa.feature.mfcc(y=audio, sr=rate, n_mfcc=n_mfcc),
            librosa.feature.melspectrogram(y=audio, sr=rate, n_mels=n_mels),
            librosa.feature.chroma_stft(y=audio, sr=rate, n_chroma=n_chroma),
        ]

        # For every matrix emit its mean first, then its std, along time.
        summary = [
            reducer(frames, axis=1)
            for frames in frame_level
            for reducer in (np.mean, np.std)
        ]
        return np.concatenate(summary)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# ==============================================================================
# Step 3: Regularized Model Creation Function
# ==============================================================================
def create_model_A(input_shape=(180,), num_classes=9, dropout_rate=0.4, l2_lambda=1e-4):
    """
    Build and compile the regularized variant of Model A.

    Layout: an input layer of `input_shape`, four hidden blocks of
    Dense(800, relu, L2 kernel penalty `l2_lambda`) -> BatchNormalization ->
    Dropout(`dropout_rate`), and a softmax output with `num_classes` units.
    Compiled with Adam, categorical cross-entropy and the accuracy metric.
    """
    stack = [InputLayer(input_shape=input_shape)]

    # Four identical regularized hidden blocks
    for _ in range(4):
        stack.extend([
            Dense(800, activation='relu', kernel_regularizer=l2(l2_lambda)),
            BatchNormalization(),
            Dropout(dropout_rate),
        ])

    # Classification head
    stack.append(Dense(num_classes, activation='softmax'))

    network = Sequential(stack, name="Model_A_Regularized")
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ==============================================================================
# Step 4: Data Loading and Splitting Function
# ==============================================================================
def load_and_split_data(data_dir, folds_json_path):
    """
    Extract features for every .wav under `data_dir` and assemble per-fold
    train/test arrays as described by the JSON file at `folds_json_path`.

    Layout read by this function:
    - `data_dir` holds one sub-directory per class; the sorted directory
      names define the integer labels (0..num_classes-1).
    - The JSON maps each fold name to {"train": [...], "test": [...]}, where
      every entry is looked up by bare filename (no directory part).

    Only feature vectors of length 180 are kept. Files listed in a split but
    without extracted features are skipped, and the number skipped is
    reported per fold. Because the lookup key is the bare filename, a name
    that appears in more than one class folder is reported and the later
    folder wins.

    Returns:
        (folded_data, num_classes) where `folded_data` is a list of dicts
        with keys 'fold_name', 'X_train', 'y_train', 'X_test', 'y_test'
        (NumPy arrays), ordered by sorted fold name.

    Raises:
        FileNotFoundError: propagated when `data_dir` or `folds_json_path`
        does not exist.
    """
    def _gather(file_names, split_label, fold_name):
        """Look up (features, label) pairs for one split, counting misses."""
        features, labels, missing = [], [], 0
        for name in file_names:
            entry = all_data.get(name)
            if entry is None:
                missing += 1
                continue
            features.append(entry[0])
            labels.append(entry[1])
        if missing:
            print(f" !! {fold_name} [{split_label}]: {missing} listed file(s) "
                  "have no extracted features and were skipped.")
        return features, labels

    # Class name (folder name) -> integer label
    label_names = sorted(d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d)))
    label_map = {name: i for i, name in enumerate(label_names)}
    print(f"Found {len(label_map)} classes: {label_names}")

    # Pre-compute features for all wav files, keyed by bare filename.
    all_data = {}
    print("\nProcessing all audio files... this may take a while.")
    for label_name, label_idx in label_map.items():
        class_dir = os.path.join(data_dir, label_name)
        # sorted() makes the processing order (and the winner of a name clash)
        # reproducible across file systems.
        for fname in tqdm(sorted(os.listdir(class_dir)), desc=f"Processing '{label_name}'"):
            if not fname.endswith('.wav'):
                continue
            feature_vec = extract_features_from_audio(os.path.join(class_dir, fname))
            if feature_vec is None or feature_vec.shape[0] != 180:
                continue
            if fname in all_data:
                print(f"Warning: filename '{fname}' also exists in another class folder; "
                      f"the entry from '{label_name}' replaces it.")
            all_data[fname] = (feature_vec, label_idx)

    print(f"\nSuccessfully processed {len(all_data)} audio files.")

    with open(folds_json_path, 'r') as f:
        splits = json.load(f)

    # Resolve every fold's file lists against the pre-computed features.
    folded_data = []
    print("Building datasets for each fold based on JSON splits...")
    for fold_name in sorted(splits.keys()):
        X_train, y_train = _gather(splits[fold_name]['train'], 'train', fold_name)
        X_test, y_test = _gather(splits[fold_name]['test'], 'test', fold_name)

        folded_data.append({
            'fold_name': fold_name,
            'X_train': np.array(X_train), 'y_train': np.array(y_train),
            'X_test': np.array(X_test), 'y_test': np.array(y_test)
        })
        print(f" -> Loaded {fold_name}: {len(X_train)} train samples, {len(X_test)} test samples.")

    return folded_data, len(label_map)


In [None]:
# ==============================================================================
# Step 5: Main Training Execution
# ==============================================================================

# --- Configuration ---
# Blank placeholders: DATA_ROOT is the class-per-folder dataset directory and
# FOLDS_JSON the JSON file with the train/test splits.
DATA_ROOT = ""
FOLDS_JSON = ""
MODEL_OUTPUT_DIR = 'models_regularized'
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# --- Verification and Training ---
try:
    # Print model summary to verify architecture (built with the function's
    # default arguments, not with the class count discovered below).
    print("Model Architecture:")
    model_instance = create_model_A()
    model_instance.summary()

    # Load data
    all_folded_data, num_classes = load_and_split_data(DATA_ROOT, FOLDS_JSON)
    print("num_classes:", num_classes)

    # --- Training Loop ---
    histories = []
    for i, fold_data in enumerate(all_folded_data):
        fold_name = fold_data['fold_name']
        print(f"\n----------- TRAINING FOLD {i+1}/{len(all_folded_data)} ({fold_name}) -----------")

        X_train, y_train = fold_data['X_train'], fold_data['y_train']
        X_test, y_test = fold_data['X_test'], fold_data['y_test']

        # A fold whose split resolved to no samples cannot be fitted or
        # validated; skip it instead of failing inside model.fit.
        if len(X_train) == 0 or len(X_test) == 0:
            print(f"‼️ Skipping {fold_name}: empty train or test split "
                  f"({len(X_train)} train / {len(X_test)} test samples).")
            continue

        y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
        y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

        # Fresh model per fold, with stronger regularization than the defaults
        model = create_model_A(num_classes=num_classes, dropout_rate=0.5, l2_lambda=1e-3)

        # Keep the epoch with the highest validation accuracy on disk.
        # NOTE: the fold's test split doubles as the validation set below, so
        # both callbacks make their decisions on test data.
        checkpoint_filepath = os.path.join(MODEL_OUTPUT_DIR, f'best_model_{fold_name}.h5')
        model_checkpoint_callback = ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            monitor='val_accuracy',
            mode='max',
            save_best_only=True)

        # Stop once validation loss has not improved for 10 epochs; the large
        # epoch count in fit() is only an upper bound.
        early_stopping_callback = EarlyStopping(
            monitor='val_loss',
            patience=10,
            verbose=1,
            restore_best_weights=True)

        history = model.fit(X_train, y_train_cat,
                            batch_size=64,
                            epochs=1000,
                            validation_data=(X_test, y_test_cat),
                            verbose=0,
                            callbacks=[model_checkpoint_callback, early_stopping_callback])

        histories.append(history)

    print(f"\n✅ Completed cross-validation training. Best models saved in '{MODEL_OUTPUT_DIR}/'.")

except FileNotFoundError:
    print("="*60)
    print("‼️ ERROR: Data directory or folds.json not found.")
    print(f"Please ensure your data is in a folder named '{DATA_ROOT}'")
    print(f"and your splits file is at '{FOLDS_JSON}'.")
    print("Cannot proceed with training.")
    print("="*60)

### Inference on IDMT-SMT-Guitar dataset

In [None]:
import os
import xml.etree.ElementTree as ET
import numpy as np
import librosa
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm

# ==============================================================================
# 0. Configuration
# ==============================================================================

# Blank placeholders: path of the saved model to load and the directories
# holding the IDMT audio files and their XML annotations.
MODEL_TO_EVALUATE = ""
IDMT_AUDIO_DIR = ""
IDMT_XML_DIR = ""

# Class list used to translate names to integer indices; 'other' is appended
# as the last entry so it has an index of its own.
MAGCIL_CLASSES = [
    'alternate picking',
    'bend',
    'hammer on',
    'legato',
    'pull off',
    'slide',
    'sweep picking',
    'tapping',
    'vibrato',
    'other',
]

# Classes reported in the evaluation, and their indices in MAGCIL_CLASSES
EVAL_CLASSES = ['bend', 'slide', 'vibrato', 'other']
EVAL_CLASS_INDICES = list(map(MAGCIL_CLASSES.index, EVAL_CLASSES))
OTHER_IDX = MAGCIL_CLASSES.index('other')

# Lower-cased IDMT technique codes -> index in MAGCIL_CLASSES.
# Three codes have a dedicated class; the remaining listed codes go to 'other'.
IDMT_TO_MAGCIL_IDX = {
    code: MAGCIL_CLASSES.index(class_name)
    for code, class_name in (('be', 'bend'), ('sl', 'slide'), ('vi', 'vibrato'))
}
IDMT_TO_MAGCIL_IDX.update(
    dict.fromkeys(('pi', 'ha', 'dn', 'no', 'fs', 'mu'), OTHER_IDX)  # Add more if needed
)

print(f"Model evaluated on these classes: {EVAL_CLASSES}")
print(f"IDMT to model index mapping: {IDMT_TO_MAGCIL_IDX}")

# ==============================================================================
# 1. Helper Functions
# ==============================================================================

def extract_features_from_segmented_audio(y_segment, sr):
    """
    Compute the summary feature vector for an already-loaded audio segment.

    Segments shorter than 2048 samples are rejected. Otherwise the per-row
    mean and std over time of MFCC (40), Mel spectrogram (40 bands) and
    Chroma STFT (10 bins) are concatenated in the order mfcc-mean, mfcc-std,
    mel-mean, mel-std, chroma-mean, chroma-std.

    Returns the 1-D NumPy vector, or None when the segment is too short or
    any step raises (errors are swallowed without a message).
    """
    try:
        if len(y_segment) < 2048:
            return None

        frame_level = (
            librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=40),
            librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=40),
            librosa.feature.chroma_stft(y=y_segment, sr=sr, n_chroma=10),
        )

        # mean first, then std, for each matrix in turn
        stats = []
        for frames in frame_level:
            stats.extend((np.mean(frames, axis=1), np.std(frames, axis=1)))

        return np.concatenate(stats)
    except Exception:
        return None

def parse_idmt_xml(xml_path):
    """
    Yield one dict per usable <event> element of an annotation XML file.

    Each yielded dict has the keys 'onset' and 'offset' (floats parsed from
    <onsetSec> / <offsetSec>) and 'technique' (the <expressionStyle> text,
    stripped and lower-cased).

    Events are skipped, rather than aborting the whole file, when any of the
    three tags is missing or empty or when a time value is not a number.
    If the file is not well-formed XML a warning is printed and nothing is
    yielded.
    """
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError as err:
        print(f"Warning: could not parse '{xml_path}': {err}")
        return

    for event in root.findall('.//event'):
        # findtext() gives None for a missing tag and '' for an empty one.
        onset_text = event.findtext('onsetSec')
        offset_text = event.findtext('offsetSec')
        technique_text = event.findtext('expressionStyle')
        if onset_text is None or offset_text is None or technique_text is None:
            continue

        technique = technique_text.lower().strip()
        if not technique:
            continue

        try:
            onset = float(onset_text)
            offset = float(offset_text)
        except ValueError:
            # Empty or non-numeric time stamp: the event cannot be located.
            continue

        yield {'onset': onset, 'offset': offset, 'technique': technique}

# ==============================================================================
# 2. Main Evaluation Script
# ==============================================================================

try:
    print(f"\nLoading model for evaluation: {MODEL_TO_EVALUATE}")
    model = tf.keras.models.load_model(MODEL_TO_EVALUATE)
    ground_truth_labels = []
    predicted_labels = []

    # The training feature extractor calls librosa.load() without `sr`, i.e.
    # at librosa's default of 22050 Hz. Loading the evaluation audio at its
    # native rate would shift the MFCC/Mel/Chroma features away from what the
    # model was trained on, so resample to the same rate here.
    TRAINING_SAMPLE_RATE = 22050

    # Indices of the evaluated classes other than the trailing 'other'
    specific_eval_indices = EVAL_CLASS_INDICES[:-1]

    xml_files = [f for f in os.listdir(IDMT_XML_DIR) if f.endswith('.xml')]
    print(f"\nFound {len(xml_files)} XML files. Starting event-level evaluation...")

    for xml_file in tqdm(xml_files, desc="Processing Files"):
        xml_path = os.path.join(IDMT_XML_DIR, xml_file)
        audio_filename = xml_file.replace('.xml', '.wav')
        audio_path = os.path.join(IDMT_AUDIO_DIR, audio_filename)
        if not os.path.exists(audio_path):
            continue

        # Load audio once per file
        y_full, sr = librosa.load(audio_path, sr=TRAINING_SAMPLE_RATE)

        # Collect the features of every usable note first, so the model is
        # called once per file instead of once per note.
        file_features = []
        file_truth = []
        for note_event in parse_idmt_xml(xml_path):
            technique_name = note_event['technique']
            # Map ALL events to a class index, defaulting to 'other'
            ground_truth_index = IDMT_TO_MAGCIL_IDX.get(technique_name, OTHER_IDX)
            start_sample = int(note_event['onset'] * sr)
            end_sample = int(note_event['offset'] * sr)
            note_audio_segment = y_full[start_sample:end_sample]
            features = extract_features_from_segmented_audio(note_audio_segment, sr)
            if features is not None:
                file_features.append(features)
                file_truth.append(ground_truth_index)

        if not file_features:
            continue

        prediction_probs = model.predict(np.stack(file_features), verbose=0)
        for ground_truth_index, pred_idx in zip(file_truth, np.argmax(prediction_probs, axis=1)):
            pred_idx = int(pred_idx)
            # If model predicts anything outside EVAL set, map to OTHER
            if pred_idx not in specific_eval_indices:
                pred_idx = OTHER_IDX
            ground_truth_labels.append(ground_truth_index)
            predicted_labels.append(pred_idx)

    # ==============================================================================
    # 3. Report Results
    # ==============================================================================
    print("\n--- EVALUATION COMPLETE ---")
    if not ground_truth_labels:
        print("‼️ No valid notes were found to evaluate. Check paths and the mapping.")
    else:
        print(f"Evaluated {len(ground_truth_labels)} note events.")
        print(f"Overall Accuracy: {accuracy_score(ground_truth_labels, predicted_labels):.2%}\n")
        print("Classification Report:")
        print(classification_report(
            ground_truth_labels, predicted_labels, labels=EVAL_CLASS_INDICES, target_names=EVAL_CLASSES, zero_division=0))
        print("\nConfusion Matrix:")
        print(confusion_matrix(ground_truth_labels, predicted_labels, labels=EVAL_CLASS_INDICES))

except FileNotFoundError:
    print("="*60)
    print(f"‼️ ERROR: Model or Dataset Not Found! Check paths: '{MODEL_TO_EVALUATE}', '{IDMT_AUDIO_DIR}', '{IDMT_XML_DIR}'")
    print("="*60)


### Final output is not great (shows severe distributional shift)

| Technique         | Test Precision | Test Recall | Test F1  | Test Support | Train Precision | Train Recall | Train F1 | Train Support |
|-------------------|:-------------:|:-----------:|:--------:|:------------:|:--------------:|:------------:|:--------:|:-------------:|
| bend              | 0.01          | 0.57        | 0.02     | 54           | 1.00           | 1.00         | 1.00     | 9             |
| slide             | 0.05          | 0.13        | 0.08     | 93           | 1.00           | 0.89         | 0.94     | 9             |
| vibrato           | 0.00          | 0.00        | 0.00     | 117          | 1.00           | 1.00         | 1.00     | 9             |
| alternate picking |                |             |          |              | 1.00           | 1.00         | 1.00     | 18            |
| hammer on         |                |             |          |              | 0.73           | 0.89         | 0.80     | 9             |
| legato            |                |             |          |              | 1.00           | 1.00         | 1.00     | 18            |
| pull off          |                |             |          |              | 0.86           | 0.67         | 0.75     | 9             |
| sweep picking     |                |             |          |              | 1.00           | 0.89         | 0.94     | 9             |
| tapping           |                |             |          |              | 0.90           | 1.00         | 0.95     | 18            |
| other             | 0.98           | 0.32        | 0.49     | 4013         |                |              |          |               |
|-------------------|---------------|-------------|----------|--------------|----------------|--------------|----------|---------------|
| accuracy          |               |             | 0.31     | 4277         |                |              | 0.94     | 108           |
| macro avg         | 0.26           | 0.26        | 0.15     | 4277         | 0.94           | 0.93         | 0.93     | 108           |
| weighted avg      | 0.92           | 0.31        | 0.46     | 4277         | 0.95           | 0.94         | 0.94     | 108           |
| Macro F1          |                |             |          |              |                |              | 0.9311   |               |


## B) Create a unified dataset (IDMT, AGPT, Magcil)

### 1. magcil dataset

In [None]:
import os
import shutil
from tqdm import tqdm

# --- Configuration ---

# 1. Location of the original MAGCIL dataset (one folder per technique)
MAGCIL_INPUT_DIR = ""

# 2. Destination of the reorganized copy (kept separate from the input)
MAGCIL_OUTPUT_DIR = ""

# 3. Original MAGCIL folder name -> unified class folder name.
#    Source folders that are NOT a key here (e.g. 'tapping') are placed in
#    the 'other' folder by the processing script.
MAGCIL_TO_UNIFIED = {
    # Picking techniques stay as two separate classes
    **{
        'alternate picking': 'alternate_picking',
        'sweep picking': 'sweep_picking',
    },
    # Three source folders are merged into the single 'legato' class
    **dict.fromkeys(('hammer on', 'pull off', 'legato'), 'legato'),
    # One-to-one mappings for the primary classes
    **{
        'slide': 'slide',
        'bend': 'bend',
        'vibrato': 'vibrato',
        'palm mute': 'palm_mute',
        'staccato': 'staccato',
        'harmonics': 'harmonics',
    },
}

# --- Main Processing Script ---
# Copies every .wav of the original MAGCIL folders into a fresh directory
# tree with one folder per unified class (see MAGCIL_TO_UNIFIED).

print(f"Reorganizing MAGCIL with new 11-class mapping into '{MAGCIL_OUTPUT_DIR}'")

# Determine all unique destination folders from the mapping values
unified_classes = set(MAGCIL_TO_UNIFIED.values())
unified_classes.add('other')   # Add 'other' for any unmapped source folders

# List the source folders BEFORE touching the output directory, so a wrong
# input path cannot wipe a previously built dataset.
try:
    source_class_folders = [d for d in os.listdir(MAGCIL_INPUT_DIR) if os.path.isdir(os.path.join(MAGCIL_INPUT_DIR, d))]
except FileNotFoundError:
    print(f"‼️ ERROR: Input directory not found at '{MAGCIL_INPUT_DIR}'. Please check the path.")
    source_class_folders = []

# Cleaning the output directory must never delete the input data itself.
input_abs = os.path.abspath(MAGCIL_INPUT_DIR)
output_abs = os.path.abspath(MAGCIL_OUTPUT_DIR)
if source_class_folders and (input_abs == output_abs or input_abs.startswith(output_abs + os.sep)):
    print(f"‼️ ERROR: Output directory '{MAGCIL_OUTPUT_DIR}' is or contains the input directory "
          f"'{MAGCIL_INPUT_DIR}'. Refusing to delete it.")
    source_class_folders = []

print(f"Found {len(source_class_folders)} source folders. Starting reorganization...")

if source_class_folders:
    # Start with a clean directory to ensure no old files remain
    if os.path.exists(MAGCIL_OUTPUT_DIR):
        shutil.rmtree(MAGCIL_OUTPUT_DIR)
    os.makedirs(MAGCIL_OUTPUT_DIR)

    # Create all necessary destination directories
    for class_name in unified_classes:
        os.makedirs(os.path.join(MAGCIL_OUTPUT_DIR, class_name), exist_ok=True)

    renamed_files = 0

    # Iterate through each source folder and copy its files based on the mapping
    for source_folder in tqdm(source_class_folders, desc="Processing Classes"):

        # Folders without an entry in the mapping default to 'other'.
        unified_class = MAGCIL_TO_UNIFIED.get(source_folder, 'other')

        source_path = os.path.join(MAGCIL_INPUT_DIR, source_folder)
        destination_path = os.path.join(MAGCIL_OUTPUT_DIR, unified_class)

        # Copy all .wav files from the source to the unified destination
        for filename in os.listdir(source_path):
            if not filename.endswith('.wav'):
                continue
            target_path = os.path.join(destination_path, filename)
            if os.path.exists(target_path):
                # The output tree started empty, so an existing file means
                # another source folder merged into this class already used
                # the name. Prefix the source folder instead of overwriting.
                prefix = source_folder.replace(' ', '_')
                target_path = os.path.join(destination_path, f"{prefix}__{filename}")
                renamed_files += 1
            shutil.copy2(os.path.join(source_path, filename), target_path)

    if renamed_files:
        print(f"Note: {renamed_files} file(s) were renamed with their source folder as prefix "
              "to avoid overwriting same-named files.")
    print(f"\n✅ Reorganization complete! The new dataset is ready at '{MAGCIL_OUTPUT_DIR}'.")

### 2. IDMT dataset

In [None]:
import os
import xml.etree.ElementTree as ET
import librosa
import soundfile as sf
from tqdm import tqdm

# --- Configuration ---
# Blank placeholders for the IDMT audio folder, its XML annotation folder and
# the folder that receives the cropped segments.
IDMT_AUDIO_DIR = ""
IDMT_XML_DIR = ""
OUTPUT_DATA_DIR = ""
# Length in seconds of the window cropped at each note onset (400 ms)
FIXED_WINDOW_DURATION_SEC = 0.4

# --- Unified class mappings for the 11-class structure ---

# <expressionStyle> code -> unified class.
# There is deliberately no 'legato' entry: per the original note, IDMT does
# not distinguish 'hammer on' / 'pull off'.
# 'NO' maps to None as a marker meaning "use the excitation style instead".
IDMT_EXPRESSION_TO_UNIFIED = dict(
    BE='bend',
    SL='slide',
    VI='vibrato',
    ST='staccato',
    HA='harmonics',
    NO=None,
)

# <excitationStyle> code -> unified class; consulted ONLY when the expression
# style is 'NO'. Per the original note, IDMT does not separate 'alternate'
# from 'sweep' picking, so 'PK' goes to the general 'picking' class.
IDMT_EXCITATION_TO_UNIFIED = dict(
    PK='picking',
    MU='palm_mute',
)

# --- Main Processing Script ---
# For every annotated note event, crops a fixed-length window starting at the
# note onset and writes it as a .wav into the folder of its unified class.
import shutil  # needed for rmtree below; not among this cell's imports

print(f"Processing IDMT dataset for the new 11-class structure.")
print(f"Output will be saved to '{OUTPUT_DATA_DIR}'")

# Folders for all possible unified classes, plus 'other'
all_target_classes = set(IDMT_EXPRESSION_TO_UNIFIED.values()) | set(IDMT_EXCITATION_TO_UNIFIED.values())
all_target_classes.discard(None)
all_target_classes.add('other')
# Manually add the classes that only exist in MAGCIL to ensure the folder structure is identical
all_target_classes.update(['alternate_picking', 'sweep_picking', 'legato'])

# List the annotations BEFORE touching the output directory, so a wrong input
# path cannot wipe previously generated segments.
try:
    xml_files = [f for f in os.listdir(IDMT_XML_DIR) if f.endswith('.xml')]
except FileNotFoundError:
    print(f"‼️ ERROR: Could not find IDMT directories. Check paths for audio ('{IDMT_AUDIO_DIR}') and XML ('{IDMT_XML_DIR}').")
    xml_files = None

if xml_files is not None:
    # Start with a clean directory
    if os.path.exists(OUTPUT_DATA_DIR):
        shutil.rmtree(OUTPUT_DATA_DIR)
    os.makedirs(OUTPUT_DATA_DIR)

    for unified_class in all_target_classes:
        os.makedirs(os.path.join(OUTPUT_DATA_DIR, unified_class), exist_ok=True)

    segments_written = 0

    # Main loop
    for xml_file in tqdm(xml_files, desc="Processing XML annotations"):
        audio_file = xml_file.replace('.xml', '.wav')
        audio_path = os.path.join(IDMT_AUDIO_DIR, audio_file)
        xml_path = os.path.join(IDMT_XML_DIR, xml_file)

        if not os.path.exists(audio_path):
            continue

        try:
            y, sr = librosa.load(audio_path, sr=None)
            root = ET.parse(xml_path).getroot()
        except (FileNotFoundError, ET.ParseError) as err:
            # One unreadable file should not abort the whole dataset build.
            print(f"Warning: skipping '{xml_file}': {err}")
            continue

        event_idx = 0
        for event in root.findall('.//event'):
            # findtext() gives None for a missing tag and '' for an empty one.
            expression_text = event.findtext('expressionStyle')
            excitation_text = event.findtext('excitationStyle')
            onset_text = event.findtext('onsetSec')

            if expression_text is None or excitation_text is None or onset_text is None:
                continue

            try:
                onset_sec = float(onset_text)
            except ValueError:
                # Empty or non-numeric onset: the event cannot be located.
                continue

            expression_code = expression_text.strip().upper()
            excitation_code = excitation_text.strip().upper()

            if expression_code != 'NO':
                # Primary case: a specific expressive technique; unmapped
                # codes go to 'other'.
                unified_class = IDMT_EXPRESSION_TO_UNIFIED.get(expression_code) or 'other'
            else:
                # Fallback case: classify by the excitation style instead.
                unified_class = IDMT_EXCITATION_TO_UNIFIED.get(excitation_code) or 'other'

            # Fixed-length window from the onset, clipped at the end of file
            start_sample = int(onset_sec * sr)
            end_sample = start_sample + int(FIXED_WINDOW_DURATION_SEC * sr)
            end_sample = min(end_sample, len(y))

            segment = y[start_sample:end_sample]

            # Skip windows shorter than 2048 samples
            if len(segment) < 2048:
                continue

            fname = f"{os.path.splitext(audio_file)[0]}_{event_idx}_{unified_class}.wav"
            event_idx += 1

            out_dir = os.path.join(OUTPUT_DATA_DIR, unified_class)
            out_path = os.path.join(out_dir, fname)
            sf.write(out_path, segment, sr)
            segments_written += 1

    print(f"Wrote {segments_written} segment(s) from {len(xml_files)} annotation file(s).")
    # Reported only when the run actually happened, not after the error path.
    print(f"\n✅ All IDMT segments cropped and saved in file-level folders at '{OUTPUT_DATA_DIR}'.")

### 3. AGPT dataset

#### 3.1 Quick check to find the median duration

In [None]:
# a quick sanity-check for the duration of file-level annotations
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# --- Configuration ---
# Root of the reorganized MAGCIL dataset; every sub-folder is treated as one class.
MAGCIL_UNIFIED_DIR = ""

# --- Step 1: Collect the duration of every .wav file ---

# Durations are also grouped per class so per-class statistics stay possible later.
durations_by_class = {}
all_durations = []

try:
    class_folders = [
        entry
        for entry in os.listdir(MAGCIL_UNIFIED_DIR)
        if os.path.isdir(os.path.join(MAGCIL_UNIFIED_DIR, entry))
    ]

    print(f"Analyzing audio file durations in '{MAGCIL_UNIFIED_DIR}'...")

    for class_folder in tqdm(class_folders, desc="Scanning Classes"):
        class_path = os.path.join(MAGCIL_UNIFIED_DIR, class_folder)
        class_durations = []
        durations_by_class[class_folder] = class_durations

        for filename in os.listdir(class_path):
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(class_path, filename)
            try:
                duration = librosa.get_duration(path=file_path)
                class_durations.append(duration)
                all_durations.append(duration)
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"Warning: Could not process file {file_path}. Error: {e}")

except FileNotFoundError:
    print(f"‼️ ERROR: Directory not found at '{MAGCIL_UNIFIED_DIR}'. Please check the path.")

# --- Step 2: Calculate and Print Statistics ---

if not all_durations:
    print("\nNo audio files were found to analyze.")
else:
    # From here on `all_durations` is a NumPy array rather than a list.
    all_durations = np.array(all_durations)
    mean_duration = np.mean(all_durations)
    median_duration = np.median(all_durations)

    print("\n--- Overall Duration Statistics for MAGCIL Dataset ---")
    print(f"Total Samples Analyzed: {len(all_durations)}")
    print(f"Mean Duration:      {mean_duration:.3f} seconds")
    print(f"Median Duration:    {median_duration:.3f} seconds")
    print(f"Min Duration:       {np.min(all_durations):.3f} seconds")
    print(f"Max Duration:       {np.max(all_durations):.3f} seconds")
    print("-" * 20)
    for pct in (25, 75, 95):
        print(f"{pct}th Percentile:    {np.percentile(all_durations, pct):.3f} seconds")

    # --- Step 3: Plot the Distribution ---

    print("\nGenerating duration histogram...")
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 6))

    sns.histplot(all_durations, bins=50, kde=True, ax=ax)

    # Mark the mean (dashed red) and the median (solid green) on the histogram.
    ax.axvline(mean_duration, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_duration:.2f}s')
    ax.axvline(median_duration, color='green', linestyle='-', linewidth=2, label=f'Median: {median_duration:.2f}s')

    ax.set_title('Distribution of Audio File Durations in MAGCIL Dataset', fontsize=16)
    ax.set_xlabel('Duration (seconds)', fontsize=12)
    ax.set_ylabel('Number of Files', fontsize=12)
    ax.legend()

    plt.show()

In [None]:
import os
import xml.etree.ElementTree as ET
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# --- Configuration ---
# Use the path to your IDMT XML annotations
IDMT_XML_DIR = ""

# --- Step 1: Parse all XML files and collect event durations ---

all_durations = []
# Events whose onset/offset text is empty or non-numeric. They are counted and
# skipped so that a single bad annotation cannot abort the whole scan.
malformed_events = 0

try:
    xml_files = [f for f in os.listdir(IDMT_XML_DIR) if f.endswith('.xml')]

    print(f"Analyzing note event durations from {len(xml_files)} XML files in '{IDMT_XML_DIR}'...")

    for xml_file in tqdm(xml_files, desc="Parsing XML Files"):
        xml_path = os.path.join(IDMT_XML_DIR, xml_file)
        try:
            tree = ET.parse(xml_path)
        except ET.ParseError as e:
            print(f"Warning: Could not parse file {xml_file}. Error: {e}")
            continue

        for event in tree.getroot().findall('.//event'):
            onset_tag = event.find('onsetSec')
            offset_tag = event.find('offsetSec')

            # Both tags are required to compute a duration.
            if onset_tag is None or offset_tag is None:
                continue

            try:
                # float(None) raises TypeError (empty tag); bad text raises ValueError.
                duration = float(offset_tag.text) - float(onset_tag.text)
            except (TypeError, ValueError):
                malformed_events += 1
                continue

            # Keep only strictly positive durations.
            if duration > 0:
                all_durations.append(duration)

except FileNotFoundError:
    print(f"‼️ ERROR: Directory not found at '{IDMT_XML_DIR}'. Please check the path.")

if malformed_events:
    print(f"Warning: Skipped {malformed_events} event(s) with missing or non-numeric onset/offset values.")

# --- Step 2: Calculate and Print Statistics ---

if all_durations:
    all_durations = np.array(all_durations)

    print("\n--- Note Event Duration Statistics for IDMT Dataset ---")
    print(f"Total Note Events Analyzed: {len(all_durations)}")
    print(f"Mean Duration:      {np.mean(all_durations):.3f} seconds")
    print(f"Median Duration:    {np.median(all_durations):.3f} seconds")
    print(f"Min Duration:       {np.min(all_durations):.3f} seconds")
    print(f"Max Duration:       {np.max(all_durations):.3f} seconds")
    print("-" * 20)
    print(f"25th Percentile:    {np.percentile(all_durations, 25):.3f} seconds")
    print(f"75th Percentile:    {np.percentile(all_durations, 75):.3f} seconds")
    print(f"95th Percentile:    {np.percentile(all_durations, 95):.3f} seconds")

    # --- Step 3: Plot the Distribution ---

    print("\nGenerating duration histogram...")
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 6))

    # Only events shorter than 2 seconds are drawn so the bulk of the data is
    # readable; the mean/median lines below are still computed over ALL events.
    plot_durations = all_durations[all_durations < 2]
    sns.histplot(plot_durations, bins=50, kde=True, ax=ax)

    ax.axvline(np.mean(all_durations), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(all_durations):.2f}s')
    ax.axvline(np.median(all_durations), color='green', linestyle='-', linewidth=2, label=f'Median: {np.median(all_durations):.2f}s')

    ax.set_title('Distribution of Note Event Durations in IDMT Dataset', fontsize=16)
    ax.set_xlabel('Duration (seconds)', fontsize=12)
    ax.set_ylabel('Number of Notes', fontsize=12)
    ax.legend()

    plt.show()

else:
    print("\nNo note events were found to analyze.")

#### 3.2 AGPT dataset

In [None]:
import os
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
import shutil

# --- Configuration ---

# 1. Path to the root of the aGPTset dataset
AGPT_ROOT_DIR = ""

# 2. Path for the new, reorganized file-level dataset
OUTPUT_DATA_DIR = ""

# 3. Use the 400ms fixed window duration
FIXED_WINDOW_DURATION_SEC = 0.4

# --- Main Processing Script ---

print(f"Processing aGPTset from '{AGPT_ROOT_DIR}' for the 11-class structure.")
print(f"Saving unified file-level data to '{OUTPUT_DATA_DIR}'")

# --- Step 1: Create the mapping from aGPT ID -> Unified Class Name ---

# Load the CSV that maps technique IDs to their names
techniques_csv_path = os.path.join(AGPT_ROOT_DIR, 'metadata', 'expressive_techniques.csv')
techniques_df = pd.read_csv(techniques_csv_path)
id_to_name = techniques_df.set_index('expressive_technique_id')['name'].to_dict()

# Mapping from aGPT technique names to the unified class names.
agpt_name_to_unified = {
    'Bending': 'bend',
    'Hammer-on': 'legato',  # aGPT 'Hammer-on' maps to the unified 'legato' class
    'Staccato': 'staccato',
    'Vibrato': 'vibrato',
    'Palm Mute': 'palm_mute',
    'Natural Harmonics': 'harmonics',
    'Pick Near Bridge': 'picking',  # aGPT picking types map to the general 'picking' class
    'Pick Over the Soundhole': 'picking',
    # Any technique name missing from this map (e.g. the percussive ones) falls
    # back to 'other' when the ID mapping is built below.
}

# Combine the two maps to get a final ID -> Unified Class mapping
AGPT_ID_TO_UNIFIED = {
    tech_id: agpt_name_to_unified.get(tech_name, 'other')
    for tech_id, tech_name in id_to_name.items()
}

print("\nFinal mapping from aGPT ID to Unified Class:")
print(AGPT_ID_TO_UNIFIED)

# --- Step 2: Create all necessary output directories ---

# Start with a clean directory (any previous output is deleted).
if os.path.exists(OUTPUT_DATA_DIR):
    shutil.rmtree(OUTPUT_DATA_DIR)
os.makedirs(OUTPUT_DATA_DIR)

# Explicitly define all 11 target classes to ensure identical folder structure
all_unified_classes = [
    'picking', 'sweep_picking', 'alternate_picking', 'legato', 'slide',
    'bend', 'vibrato', 'palm_mute', 'staccato', 'harmonics', 'other'
]

for unified_class in all_unified_classes:
    os.makedirs(os.path.join(OUTPUT_DATA_DIR, unified_class), exist_ok=True)

# --- Step 3: Process the main note labels CSV ---
note_labels_csv_path = os.path.join(AGPT_ROOT_DIR, 'metadata', 'note_labels.csv')
notes_df = pd.read_csv(note_labels_csv_path)

# Maps audio file name -> (samples, sample_rate), or (None, None) when the file
# is missing or unreadable, so each file is probed and decoded at most once.
# NOTE(review): every decoded file stays in memory until the loop ends.
audio_cache = {}

print(f"\nProcessing {len(notes_df)} labeled notes from 'note_labels.csv'...")

for index, row in tqdm(notes_df.iterrows(), total=len(notes_df), desc="Processing Notes"):
    audio_filename = row['audio_file_path']

    technique_id = row['expressive_technique_id']
    unified_class = AGPT_ID_TO_UNIFIED.get(technique_id, 'other')

    if audio_filename not in audio_cache:
        full_audio_path = os.path.join(AGPT_ROOT_DIR, 'data', 'audio', audio_filename)
        if not os.path.exists(full_audio_path):
            # Report once and remember the miss instead of re-checking the disk
            # for every note that references this file.
            print(f"Warning: audio file not found, skipping its notes: {full_audio_path}")
            audio_cache[audio_filename] = (None, None)
            continue
        try:
            # sr=None keeps the file's native sample rate.
            audio_cache[audio_filename] = librosa.load(full_audio_path, sr=None)
        except Exception as e:
            print(f"Error loading {audio_filename}: {e}")
            audio_cache[audio_filename] = (None, None)
            continue

    y, sr = audio_cache[audio_filename]
    if y is None:
        continue

    onset_sec = row['onset_label_seconds']
    # A missing onset would crash int(); a negative one would slice from the
    # end of the signal. Neither can yield a valid segment.
    if pd.isna(onset_sec) or onset_sec < 0:
        continue

    start_sample = int(onset_sec * sr)
    end_sample = min(start_sample + int(FIXED_WINDOW_DURATION_SEC * sr), len(y))

    segment = y[start_sample:end_sample]

    # Drop segments shorter than 2048 samples.
    # NOTE(review): presumably chosen to match the default FFT size used later
    # during feature extraction — confirm.
    if len(segment) < 2048:
        continue

    # Use only the file's base name: a sub-directory inside 'audio_file_path'
    # would otherwise end up in the output name and point at a folder that
    # does not exist under the class directory.
    base_name = os.path.splitext(os.path.basename(audio_filename))[0]
    new_fname = f"{base_name}_note{index}_{unified_class}.wav"

    out_dir = os.path.join(OUTPUT_DATA_DIR, unified_class)
    out_path = os.path.join(out_dir, new_fname)

    try:
        sf.write(out_path, segment, sr)
    except Exception as e:
        print(f"Error writing segment for {new_fname}: {e}")

print(f"\n✅ All labeled aGPTset notes have been cropped and saved to '{OUTPUT_DATA_DIR}'.")

### 4. Visualize class distribution

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# --- Configuration ---
# Dataset directory to analyze; each sub-folder is counted as one class.
DATASET_DIR = ""

# --- Step 1: Count Samples in Each Class Folder ---

class_counts = {}

try:
    # Only sub-directories are treated as classes.
    class_folders = [
        entry
        for entry in os.listdir(DATASET_DIR)
        if os.path.isdir(os.path.join(DATASET_DIR, entry))
    ]

    print(f"Analyzing class distribution in: '{DATASET_DIR}'\n")

    for class_folder in tqdm(class_folders, desc="Scanning folders"):
        folder_entries = os.listdir(os.path.join(DATASET_DIR, class_folder))
        # Count the .wav files only.
        class_counts[class_folder] = sum(1 for name in folder_entries if name.endswith('.wav'))

except FileNotFoundError:
    print(f"‼️ ERROR: Directory not found at '{DATASET_DIR}'. Please check the path.")
    class_counts = {}

# --- Step 2: Print a Summary Table ---

if not class_counts:
    print("\nNo classes found or directory is empty. Cannot generate visualization.")
else:
    # A DataFrame makes sorting and printing the table straightforward.
    df_counts = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
    df_counts = df_counts.sort_values('Count', ascending=False).reset_index(drop=True)

    print("--- Class Distribution Summary ---")
    print(df_counts)
    print("-" * 30)
    print(f"Total Samples: {df_counts['Count'].sum()}")

    # --- Step 3: Generate the Bar Chart Visualization ---

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 7))

    # Bars follow the sorted order of the DataFrame.
    sns.barplot(x='Class', y='Count', data=df_counts, ax=ax, palette='viridis')

    # Write the exact count just above each bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_height)}',
            (bar_center, bar_height),
            ha='center',
            va='center',
            xytext=(0, 9),
            textcoords='offset points',
        )

    # Titles, axis labels and layout
    dataset_name = os.path.basename(DATASET_DIR)
    ax.set_title(f'Class Distribution for {dataset_name}', fontsize=18, weight='bold')
    ax.set_xlabel('Technique Class', fontsize=12)
    ax.set_ylabel('Number of Samples', fontsize=12)
    plt.xticks(rotation=45, ha='right')  # slanted labels avoid overlap
    plt.tight_layout()

    plt.show()


### 5. Train-test split for IDMT

In [None]:
import os
import shutil
import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# --- Configuration ---

# 1. Path to your source dataset
DATA_ROOT = ""

# 2. Path for the new directory where the splits will be saved
OUTPUT_DIR = ""

# --- Step 1: Discover Classes and Perform Logical Split ---

# Needed for the per-class test-set size below; imported here because this
# cell's import list does not include it.
import math

print("--- Step 1: Performing logical 80/20 split for each class ---")

# Class names are the sub-folder names found directly under DATA_ROOT.
try:
    idmt_classes = [d for d in os.listdir(DATA_ROOT) if os.path.isdir(os.path.join(DATA_ROOT, d))]
    print(f"Found {len(idmt_classes)} classes in '{DATA_ROOT}': {idmt_classes}")
except FileNotFoundError:
    print(f"‼️ ERROR: Source directory not found at '{DATA_ROOT}'. Please check the path.")
    idmt_classes = []

# Files are sorted so the split is reproducible for the fixed random_state.
segments_by_class = {lab: sorted(glob.glob(f"{DATA_ROOT}/{lab}/*.wav")) for lab in idmt_classes}

train_files = []
test_files = []
test_fraction = 0.8  # fraction of every class that goes to the TEST set

for lab, files in segments_by_class.items():
    if len(files) > 1:
        # Passing the float fraction directly makes train_test_split raise
        # ValueError for very small classes (with 0.8: 2-4 files), because the
        # rounded-up test size leaves no training sample. Capping the test size
        # at len(files) - 1 keeps at least one training sample and yields the
        # same split as the float in every case where the float worked.
        n_test = min(len(files) - 1, math.ceil(test_fraction * len(files)))
        train, test = train_test_split(files, test_size=n_test, random_state=42)
        train_files.extend(train)
        test_files.extend(test)
    elif len(files) == 1:
        train_files.extend(files)
        print(f"Warning: Class '{lab}' has only one sample. Adding it to the training set.")
    else:
        print(f"Warning: Class '{lab}' has no samples. Skipping.")

# Total number of files assigned to either split
total_files_found = len(train_files) + len(test_files)

# Percentages are derived from test_fraction so the messages cannot drift
# from the configured value.
train_pct = round((1 - test_fraction) * 100)
test_pct = round(test_fraction * 100)

print(f"\nLogical split complete:")
print(f" - Total files processed: {total_files_found}")
print(f" - {len(train_files)} files designated for training ({train_pct}%).")
print(f" - {len(test_files)} files designated for testing ({test_pct}%).")


# --- Step 2: Physically Copy Files into New Train/Test Folders ---

print(f"\n--- Step 2: Copying files into new directory structure at '{OUTPUT_DIR}' ---")

def copy_files_to_split_dir(file_list, split_name, base_output_dir):
    """
    Copy every file into `<base_output_dir>/<split_name>/<class>/`.

    The class name is taken from the name of the directory that directly
    contains each source file. Destination folders are created on demand.
    """
    progress = tqdm(file_list, desc=f"Copying {split_name} files")
    for source_path in progress:
        parent_dir, _ = os.path.split(source_path)
        target_dir = os.path.join(base_output_dir, split_name, os.path.basename(parent_dir))
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy2(source_path, target_dir)

if total_files_found <= 0:
    print("\nNo files were processed. Cannot create split directories.")
else:
    # Remove any previous output so the new split starts from an empty tree.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)

    for split_files, split_label in ((train_files, 'train'), (test_files, 'test')):
        copy_files_to_split_dir(split_files, split_label, OUTPUT_DIR)

    print(f"\n✅ Physical split complete! Your data is now organized in '{OUTPUT_DIR}'.")


### 6. Combine the datasets (train: AGPT, MAGCIL, IDMT-train; test: IDMT-test)

In [None]:
import os
import shutil
from tqdm import tqdm

# --- Configuration: Define all source and destination paths ---

# 1. Source Directories
# MAGCIL_DIR and AGPT_DIR are merged into the training set as-is.
# IDMT_SPLIT_DIR is expected to contain 'train' and 'test' sub-folders; they are
# merged into the combined training and test sets respectively.
MAGCIL_DIR = ""
AGPT_DIR = ""
IDMT_SPLIT_DIR = ""

# 2. Final Output Directory
# WARNING: if this directory already exists it is deleted before merging.
COMBINED_OUTPUT_DIR = ""

# --- Main Merging Script ---

# A helper function to make the copying process clean and reusable
def merge_dataset(source_dir, dest_dir, dataset_name="", skip_classes=('staccato',)):
    """
    Copy the .wav files of every class folder from a source into a destination.

    Args:
        source_dir: Directory whose immediate sub-folders are class folders.
        dest_dir: Directory that receives one sub-folder per copied class.
        dataset_name: Label shown in the progress bar description.
        skip_classes: Class folder names that are not copied. Defaults to
            ('staccato',), the class dropped from the combined dataset.

    Returns:
        The number of copy operations performed. Returns 0 when `source_dir`
        is missing or is not a directory. A file whose name already exists in
        the destination class folder is overwritten and still counted.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(source_dir):
        print(f"⚠️ Warning: Source directory not found, skipping: {source_dir}")
        return 0

    class_folders = [d for d in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, d))]
    skipped = set(skip_classes or ())

    file_copy_count = 0
    for class_folder in tqdm(class_folders, desc=f"Merging {dataset_name} - {os.path.basename(source_dir)}"):
        if class_folder in skipped:
            print(f"Skipping class: {class_folder}")
            continue

        source_class_path = os.path.join(source_dir, class_folder)
        dest_class_path = os.path.join(dest_dir, class_folder)

        # Create the destination class folder if it doesn't exist
        os.makedirs(dest_class_path, exist_ok=True)

        # Copy all .wav files
        for filename in os.listdir(source_class_path):
            if filename.endswith('.wav'):
                shutil.copy2(os.path.join(source_class_path, filename), dest_class_path)
                file_copy_count += 1
    return file_copy_count


print(f"--- Starting dataset merge into '{COMBINED_OUTPUT_DIR}' ---")

# Wipe any previous output so the merge starts from an empty directory.
if os.path.exists(COMBINED_OUTPUT_DIR):
    print(f"Removing existing directory: {COMBINED_OUTPUT_DIR}")
    shutil.rmtree(COMBINED_OUTPUT_DIR)

# Top-level 'train' and 'test' folders of the combined dataset
train_dir, test_dir = (os.path.join(COMBINED_OUTPUT_DIR, split) for split in ('train', 'test'))
for split_root in (train_dir, test_dir):
    os.makedirs(split_root)

# Both halves of the IDMT split live under IDMT_SPLIT_DIR.
idmt_train_dir = os.path.join(IDMT_SPLIT_DIR, 'train')
idmt_test_dir = os.path.join(IDMT_SPLIT_DIR, 'test')

# --- Step 1: Combine all training data ---
print("\n--- Merging TRAINING sets ---")
training_sources = (
    (MAGCIL_DIR, "MAGCIL"),
    (AGPT_DIR, "aGPTset"),
    (idmt_train_dir, "IDMT-Train"),
)
for source_path, source_label in training_sources:
    merge_dataset(source_path, train_dir, source_label)

# --- Step 2: Add the test data ---
print("\n--- Merging TESTING set ---")
merge_dataset(idmt_test_dir, test_dir, "IDMT-Test")

# --- Step 3: Final Verification and Summary ---
def count_files(directory):
    """Return how many files ending in '.wav' exist anywhere below `directory`."""
    return sum(
        1
        for _root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.wav')
    )

print("\n--- Merge Complete! ---")

# Recount from disk rather than trusting the per-merge return values.
total_train_files, total_test_files = count_files(train_dir), count_files(test_dir)

for summary_line in (
    f"Total training files: {total_train_files}",
    f"Total testing files:  {total_test_files}",
    f"✅ Your final combined dataset is ready at: '{COMBINED_OUTPUT_DIR}'",
):
    print(summary_line)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# --- Configuration ---
# Point this to the root of your final combined dataset.
# The 'train' and 'test' sub-folders below are the two directories analyzed.
COMBINED_DATASET_ROOT = '/data/hjpark/combined_dataset_final'
TRAIN_DIR = os.path.join(COMBINED_DATASET_ROOT, 'train')
TEST_DIR = os.path.join(COMBINED_DATASET_ROOT, 'test')

def analyze_and_visualize(dataset_dir, plot_title, output_filename):
    """
    Count the .wav files per class folder, print a summary, and save a bar chart.

    Args:
        dataset_dir: Directory whose immediate sub-folders are class folders.
        plot_title: Title used for both the printed summary and the chart.
        output_filename: Path the chart image is written to.

    Returns:
        None. The function returns early, without writing a chart, when the
        directory is missing, has no class sub-folders, or holds no .wav files.
    """
    print(f"\n--- Analyzing Class Distribution in: '{dataset_dir}' ---")

    class_counts = {}
    try:
        class_folders = sorted([d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))])
        if not class_folders:
            print("‼️ ERROR: No class subdirectories found.")
            return

        for class_folder in class_folders:
            class_path = os.path.join(dataset_dir, class_folder)
            num_files = len([f for f in os.listdir(class_path) if f.endswith('.wav')])
            class_counts[class_folder] = num_files

    except FileNotFoundError:
        print(f"‼️ ERROR: Directory not found at '{dataset_dir}'. Please run the merge script first.")
        return

    # Every class folder gets an entry (possibly 0), so the dict itself is never
    # empty at this point; the counts have to be inspected instead.
    if not any(class_counts.values()):
        print("No .wav files found to analyze.")
        return

    df_counts = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
    df_counts = df_counts.sort_values('Count', ascending=False).reset_index(drop=True)

    print(f"\n--- {plot_title} ---")
    print(df_counts.to_string())
    print("-" * 35)
    print(f"Total Samples: {df_counts['Count'].sum()}")
    print("-" * 35)

    # --- Generate the Bar Chart ---
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 8))

    sns.barplot(x='Class', y='Count', data=df_counts, ax=ax, palette='viridis')

    # Write the exact count just above each bar.
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', xytext=(0, 9), textcoords='offset points')

    ax.set_title(plot_title, fontsize=18, weight='bold')
    ax.set_xlabel('Unified Technique Class', fontsize=12)
    ax.set_ylabel('Number of Samples', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    plt.savefig(output_filename)
    print(f"✅ Chart saved to '{output_filename}'")
    plt.close()  # Close the figure to free up memory

if __name__ == '__main__':
    # One chart per split: training set first, then the test set.
    for split_path, chart_title, chart_file in (
        (TRAIN_DIR, 'Final Training Set Distribution', 'combined_train_distribution.png'),
        (TEST_DIR, 'Final Testing Set Distribution', 'combined_test_distribution.png'),
    ):
        analyze_and_visualize(split_path, chart_title, chart_file)


## C) Train on the Unified dataset

### Let's fine-tune the model by training on dataset 1 of IDMT

In [None]:
import os
import glob
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import classification_report
from tqdm import tqdm
import librosa

# ==============================================================================
# 1. Feature Extraction and Model Definition
# ==============================================================================

def extract_features_from_audio(file_path):
    """
    Build one feature vector for a single audio file.

    The vector concatenates, in this order, the per-coefficient mean and
    standard deviation over time of: 40 MFCCs, a 40-band mel spectrogram and a
    10-bin chroma STFT (2 * (40 + 40 + 10) = 180 values).

    Returns:
        A 1-D NumPy array, or None if loading or feature extraction raises any
        exception (failures are deliberately silent; callers skip None).
    """
    try:
        signal, rate = librosa.load(file_path, mono=True)
        frame_features = (
            librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40),
            librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=40),
            librosa.feature.chroma_stft(y=signal, sr=rate, n_chroma=10),
        )
        # For every feature matrix: mean first, then std, both along the time axis.
        stats = []
        for matrix in frame_features:
            stats.append(np.mean(matrix, axis=1))
            stats.append(np.std(matrix, axis=1))
        return np.concatenate(stats)
    except Exception:
        # To debug a failing file, catch the exception as `e` and print it here.
        return None

def create_model_A(input_shape=(180,), num_classes=10, dropout_rate=0.4, l2_lambda=1e-4):
    """
    Build and compile the fully connected classifier.

    Architecture: an input layer, then four identical blocks of
    Dense(800, ReLU, L2-regularized kernel) -> BatchNormalization -> Dropout,
    followed by a softmax output layer with `num_classes` units.

    Args:
        input_shape: Shape of one input feature vector.
        num_classes: Number of output classes.
        dropout_rate: Dropout rate applied after every hidden block.
        l2_lambda: L2 penalty factor for the hidden Dense kernels.

    Returns:
        The compiled Keras model (Adam optimizer, categorical cross-entropy
        loss, accuracy metric).
        NOTE(review): this loss presumably requires one-hot encoded labels —
        confirm the labels are converted before fitting.
    """
    hidden_blocks = 4
    hidden_units = 800

    model = Sequential(name="Final_Combined_Model")
    model.add(InputLayer(input_shape=input_shape))

    for _block in range(hidden_blocks):
        for layer in (
            Dense(hidden_units, activation='relu', kernel_regularizer=l2(l2_lambda)),
            BatchNormalization(),
            Dropout(dropout_rate),
        ):
            model.add(layer)

    # The classification head is added once, after all hidden blocks.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# ==============================================================================
# 2. Data Loading from the Combined Train/Test Directories
# ==============================================================================

def load_data_from_final_dir(directory, label_map):
    """
    Extract features for every .wav file found in the class folders of `directory`.

    Args:
        directory: Split directory containing one sub-folder per class.
        label_map: Mapping of class folder name -> integer class index.

    Returns:
        A tuple (X, y) of NumPy arrays: the feature vectors and the matching
        class indices. Class folders that do not exist are reported and
        skipped; files whose feature extraction returns None are dropped.
    """
    split_name = os.path.basename(directory)
    feature_rows, class_indices = [], []

    for class_name, class_idx in label_map.items():
        class_path = os.path.join(directory, class_name)
        if os.path.exists(class_path):
            pattern = os.path.join(class_path, '*.wav')
            for file_path in tqdm(glob.glob(pattern), desc=f"Loading {class_name} ({split_name})"):
                features = extract_features_from_audio(file_path)
                if features is None:
                    continue
                feature_rows.append(features)
                class_indices.append(class_idx)
        else:
            print(f"Warning: Class folder not found in {split_name} set: {class_name}")

    return np.array(feature_rows), np.array(class_indices)



In [None]:
# --- Configuration ---
TRAIN_DIR = '/data/hjpark/combined_dataset_augmented/train'
TEST_DIR = '/data/hjpark/combined_dataset_final/test'
MODEL_OUTPUT_DIR = 'models_final'
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# --- Load Data ---
# Class labels are the sorted sub-folder names of the training directory; each
# label's index in that sorted list becomes its integer class id.
try:
    all_labels = sorted(
        entry
        for entry in os.listdir(TRAIN_DIR)
        if os.path.isdir(os.path.join(TRAIN_DIR, entry))
    )
except FileNotFoundError:
    print(f"‼️ ERROR: Training directory not found at '{TRAIN_DIR}'. Cannot proceed.")
    # An empty list signals that nothing could be loaded.
    all_labels = []
else:
    label_map = dict(zip(all_labels, range(len(all_labels))))
    num_classes = len(all_labels)
    print(f"Discovered {num_classes} classes: {all_labels}")


In [None]:
# Set to True to cache the extracted feature arrays as .npy files in the
# current working directory. Saving is off by default; previously the
# "saved" messages were printed even though nothing was written.
SAVE_PROCESSED_DATA = False

if all_labels:
    # Feature extraction runs over every file of both splits.
    X_train, y_train = load_data_from_final_dir(TRAIN_DIR, label_map)
    X_test, y_test = load_data_from_final_dir(TEST_DIR, label_map)

    if SAVE_PROCESSED_DATA:
        import numpy as np

        print("Saving processed data to disk...")
        np.save('X_train_final.npy', X_train)
        np.save('y_train_final.npy', y_train)
        np.save('X_test_final.npy', X_test)
        np.save('y_test_final.npy', y_test)
        print("Data saved successfully.")
    else:
        print("Processed data was NOT saved to disk (SAVE_PROCESSED_DATA is False).")

#### Side quest: Data augmentation

In [None]:
import numpy as np
import librosa
import soundfile as sf
import os
from tqdm import tqdm
# You may need to install this library: pip install audiomentations
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift

# --- Configuration ---
# Root of the final, combined training data (read only)
SOURCE_TRAIN_DIR = '/data/hjpark/combined_dataset_final/train'

# Destination that receives the originals plus the augmented copies
AUGMENTED_TRAIN_DIR = '/data/hjpark/combined_dataset_augmented/train'

# Desired number of SAMPLES per class folder after augmentation. A class whose
# folder already holds at least this many files is left untouched, so a target
# of 0 means "never augment".
AUGMENTATION_TARGETS = {
    **dict.fromkeys(
        (
            "alternate_picking",
            "bend",
            "sweep_picking",
            "slide",
            "vibrato",
            "harmonics",
            "legato",
        ),
        2000,
    ),
    # Larger classes get higher, but still capped, targets.
    "other": 10000,
    "palm_mute": 8000,
    # 'picking' is already large, so it is not augmented.
    "picking": 0,
}

# Augmentation chain: each effect is applied independently with probability 0.5.
augmenter = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
])

def augment_class_data():
    """
    Copy the training set and top up under-represented classes with augmented audio.

    SOURCE_TRAIN_DIR is copied into AUGMENTED_TRAIN_DIR. Then, for every class
    in AUGMENTATION_TARGETS, augmented variants of randomly chosen original
    files are written into the class folder until it holds the target number
    of .wav files.

    The function can be re-run on an existing AUGMENTED_TRAIN_DIR: files that
    already exist count towards the target, previously generated 'aug_' files
    are never used as augmentation sources, and new files continue the
    existing numbering instead of overwriting earlier ones.

    Returns:
        None. Progress and warnings are printed.
    """
    # Imported locally because this cell's import list does not include shutil.
    import shutil

    if not os.path.exists(SOURCE_TRAIN_DIR):
        print(f"ERROR: Source directory not found at {SOURCE_TRAIN_DIR}")
        return

    # First, copy all original files to the new directory.
    # dirs_exist_ok avoids FileExistsError when the destination already exists.
    print(f"Copying original training data to {AUGMENTED_TRAIN_DIR}...")
    shutil.copytree(SOURCE_TRAIN_DIR, AUGMENTED_TRAIN_DIR, dirs_exist_ok=True)

    print("\n--- Starting Augmentation ---")
    for class_name, target_count in AUGMENTATION_TARGETS.items():
        class_dir = os.path.join(AUGMENTED_TRAIN_DIR, class_name)
        if not os.path.exists(class_dir):
            print(f"Warning: Class folder '{class_name}' not found. Skipping.")
            continue

        existing_files = [f for f in os.listdir(class_dir) if f.endswith('.wav')]
        # Only un-augmented files are used as sources so effects never stack.
        original_files = [f for f in existing_files if not f.startswith('aug_')]
        num_to_create = target_count - len(existing_files)

        if num_to_create <= 0:
            print(f"'{class_name}' already has enough samples. Skipping.")
            continue

        if not original_files:
            # np.random.choice raises ValueError on an empty list.
            print(f"Warning: No original .wav files in '{class_name}' to augment. Skipping.")
            continue

        print(f"Augmenting '{class_name}': Creating {num_to_create} new samples...")

        # Continue numbering after any augmented files from a previous run.
        first_index = len(existing_files) - len(original_files)
        for i in tqdm(range(first_index, first_index + num_to_create)):
            # Pick a random original file to augment
            random_file = np.random.choice(original_files)
            file_path = os.path.join(class_dir, random_file)

            # sr=None keeps the file's native sample rate.
            y, sr = librosa.load(file_path, sr=None)

            # Apply the randomized effect chain defined by `augmenter`
            augmented_audio = augmenter(samples=y, sample_rate=sr)

            # Save the new file
            new_filename = f"aug_{i}_{random_file}"
            output_path = os.path.join(class_dir, new_filename)
            sf.write(output_path, augmented_audio, sr)

    print("\n✅ Augmentation complete!")

# Run the augmentation when this cell/script is executed directly.
if __name__ == '__main__':
    augment_class_data()

In [None]:
# Quickly check the class distribution after data augmentation step
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# --- Configuration ---
# Point this to the training set directory that contains the augmented data.
# Each sub-folder of this directory is counted as one class.
AUGMENTED_DATASET_DIR = '/data/hjpark/combined_dataset_augmented/train'

def visualize_augmented_distribution():
    """
    Summarize and plot the per-class sample counts of the augmented training set.

    Every immediate subdirectory of AUGMENTED_DATASET_DIR is treated as one
    class. Its sample count is the number of entries directly inside it whose
    name ends with '.wav' (case-sensitive, non-recursive). The counts are
    printed as a table sorted by descending count and drawn as an annotated
    bar chart, which is saved as 'augmented_train_distribution.png' in the
    current working directory.

    Returns:
        None. A message is printed and the function returns early when the
        dataset directory is missing (or is not a directory), when it has no
        class subdirectories, or when no '.wav' file is found in any class.
    """
    print(f"--- Analyzing Class Distribution in: '{AUGMENTED_DATASET_DIR}' ---")

    # --- Step 1: Count Samples in Each Class Folder ---
    class_counts = {}
    try:
        class_folders = [
            d for d in os.listdir(AUGMENTED_DATASET_DIR)
            if os.path.isdir(os.path.join(AUGMENTED_DATASET_DIR, d))
        ]
        if not class_folders:
            print(f"‼️ ERROR: No class subdirectories found in '{AUGMENTED_DATASET_DIR}'.")
            return

        print(f"Found {len(class_folders)} class folders. Counting samples...")
        for class_folder in tqdm(class_folders, desc="Scanning folders"):
            class_path = os.path.join(AUGMENTED_DATASET_DIR, class_folder)
            class_counts[class_folder] = sum(
                1 for f in os.listdir(class_path) if f.endswith('.wav')
            )

    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers the case where the configured path exists
        # but points at a regular file instead of a directory.
        print(f"‼️ ERROR: Directory not found at '{AUGMENTED_DATASET_DIR}'. Please run the augmentation script first.")
        return

    # --- Step 2: Print a Summary Table ---
    # Every class folder receives an entry above (possibly 0), so the dict is
    # never empty at this point. Check the total instead, otherwise the
    # "nothing to analyze" case is unreachable and an all-zero chart is drawn.
    if sum(class_counts.values()) == 0:
        print("No .wav files found to analyze.")
        return

    df_counts = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
    df_counts = df_counts.sort_values('Count', ascending=False).reset_index(drop=True)

    print("\n--- Augmented Training Set Distribution ---")
    print(df_counts.to_string())
    print("-" * 40)
    print(f"Total Augmented Training Samples: {df_counts['Count'].sum()}")
    print("-" * 40)

    # --- Step 3: Generate the Bar Chart Visualization ---
    print("\nGenerating bar chart...")
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 8))

    sns.barplot(x='Class', y='Count', data=df_counts, ax=ax, palette='viridis')

    # Add the exact count on top of each bar for clarity.
    # The label is anchored at the bar top and shifted 9 points upwards.
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center',
                    xytext=(0, 9),
                    textcoords='offset points')

    ax.set_title('Class Distribution for Augmented Training Set', fontsize=18, weight='bold')
    ax.set_xlabel('Unified Technique Class', fontsize=12)
    ax.set_ylabel('Number of Samples', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save the plot to a file (relative to the current working directory).
    # The figure is intentionally left open so a notebook can still render it.
    output_filename = 'augmented_train_distribution.png'
    fig.savefig(output_filename)
    print(f"\n✅ Chart saved to '{output_filename}'")


# Run the analysis when executed directly (script or notebook cell).
if __name__ == '__main__':
    visualize_augmented_distribution()


### Continue training

In [None]:
%pwd

In [None]:
# if all_labels:
#     import numpy as np

#     print("Loading pre-processed data from disk...")
#     X_train = np.load('X_train_final.npy')
#     y_train = np.load('y_train_final.npy')
#     X_test = np.load('X_test_final.npy')
#     y_test = np.load('y_test_final.npy')
#     print("Data loaded successfully.")

#     print("\n--- Data Loading Summary ---")
#     print(f"Training samples: {len(X_train)}")
#     print(f"Testing samples:  {len(X_test)}")

#     # ==============================================================================
#     # 3. Model Training and Evaluation
#     # ==============================================================================

#     # --- Prepare Labels (One-Hot Encoding) ---
#     y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
#     y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

#     # --- Instantiate the Model ---
#     model = create_model_A(num_classes=num_classes, dropout_rate=0.5, l2_lambda=1e-3)
#     model.summary()

#     # --- Callbacks ---
#     MODEL_SAVE_PATH = os.path.join(MODEL_OUTPUT_DIR, 'best_final_model.h5')
#     checkpoint_callback = ModelCheckpoint(filepath=MODEL_SAVE_PATH, monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)
#     early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, verbose=1, restore_best_weights=True)

#     # --- Train the Model ---
#     print("\n--- Starting Final Model Training ---")
#     history = model.fit(
#         X_train, y_train_cat,
#         batch_size=32,  # <--- REDUCED FROM 64 to 32
#         epochs=100,
#         validation_data=(X_test, y_test_cat),
#         callbacks=[checkpoint_callback, early_stopping_callback]
#     )

# ==============================================================================
# 2. Model Training with Enhancements
# ==============================================================================
# Trains `create_model_A` on the pre-processed arrays with three additions over
# the earlier run: balanced class weights, a lower Adam learning rate (1e-4),
# and a ReduceLROnPlateau schedule. Relies on `all_labels`, `num_classes`,
# `create_model_A`, `MODEL_OUTPUT_DIR` and `os` being defined by earlier cells.
if all_labels:
    import numpy as np
    import tensorflow as tf
    from sklearn.utils import class_weight
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

    print("Loading pre-processed AUGMENTED data from disk...")
    # --- CHANGE: Load the augmented training data ---
    # NOTE(review): these are the same file names the earlier (non-augmented)
    # run loaded; confirm the .npy files were regenerated from the augmented set.
    X_train = np.load('X_train_final.npy')
    y_train = np.load('y_train_final.npy')

    # Test data remains the original
    X_test = np.load('X_test_final.npy')
    y_test = np.load('y_test_final.npy')
    print("Data loaded successfully.")

    print("\n--- Data Loading Summary ---")
    print(f"Augmented training samples: {len(X_train)}")
    print(f"Testing samples:  {len(X_test)}")

    # --- Prepare Labels (One-Hot Encoding) ---
    y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

    # --- NEW: Compute Class Weights ---
    # This will create weights that are inversely proportional to class frequencies.
    train_classes = np.unique(y_train)
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=train_classes,
        y=y_train
    )
    # Key each weight by its actual class label. `compute_class_weight` returns
    # weights in the order of `classes`, so enumerating them (0, 1, 2, ...) is
    # only correct when the labels happen to be exactly 0..k-1; with any gap in
    # the training labels the weights would be assigned to the wrong classes.
    class_weight_dict = {
        int(label): float(weight)
        for label, weight in zip(train_classes, class_weights)
    }
    print("\nComputed Class Weights to counter imbalance.")

    # --- Instantiate the Model ---
    # The dropout and L2 values are already strong, no change needed there.
    model = create_model_A(num_classes=num_classes, dropout_rate=0.5, l2_lambda=1e-3)

    # --- NEW: Compile model with a lower learning rate ---
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    model.summary()

    # --- Callbacks ---
    # Only the weights with the lowest validation loss are kept on disk.
    MODEL_SAVE_PATH = os.path.join(MODEL_OUTPUT_DIR, 'best_final_model_augmented.h5') # New model name
    checkpoint_callback = ModelCheckpoint(filepath=MODEL_SAVE_PATH, monitor='val_loss', mode='min', save_best_only=True, verbose=1)

    # CHANGE: Increased patience for early stopping
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=25, verbose=1, restore_best_weights=True)

    # NEW: Learning Rate Scheduler
    # Multiplies the learning rate by 0.2 after 7 epochs without val_loss
    # improvement, never going below 1e-6.
    lr_scheduler_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, verbose=1, min_lr=1e-6)

    # --- Train the Model ---
    # NOTE(review): the test arrays are used as validation data, so checkpoint
    # selection, early stopping and LR scheduling are all tuned on the test set.
    # Metrics later reported on that same set are optimistically biased;
    # consider carving a validation split out of the training data instead.
    print("\n--- Starting Final Model Training with Enhancements ---")
    history = model.fit(
        X_train, y_train_cat,
        batch_size=32,
        epochs=1000,  # upper bound only; early stopping ends training sooner
        validation_data=(X_test, y_test_cat),
        class_weight=class_weight_dict, # <-- Apply the class weights
        callbacks=[checkpoint_callback, early_stopping_callback, lr_scheduler_callback] # <-- Add the new scheduler
    )

In [None]:
import numpy as np

# --- 1. Setup ---
# Class probabilities from the trained model for the IDMT test data, reduced
# to the index of the most probable class per sample.
y_pred_probabilities = model.predict(X_test)
y_pred_from_model = np.argmax(y_pred_probabilities, axis=1)
# Example of what this may look like: [0, 5, 2, 10, 3, 7, 1, 9]
# (1, 2 and 3 are predicted here although they never occur in the IDMT
# ground truth.)

# Ground-truth integer labels of the IDMT test set,
# e.g. [0, 5, 4, 10, 6, 7, 0, 9]
y_true_idmt = np.load('y_test_final.npy')

# --- 2. Define the remapping rules ---
# Only these class indices exist in the IDMT test set; any other prediction
# is folded into the "other" class.
allowed_idmt_classes = {0, 4, 5, 6, 7, 8, 9, 10}
other_class_index = 10

# --- 3. Apply the remapping ---
# Keep a prediction when its class is allowed, otherwise replace it by "other".
# For the example above the result is: [0, 5, 10, 10, 10, 7, 10, 9]
y_pred_remapped = np.array([
    predicted if predicted in allowed_idmt_classes else other_class_index
    for predicted in y_pred_from_model
])

# --- 4. Evaluate ---
# With out-of-set predictions collapsed into "other", the remapped predictions
# can be compared against the true labels.
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_true_idmt, y_pred_remapped))
print(confusion_matrix(y_true_idmt, y_pred_remapped))

### Test on the test set

In [None]:
%pwd

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
import os
import sys

# --- Configuration ---
# Path to your best-saved model file
# NOTE(review): the training cell saves 'best_final_model_augmented.h5' under
# MODEL_OUTPUT_DIR — confirm that directory resolves to './models_final'.
MODEL_PATH = './models_final/best_final_model_augmented.h5'

# Paths to your pre-processed, original (non-augmented) test data
# (relative to the current working directory)
X_TEST_PATH = 'X_test_final.npy'
Y_TEST_PATH = 'y_test_final.npy'

# Path to the original training directory to get the class names in the correct order
# Make sure this points to the directory that defines the class order for training
# (the evaluation code sorts its subdirectory names alphabetically and maps
# them to integer labels 0, 1, 2, ... in that order).
TRAIN_DIR_FOR_LABELS = '/data/hjpark/combined_dataset_final/train'


def evaluate_final_model():
    """
    Score the saved Keras model against the holdout test arrays and print the results.

    The model is read from MODEL_PATH and the features / integer labels from
    X_TEST_PATH / Y_TEST_PATH. Class names are taken from the alphabetically
    sorted subdirectories of TRAIN_DIR_FOR_LABELS. With names available, the
    classification report and confusion matrix cover every label index
    0 .. len(names) - 1; without them, both are printed with integer labels
    only.

    Returns:
        None. All results are printed.

    Raises:
        SystemExit: with code 1 when the model or the test arrays cannot be
            loaded.
    """
    # --- 1. Load Model and Test Data ---
    print("--- Loading Model and Data ---")
    try:
        print(f"Loading model from: {MODEL_PATH}")
        trained_model = tf.keras.models.load_model(MODEL_PATH)

        print(f"Loading test data from: {X_TEST_PATH} and {Y_TEST_PATH}")
        features = np.load(X_TEST_PATH)
        true_labels = np.load(Y_TEST_PATH)  # ground truth integer labels
    except FileNotFoundError as missing:
        print(f"‼️ ERROR: A required file was not found. {missing}")
        print("Please ensure the model and test .npy files exist at the specified paths.")
        sys.exit(1)
    except Exception as failure:
        print(f"An unexpected error occurred during loading: {failure}")
        sys.exit(1)

    print("✅ Model and data loaded successfully.\n")

    # --- 2. Discover Class Labels for the Report ---
    try:
        # Alphabetical order of the folder names is what ties a name to its
        # integer label.
        class_names = sorted(
            entry
            for entry in os.listdir(TRAIN_DIR_FOR_LABELS)
            if os.path.isdir(os.path.join(TRAIN_DIR_FOR_LABELS, entry))
        )
        print(f"Found {len(class_names)} classes for reporting: {class_names}\n")
    except FileNotFoundError:
        print(f"‼️ WARNING: Could not find training directory at '{TRAIN_DIR_FOR_LABELS}' to get class names.")
        print("The classification report will only show integer labels.")
        class_names = None

    # --- 3. Generate Predictions ---
    print("--- Generating Predictions on the Test Set ---")
    # predict() yields per-class probabilities; argmax picks the winning class.
    class_probabilities = trained_model.predict(features)
    predicted_labels = np.argmax(class_probabilities, axis=1)
    print("✅ Predictions generated.\n")

    # --- 4. Display Evaluation Metrics ---
    print("--- Final Model Evaluation Report ---")

    if not class_names:
        # No usable class names: report on whatever integer labels occur.
        print("\nClassification Report (integer labels only):")
        print(classification_report(true_labels, predicted_labels, zero_division=0))
        print("\nConfusion Matrix:")
        print(confusion_matrix(true_labels, predicted_labels))
        return

    # Passing the complete label range keeps the report and the matrix at full
    # size even for classes that never occur in the test labels.
    label_ids = list(range(len(class_names)))

    print("\nClassification Report:")
    print(classification_report(
        true_labels,
        predicted_labels,
        labels=label_ids,
        target_names=class_names,
        zero_division=0,
    ))

    print("\nConfusion Matrix:")
    matrix = confusion_matrix(true_labels, predicted_labels, labels=label_ids)
    print("Matrix labels:", class_names)
    print(matrix)
    print("\n(Rows: True Labels, Columns: Predicted Labels)")


if __name__ == '__main__':
    evaluate_final_model()


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration ---
# Path to your best-saved model file
MODEL_PATH = './models_final/best_final_model_augmented.h5'

# Paths to your pre-processed, original (non-augmented) test data
# (relative to the current working directory)
X_TEST_PATH = 'X_test_final.npy'
Y_TEST_PATH = 'y_test_final.npy'

# Path to the original training directory to get the class names in the correct order
# Make sure this points to the directory that defines the class order for training
TRAIN_DIR_FOR_LABELS = '/data/hjpark/combined_dataset_final/train'

# Output paths for visualizations
# The directory is created as soon as this cell runs (no error if it exists).
OUTPUT_DIR = './visualizations'
os.makedirs(OUTPUT_DIR, exist_ok=True)
CM_HEATMAP_PATH = os.path.join(OUTPUT_DIR, 'confusion_matrix.png')
# NOTE(review): METRICS_BAR_PATH is not used by the evaluate_final_model
# defined in this cell — presumably reserved for a metrics bar plot; confirm.
METRICS_BAR_PATH = os.path.join(OUTPUT_DIR, 'metrics_barplot.png')

def evaluate_final_model():
    """
    Evaluate the saved Keras model on the holdout test set, restricted to the
    classes that actually occur in the test labels, and save a heatmap.

    Steps:
      1. Load the model from MODEL_PATH and the arrays from X_TEST_PATH /
         Y_TEST_PATH.
      2. Derive class names from the alphabetically sorted subdirectories of
         TRAIN_DIR_FOR_LABELS (integer labels are used if it is missing).
      3. Predict the most probable class for every test sample.
      4. Print a classification report and a confusion matrix limited to the
         labels present in the test set.
      5. Save the confusion matrix as a heatmap to CM_HEATMAP_PATH.

    Returns:
        None. Results are printed and the heatmap is written to disk.

    Raises:
        SystemExit: with code 1 when the model or the test arrays cannot be
            loaded.
    """
    # --- 1. Load Model and Test Data ---
    print("--- Loading Model and Data ---")
    try:
        print(f"Loading model from: {MODEL_PATH}")
        model = tf.keras.models.load_model(MODEL_PATH)

        print(f"Loading test data from: {X_TEST_PATH} and {Y_TEST_PATH}")
        X_test = np.load(X_TEST_PATH)
        y_test = np.load(Y_TEST_PATH)  # These are the ground truth integer labels

    except FileNotFoundError as e:
        print(f"‼️ ERROR: A required file was not found. {e}")
        print("Please ensure the model and test .npy files exist at the specified paths.")
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        sys.exit(1)

    print("✅ Model and data loaded successfully.\n")

    # --- 2. Discover Class Labels for the Report ---
    try:
        # Sort the folder names alphabetically to ensure the order matches the integer labels
        target_names = sorted([d for d in os.listdir(TRAIN_DIR_FOR_LABELS) if os.path.isdir(os.path.join(TRAIN_DIR_FOR_LABELS, d))])
        print(f"Found {len(target_names)} classes from training directory: {target_names}\n")
    except FileNotFoundError:
        print(f"‼️ WARNING: Could not find training directory at '{TRAIN_DIR_FOR_LABELS}' to get class names.")
        print("The classification report will only show integer labels.")
        target_names = None

    # --- 3. Generate Predictions ---
    print("--- Generating Predictions on the Test Set ---")
    # model.predict() returns class probabilities
    y_pred_probabilities = model.predict(X_test)

    # Use np.argmax to get the index of the class with the highest probability
    y_pred = np.argmax(y_pred_probabilities, axis=1)
    print("✅ Predictions generated.\n")

    # --- 4. Filter to Only Classes Present in Test Set ---
    # Get unique classes that actually appear in y_test (support > 0)
    unique_test_classes = np.unique(y_test)
    print(f"Classes present in test set: {unique_test_classes}")

    if target_names:
        # Keep only the names of classes that occur in the test labels. A label
        # outside the discovered names gets a generic placeholder instead of
        # raising IndexError (or silently wrapping around for negative values).
        filtered_target_names = [
            target_names[i] if 0 <= i < len(target_names) else f"class_{i}"
            for i in unique_test_classes
        ]
        display_names = filtered_target_names
    else:
        # No names available: the report uses integer labels, and the matrix /
        # heatmap are labelled with the integers rendered as text.
        filtered_target_names = None
        display_names = [str(i) for i in unique_test_classes]

    # --- 5. Compute Filtered Classification Report and Confusion Matrix ---
    print("--- Final Model Evaluation Report (Filtered to Test Set Classes) ---")

    # Classification Report (filtered by providing only unique_test_classes as labels)
    report = classification_report(y_test, y_pred, labels=unique_test_classes, target_names=filtered_target_names, zero_division=0)
    print("\nClassification Report:")
    print(report)

    # Confusion Matrix restricted to the classes present in the test set
    cm = confusion_matrix(y_test, y_pred, labels=unique_test_classes)
    print("\nConfusion Matrix:")
    print("Matrix labels:", display_names)
    print(cm)
    print("\n(Rows: True Labels, Columns: Predicted Labels)")

    # Samples predicted as a class that never occurs in the test labels have no
    # column in the restricted matrix and are left out of it entirely. Say so
    # explicitly, otherwise the row sums silently fall short of the support.
    present_labels = set(unique_test_classes.tolist())
    dropped = sum(1 for label in y_pred.tolist() if label not in present_labels)
    if dropped:
        print(f"NOTE: {dropped} prediction(s) fall on classes absent from the test set and are not shown in the confusion matrix.")

    # --- 6. Visualizations ---

    # Abbreviate long class names for a more compact plot; any name without an
    # entry here is used unchanged.
    abbreviated_names = {
        'alternate_picking': 'alt_pick',
        'sweep_picking': 'swp_pick',
    }
    # Tick labels must always be a concrete list: passing None to the heatmap
    # (the former no-names fallback) is not a supported tick-label value.
    plot_labels = [abbreviated_names.get(name, name) for name in display_names]

    # Compact figure size
    plt.figure(figsize=(8, 7))

    # annot_kws controls the font size of the numbers inside the heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=plot_labels, yticklabels=plot_labels,
                cbar=True, annot_kws={"size": 12})

    # Larger fonts for readability
    plt.xlabel('Predicted Labels', fontsize=14)
    plt.ylabel('True Labels', fontsize=14)
    plt.title('Confusion Matrix', fontsize=16)

    # tight_layout reduces surrounding whitespace before saving
    plt.tight_layout()
    plt.savefig(CM_HEATMAP_PATH)
    print(f"✅ Confusion matrix heatmap saved to: {CM_HEATMAP_PATH}")


if __name__ == '__main__':
    evaluate_final_model()


## Future plan: use transformers (not in scope for this submission)