# Week 1: Audio Classification with the IRMAS Dataset - TinyImageNet Workflow for Audio Spectrogram Classification
## Tools Setup

In [1]:
import pathlib
import os
import sys
import operator
import re
import datetime
from functools import reduce

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D, ReLU, Softmax
from tensorflow.keras.callbacks import TensorBoard

# Enable or disable GPU
# When ENABLE_GPU is False, CUDA_VISIBLE_DEVICES is set to '-1' for this process
# so that no CUDA device is exposed to it.
# NOTE(review): this runs after `import tensorflow` above; it presumably still
# takes effect because it precedes the first device query below — confirm, or
# move the toggle ahead of the TensorFlow import.
ENABLE_GPU = True
if not ENABLE_GPU:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Report the TensorFlow version and the list of GPUs TensorFlow can see
# (an empty list means no GPU is visible to this kernel).
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.20.0
GPU Available: []


## Section One: Dataset Acquisition & Exploration

In [2]:
"""
IRMAS Dataset Structure:
- Downloaded from: https://www.upf.edu/web/mtg/irmas
- Contains audio files of 11 instrument classes
- Training: 6705 files (3 second excerpts)
- Testing: 2874 files (variable length)

For this project, we'll subset to 5-10 instrument classes.
"""

# TODO: Download IRMAS dataset and update this path
# Root folder of the local IRMAS copy; "~" is expanded to the user's home directory.
IRMAS_PATH = os.path.expanduser("~/irmas_dataset")
# Training and testing sub-folders, both resolved relative to IRMAS_PATH.
TRAIN_PATH = os.path.join(IRMAS_PATH, "IRMAS-TrainingData")
TEST_PATH = os.path.join(IRMAS_PATH, "IRMAS-TestingData")

# Define instrument classes (subset to keep manageable)
# Full IRMAS classes: cel, cla, flu, gac, gel, org, pia, sax, tru, vio, voi
# The list below keeps every class from the line above except 'voi'.
# Order matters: create_audio_dataset uses a class's position in the list it is
# given as that class's integer label.
INSTRUMENT_CLASSES = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio']
# Number of selected classes; used as the default output width of the classifier.
NUM_CLASSES = len(INSTRUMENT_CLASSES)

# Print the index -> class-code mapping for reference.
print(f"\nSelected Instrument Classes ({NUM_CLASSES}):")
for i, inst in enumerate(INSTRUMENT_CLASSES):
    print(f"  {i}: {inst}")


Selected Instrument Classes (10):
  0: cel
  1: cla
  2: flu
  3: gac
  4: gel
  5: org
  6: pia
  7: sax
  8: tru
  9: vio


## Dataset Statistics Helper Functions

In [3]:
def explore_irmas_dataset(data_path, instrument_classes):
    """
    Explore IRMAS dataset structure and collect per-instrument statistics.

    Args:
        data_path: Directory that contains one sub-folder per instrument class.
        instrument_classes: Iterable of sub-folder names to inspect.

    Returns:
        pandas.DataFrame with one row per instrument folder that exists and the
        columns 'instrument', 'num_files', 'avg_duration' (seconds),
        'sample_rate' (most common native rate among the sampled files) and
        'total_samples'. Duration and sample rate are estimated from at most
        the first 10 '.wav' files in sorted name order, and are 0 when no file
        could be read. Missing folders are reported with a warning and skipped.
    """
    stats = {
        'instrument': [],
        'num_files': [],
        'avg_duration': [],
        'sample_rate': [],
        'total_samples': []
    }

    for inst in instrument_classes:
        inst_path = os.path.join(data_path, inst)
        # isdir (not exists): a stray regular file with the class name would
        # otherwise make os.listdir raise.
        if not os.path.isdir(inst_path):
            print(f"Warning: {inst_path} not found!")
            continue

        # Sorted so the sampled subset below is the same on every run and
        # every filesystem (os.listdir order is arbitrary).
        audio_files = sorted(f for f in os.listdir(inst_path) if f.endswith('.wav'))
        num_files = len(audio_files)

        # Sample a few files to get duration statistics
        durations = []
        sample_rates = []
        for audio_file in audio_files[:10]:  # Sample first 10 files
            file_path = os.path.join(inst_path, audio_file)
            try:
                # sr=None keeps the file's native sample rate
                y, sr = librosa.load(file_path, sr=None)
                durations.append(len(y) / sr)
                sample_rates.append(sr)
            except Exception as e:
                # Unreadable files are skipped, but reported instead of being
                # silently ignored; KeyboardInterrupt is no longer swallowed.
                print(f"Warning: could not read {file_path}: {e}")
                continue

        avg_duration = np.mean(durations) if durations else 0
        # Mode of the observed sample rates
        common_sr = max(set(sample_rates), key=sample_rates.count) if sample_rates else 0

        stats['instrument'].append(inst)
        stats['num_files'].append(num_files)
        stats['avg_duration'].append(avg_duration)
        stats['sample_rate'].append(common_sr)
        # NOTE(review): 'total_samples' mirrors 'num_files' (one sample per
        # file); kept as-is so existing consumers of this column keep working.
        stats['total_samples'].append(num_files)

    return pd.DataFrame(stats)

# TODO: Explore the IRMAS dataset
# The exploration call below is still commented out, so this cell currently
# prints only the banner. Uncomment it once TRAIN_PATH points at a downloaded
# copy of the dataset.
print("\nExploring IRMAS Training Dataset...")
# dataset_stats = explore_irmas_dataset(TRAIN_PATH, INSTRUMENT_CLASSES)
# print("\nDataset Statistics:")
# print(dataset_stats)
# print(f"\nTotal audio files: {dataset_stats['num_files'].sum()}")


Exploring IRMAS Training Dataset...


# SECTION 2: AUDIO PREPROCESSING PIPELINE (Days 3-4)

In [None]:
# Spectrogram Parameters
SAMPLE_RATE = 22050  # Standard audio sample rate
N_FFT = 2048  # FFT window length passed to the mel-spectrogram
HOP_LENGTH = 512  # Hop between successive analysis frames
N_MELS = 128  # Number of mel bands
DURATION = 3.0  # Fixed duration in seconds
TARGET_SHAPE = (128, 128, 1)  # Match CNN input expectations (height, width, channels)

def audio_to_melspectrogram(audio_path, sr=SAMPLE_RATE, n_mels=N_MELS, 
                           duration=DURATION, target_shape=TARGET_SHAPE):
    """
    Convert an audio file to a normalized log-mel-spectrogram image.

    Args:
        audio_path: Path to audio file
        sr: Sample rate the audio is resampled to on load
        n_mels: Number of mel frequency bands
        duration: Target duration in seconds; longer clips are truncated on
            load and shorter clips are zero-padded at the end
        target_shape: Output shape (height, width, channels); only the first
            two entries drive the resize, a single channel axis is appended

    Returns:
        float32 numpy array of shape (target_shape[0], target_shape[1], 1)
        with values min-max scaled to [0, 1]. A clip whose spectrogram is
        constant (e.g. pure silence) yields all zeros instead of NaNs.
    """
    # Load audio file (the returned sample rate equals `sr`, so it is discarded)
    y, _ = librosa.load(audio_path, sr=sr, duration=duration)

    # Pad if too short
    target_length = int(sr * duration)
    if len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), mode='constant')

    # Generate mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=n_mels
    )

    # Convert to log scale (dB)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize to [0, 1]. A constant spectrogram has zero dynamic range;
    # dividing by it would fill the output with NaNs and poison training,
    # so map that case to all zeros instead.
    db_min = mel_spec_db.min()
    db_range = mel_spec_db.max() - db_min
    if db_range > 0:
        mel_spec_norm = (mel_spec_db - db_min) / db_range
    else:
        mel_spec_norm = np.zeros_like(mel_spec_db)

    # Resize to target shape if needed. tuple(...) so that a list-valued
    # target_shape compares correctly against the array's shape tuple.
    target_hw = tuple(target_shape[:2])
    if mel_spec_norm.shape != target_hw:
        from scipy.ndimage import zoom
        zoom_factors = (target_hw[0] / mel_spec_norm.shape[0],
                        target_hw[1] / mel_spec_norm.shape[1])
        # order=1: bilinear interpolation, which keeps values inside [0, 1]
        mel_spec_norm = zoom(mel_spec_norm, zoom_factors, order=1)

    # Add channel dimension
    mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)

    return mel_spec_norm.astype(np.float32)

def visualize_spectrograms(audio_paths, labels, num_samples=5):
    """
    Plot waveform (top row) and mel-spectrogram (bottom row) for sample clips.

    Args:
        audio_paths: Sequence of audio file paths
        labels: Sequence of display labels, parallel to audio_paths
        num_samples: Maximum number of clips to show; fewer are drawn when
            fewer paths/labels are supplied

    Side effects:
        Saves the figure to 'sample_spectrograms.png' in the working directory
        and shows it. Prints a message and returns without plotting when there
        is nothing to draw.
    """
    # Never index past the data that was actually supplied
    count = min(num_samples, len(audio_paths), len(labels))
    if count <= 0:
        print("No audio files to visualize.")
        return

    # squeeze=False keeps `axes` two-dimensional even when count == 1, so the
    # axes[row, col] indexing below is always valid
    fig, axes = plt.subplots(2, count, figsize=(15, 6), squeeze=False)

    for i in range(count):
        # Load audio
        y, sr = librosa.load(audio_paths[i], sr=SAMPLE_RATE, duration=DURATION)

        # Plot waveform
        axes[0, i].plot(y)
        axes[0, i].set_title(f"{labels[i]}\nWaveform")
        axes[0, i].set_xlabel("Sample")

        # Plot spectrogram (channel axis dropped for imshow)
        mel_spec = audio_to_melspectrogram(audio_paths[i])
        axes[1, i].imshow(mel_spec[:, :, 0], aspect='auto', origin='lower', cmap='viridis')
        axes[1, i].set_title("Mel-Spectrogram")
        axes[1, i].set_xlabel("Time")
        axes[1, i].set_ylabel("Mel Frequency")

    plt.tight_layout()
    plt.savefig('sample_spectrograms.png', dpi=150, bbox_inches='tight')
    plt.show()

# TODO: Generate and visualize sample spectrograms
# Collect sample audio files
# sample_files = []
# sample_labels = []
# for inst in INSTRUMENT_CLASSES[:5]:
#     inst_path = os.path.join(TRAIN_PATH, inst)
#     files = [os.path.join(inst_path, f) for f in os.listdir(inst_path) if f.endswith('.wav')][:1]
#     sample_files.extend(files)
#     sample_labels.extend([inst] * len(files))
#
# visualize_spectrograms(sample_files, sample_labels)

# Data Generator for Training

In [None]:
def create_audio_dataset(data_path, instrument_classes, batch_size=32, 
                        validation_split=0.2, shuffle=True, seed=None):
    """
    Collect labelled audio file paths and split them into train/validation.

    Args:
        data_path: Directory containing one sub-folder per instrument class
        instrument_classes: Sequence of sub-folder names; a class's position
            in this sequence is its integer label (missing folders are skipped
            but still occupy their index)
        batch_size: Unused here; kept so existing callers keep working
        validation_split: Fraction of the files placed in the validation set
        shuffle: Shuffle files before splitting. With shuffle=False the files
            stay grouped by class, so the validation set is simply the tail of
            that class-ordered list
        seed: Optional seed for a reproducible shuffle. When None the global
            numpy random state is used

    Returns:
        ((train_files, train_labels), (val_files, val_labels)) as numpy arrays
    """
    all_files = []
    all_labels = []

    for idx, inst in enumerate(instrument_classes):
        inst_path = os.path.join(data_path, inst)
        if not os.path.isdir(inst_path):
            continue

        # Sorted: os.listdir order is arbitrary, and a stable order is needed
        # for the split to be reproducible
        audio_files = sorted(os.path.join(inst_path, f)
                             for f in os.listdir(inst_path) if f.endswith('.wav'))
        all_files.extend(audio_files)
        all_labels.extend([idx] * len(audio_files))

    # Convert to numpy arrays (labels stay integer even when no file was found)
    all_files = np.array(all_files)
    all_labels = np.array(all_labels, dtype=int)

    # Shuffle
    if shuffle:
        if seed is None:
            indices = np.random.permutation(len(all_files))
        else:
            # Dedicated generator: reproducible without touching global state
            indices = np.random.default_rng(seed).permutation(len(all_files))
        all_files = all_files[indices]
        all_labels = all_labels[indices]

    # Split into train and validation
    split_idx = int(len(all_files) * (1 - validation_split))
    train_files, val_files = all_files[:split_idx], all_files[split_idx:]
    train_labels, val_labels = all_labels[:split_idx], all_labels[split_idx:]

    print(f"Training samples: {len(train_files)}")
    print(f"Validation samples: {len(val_files)}")

    return (train_files, train_labels), (val_files, val_labels)

def data_generator(files, labels, batch_size=32, shuffle=True, num_classes=None):
    """
    Endless generator that loads and preprocesses audio on-the-fly.

    Args:
        files: numpy array of audio file paths
        labels: numpy array of integer class labels, parallel to `files`
        batch_size: Maximum number of clips per yielded batch
        shuffle: Reshuffle the sample order at the start of every pass
        num_classes: Width of the one-hot labels; defaults to NUM_CLASSES

    Yields:
        (X, y): X is the stacked spectrogram batch, y the one-hot labels of
        exactly the clips in X. Files that fail to load are reported and left
        out of both X and y; a batch in which every file fails is skipped.

    Raises:
        ValueError: if `files` is empty
        RuntimeError: if a complete pass over the data produced no batch at
            all (every file failed), instead of spinning forever
    """
    if num_classes is None:
        num_classes = NUM_CLASSES

    num_samples = len(files)
    if num_samples == 0:
        # `while True` over an empty range would never yield and never return
        raise ValueError("data_generator received no files")
    indices = np.arange(num_samples)

    while True:
        if shuffle:
            np.random.shuffle(indices)

        produced_batch = False
        for start_idx in range(0, num_samples, batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]

            # Load and preprocess audio, keeping each label paired with its
            # spectrogram so a skipped file cannot shift the labels
            batch_spectrograms = []
            kept_labels = []
            for audio_file, label in zip(files[batch_indices], labels[batch_indices]):
                try:
                    spec = audio_to_melspectrogram(audio_file)
                except Exception as e:
                    # Skip problematic files
                    print(f"Error loading {audio_file}: {e}")
                    continue
                batch_spectrograms.append(spec)
                kept_labels.append(label)

            if not batch_spectrograms:
                continue

            X = np.array(batch_spectrograms)
            y = tf.keras.utils.to_categorical(
                kept_labels,
                num_classes=num_classes
            )

            produced_batch = True
            yield X, y

        if not produced_batch:
            raise RuntimeError("data_generator could not load any of the supplied files")

# TODO: Create train/validation splits
# (train_files, train_labels), (val_files, val_labels) = create_audio_dataset(
#     TRAIN_PATH, 
#     INSTRUMENT_CLASSES,
#     validation_split=0.2
# )

# SECTION 3: MODEL ARCHITECTURE (Days 5-7)

In [None]:
def create_audio_cnn(input_shape=(128, 128, 1), num_classes=NUM_CLASSES):
    """
    CNN architecture adapted from TinyImageNet model for audio spectrograms.

    Three convolutional blocks (two conv layers + 2x2 max-pool each) followed
    by a 256-unit dense layer, dropout and a softmax classifier.

    Args:
        input_shape: Shape of one input spectrogram (height, width, channels)
        num_classes: Number of output classes (width of the softmax layer)

    Returns:
        An uncompiled keras Sequential model named 'AudioCNN_IRMAS'. Layer
        names (conv1_1 ... fc2) are stable so they can be used as file-name
        prefixes when exporting weights.
    """
    model = Sequential(name='AudioCNN_IRMAS')

    # Declare the input explicitly instead of passing `input_shape=` to the
    # first Conv2D, which Keras 3 warns against for Sequential models.
    model.add(keras.Input(shape=input_shape))

    # First conv block
    model.add(Conv2D(32, (5, 5), activation='relu', name='conv1_1'))
    model.add(Conv2D(32, (5, 5), activation='relu', name='conv1_2'))
    model.add(MaxPooling2D(pool_size=(2, 2), name='pool1'))

    # Second conv block
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv2_1'))
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv2_2'))
    model.add(MaxPooling2D(pool_size=(2, 2), name='pool2'))

    # Third conv block
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv3_1'))
    model.add(Conv2D(128, (3, 3), activation='relu', name='conv3_2'))
    model.add(MaxPooling2D(pool_size=(2, 2), name='pool3'))

    # Flatten and dense layers
    model.add(Flatten(name='flatten'))
    model.add(Dense(256, activation='relu', name='fc1'))
    model.add(Dropout(0.5, name='dropout'))  # Added dropout for regularization
    model.add(Dense(num_classes, activation='softmax', name='fc2'))

    return model

# Instantiate the network with its default input shape and class count,
# then print the layer-by-layer summary.
audio_model = create_audio_cnn()
audio_model.summary()

# Configure training: Adam (learning rate 1e-3) minimising categorical
# cross-entropy, tracking accuracy and top-k categorical accuracy.
audio_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy', 'top_k_categorical_accuracy'],
)

# SECTION 4: MODEL TRAINING (Days 5-7)

In [None]:
# Training parameters
BATCH_SIZE = 32
EPOCHS = 30
# data_generator never terminates (it loops with `while True`), so the number
# of batches drawn per epoch / per validation run is fixed explicitly here.
STEPS_PER_EPOCH = 100  # Adjust based on dataset size
VALIDATION_STEPS = 20

# Setup TensorBoard logging
# Each run writes to its own timestamped folder under logs/fit.
log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# NOTE(review): histogram_freq=1 and profile_batch='10,20' are presumably
# "histograms every epoch" and "profile batches 10 through 20" — confirm
# against the Keras TensorBoard callback documentation.
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    profile_batch='10,20'
)

# TODO: Train the model
# Note: Uncomment and run this section when ready to train
# The training code below is disabled by wrapping it in a triple-quoted string.
# That string is an expression, not a comment: as the last statement of the
# cell, the notebook will echo it as the cell's output. To enable training,
# remove the surrounding triple quotes.
"""
train_gen = data_generator(train_files, train_labels, batch_size=BATCH_SIZE, shuffle=True)
val_gen = data_generator(val_files, val_labels, batch_size=BATCH_SIZE, shuffle=False)

history = audio_model.fit(
    train_gen,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=val_gen,
    validation_steps=VALIDATION_STEPS,
    callbacks=[tensorboard_callback],
    verbose=1
)

# Save the trained model
audio_model.save('audio_classifier_irmas.h5')
print("Model saved as 'audio_classifier_irmas.h5'")
"""


# SECTION 5: TRAINING VISUALIZATION & ANALYSIS

In [None]:
def plot_training_history(history):
    """
    Plot training (and, when available, validation) accuracy/loss curves.

    Args:
        history: Object with a `.history` dict mapping metric names to
            per-epoch value lists, as returned by `model.fit`. 'accuracy' and
            'loss' are required; 'val_accuracy' / 'val_loss' are drawn only
            when present, so a run without validation data still plots.

    Side effects:
        Saves the figure to 'training_curves.png' in the working directory
        and shows it.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One panel per metric: (key in history.history, display name)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for ax, (metric, display_name) in zip(axes, panels):
        ax.plot(history.history[metric], label=f'Train {display_name}')

        # Validation curves exist only if fit() was given validation data
        val_key = f'val_{metric}'
        if val_key in history.history:
            ax.plot(history.history[val_key], label=f'Val {display_name}')

        ax.set_title(f'Model {display_name}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(display_name)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
    plt.show()

# TODO: Plot training curves after training
# plot_training_history(history)


# SECTION 6: MODEL EVALUATION & INFERENCE

In [None]:
def evaluate_model(model, test_files, test_labels, batch_size=32):
    """
    Evaluate model on test set.

    Args:
        model: Compiled model whose `evaluate` returns
            [loss, accuracy, top-k accuracy] in that order
        test_files: Sequence of audio file paths
        test_labels: Integer labels parallel to test_files
        batch_size: Number of clips per evaluation batch

    Returns:
        The list returned by `model.evaluate`.

    Raises:
        ValueError: if test_files is empty
    """
    if len(test_files) == 0:
        raise ValueError("test_files is empty; nothing to evaluate")

    test_gen = data_generator(test_files, test_labels, batch_size=batch_size, shuffle=False)
    # Ceiling division: the generator also yields the final partial batch, so
    # floor division would drop those clips (and give 0 steps for small sets)
    steps = -(-len(test_files) // batch_size)

    results = model.evaluate(test_gen, steps=steps, verbose=1)

    print(f"\nTest Loss: {results[0]:.4f}")
    print(f"Test Accuracy: {results[1]:.4f}")
    print(f"Test Top-K Accuracy: {results[2]:.4f}")

    return results

def predict_instrument(model, audio_path, instrument_classes):
    """
    Classify one audio file and print the result.

    Args:
        model: Model exposing `predict(batch, verbose=0)` that returns one row
            of class scores per input
        audio_path: Path of the audio file to classify
        instrument_classes: Class names indexed by class id

    Returns:
        (predicted_class, confidence): index of the highest-scoring class and
        that class's score.
    """
    # Turn the clip into a single-item batch and score it
    spectrogram = audio_to_melspectrogram(audio_path)
    scores = model.predict(spectrogram[np.newaxis, ...], verbose=0)[0]

    predicted_class = np.argmax(scores)
    confidence = scores[predicted_class]

    print(f"\nPrediction for: {os.path.basename(audio_path)}")
    print(f"Predicted Instrument: {instrument_classes[predicted_class]}")
    print(f"Confidence: {confidence:.4f}")

    # Up to five best classes, highest score first
    print("\nTop 5 Predictions:")
    for class_id in np.argsort(scores)[::-1][:5]:
        print(f"  {instrument_classes[class_id]}: {scores[class_id]:.4f}")

    return predicted_class, confidence

# TODO: Run inference on sample audio files
# Load trained model
# audio_model = load_model('audio_classifier_irmas.h5')

# Test on a few samples
# for i in range(3):
#     sample_file = val_files[i]
#     true_label = val_labels[i]
#     print(f"\nTrue Label: {INSTRUMENT_CLASSES[true_label]}")
#     predict_instrument(audio_model, sample_file, INSTRUMENT_CLASSES)


# SECTION 7: EXPORT MODEL WEIGHTS & INTERMEDIATE OUTPUTS

In [None]:
def export_model_weights(model, output_dir='model_weights'):
    """
    Dump each layer's parameters to raw float32 binary files for the C++ port.

    For every layer that has parameters, the first array is written to
    '<layer name>_weights.bin' and, if present, the second array to
    '<layer name>_bias.bin' inside `output_dir` (created if needed). Arrays
    are written as flat float32 data with no header; the shape is only printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    for layer in model.layers:
        params = layer.get_weights()
        if not params:
            # Parameter-free layers (pooling, flatten, ...) have nothing to export
            continue

        layer_name = layer.name

        # First array: the layer's weights
        kernel = params[0]
        kernel_path = os.path.join(output_dir, f'{layer_name}_weights.bin')
        kernel.astype(np.float32).tofile(kernel_path)
        print(f"Exported: {kernel_path} | Shape: {kernel.shape}")

        if len(params) < 2:
            continue

        # Second array: the layer's biases
        bias = params[1]
        bias_path = os.path.join(output_dir, f'{layer_name}_bias.bin')
        bias.astype(np.float32).tofile(bias_path)
        print(f"Exported: {bias_path} | Shape: {bias.shape}")

def export_intermediate_features(model, audio_path, output_dir='feature_maps'):
    """
    Save every layer's output for one audio clip as raw float32 binary files.

    The clip is converted to a spectrogram once; then, for each layer, a
    sub-model from the network input to that layer's output is evaluated and
    the result written to 'layer_<index>_<layer name>_features.bin' inside
    `output_dir` (created if needed). Files contain flat float32 data with no
    header; the shape is only printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Single-item batch holding the clip's spectrogram
    network_input = np.expand_dims(audio_to_melspectrogram(audio_path), axis=0)

    for position, layer in enumerate(model.layers):
        # Sub-model that stops at this layer
        truncated = Model(inputs=model.input, outputs=layer.output)
        activations = truncated.predict(network_input, verbose=0)

        # Save features
        destination = os.path.join(output_dir, f'layer_{position}_{layer.name}_features.bin')
        activations.astype(np.float32).tofile(destination)
        print(f"Exported: {destination} | Shape: {activations.shape}")

# TODO: Export weights and features after training
# export_model_weights(audio_model)
# export_intermediate_features(audio_model, val_files[0])

# DELIVERABLES CHECKLIST

In [None]:
# Print the Week 1 checklist as one block: a blank line, a 70-character rule,
# the title, another rule, one unchecked box per deliverable, and a closing rule.
print(
    "\n" + "="*70,
    "WEEK 1 DELIVERABLES CHECKLIST",
    "="*70,
    "[ ] Dataset downloaded and explored",
    "[ ] Audio-to-spectrogram pipeline implemented",
    "[ ] Train/validation/test splits created",
    "[ ] Sample spectrograms visualized",
    "[ ] CNN model architecture adapted for audio",
    "[ ] Model trained on GPU VM",
    "[ ] Training curves documented",
    "[ ] Validation accuracy >= 70-80%",
    "[ ] Model saved as .h5 file",
    "[ ] Model weights exported as binary files",
    "[ ] TensorBoard profiling completed",
    "="*70,
    sep="\n",
)

# To launch TensorBoard:
# tensorboard --logdir=logs/fit