# Training Notebook - Ambient Sound Classification

**Final Project B - EEL5840 Fall 2025**

This notebook trains the final model used for ambient sound classification.

**Final Model**: SVM with RBF kernel trained on 100 advanced audio features
**Performance**: 92.13% weighted F1-score on the held-out test set

In [1]:
import numpy as np
import librosa
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report

print("Libraries loaded successfully")

Libraries loaded successfully


## Feature Extraction Function

In [2]:
def extract_advanced_audio_features(audio, sr=48000):
    """
    Summarise one audio clip as a flat vector of descriptor statistics.

    The output is laid out in this fixed order:
    - RMS energy: mean, std (2)
    - Zero-crossing rate: mean, std (2)
    - Spectral centroid, rolloff, bandwidth: mean, std each (6)
    - MFCCs: 13 coefficients x (mean, std, max, min), grouped per coefficient (52)
    - Delta-MFCCs: one mean per coefficient (13)
    - Chroma: one mean per pitch class (12)
    - Spectral contrast: one mean per band (7)
    - Tonnetz of the harmonic component: one mean per dimension (6)

    Total: 100 features (the chroma/contrast/tonnetz row counts come from
    librosa's default settings).

    Parameters:
    -----------
    audio : np.ndarray
        Waveform of a single clip (expected to be a 1-D array).
    sr : int
        Sample rate in Hz forwarded to the spectral extractors (default: 48000)

    Returns:
    --------
    np.ndarray
        1-D array holding the statistics in the order listed above.
    """
    def _mean_and_std(track):
        # Two-number summary applied to every frame-level scalar track.
        return [np.mean(track), np.std(track)]

    vector = []

    # Time-domain descriptors.
    vector += _mean_and_std(librosa.feature.rms(y=audio)[0])
    vector += _mean_and_std(librosa.feature.zero_crossing_rate(audio)[0])

    # Spectral-shape descriptors share the same call signature and summary.
    for spectral_descriptor in (
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_rolloff,
        librosa.feature.spectral_bandwidth,
    ):
        vector += _mean_and_std(spectral_descriptor(y=audio, sr=sr)[0])

    # MFCCs keep four statistics per coefficient, interleaved coefficient by
    # coefficient (mean, std, max, min).
    mfcc_matrix = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    for coefficient in mfcc_matrix:
        vector += [
            np.mean(coefficient),
            np.std(coefficient),
            np.max(coefficient),
            np.min(coefficient),
        ]

    # The remaining descriptor matrices contribute only a per-row mean.
    # Tonnetz is computed on the harmonic part of the signal.
    mean_only_matrices = (
        librosa.feature.delta(mfcc_matrix),
        librosa.feature.chroma_stft(y=audio, sr=sr),
        librosa.feature.spectral_contrast(y=audio, sr=sr),
        librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sr),
    )
    for matrix in mean_only_matrices:
        vector.extend(np.mean(row) for row in matrix)

    return np.array(vector)

## Training Function

In [3]:
def train(data_path='training_data_projectB.npy', 
          labels_path='training_labels_projectB.npy',
          model_save_path='final_model.pkl',
          sample_rate=48000):
    """
    Train the final SVM model on provided training data.
    
    Pipeline: load the raw clips, extract one feature vector per clip, hold
    out a stratified 20% test split, grid-search an RBF SVM (with feature
    standardisation) using 5-fold CV on the remaining 80%, report F1-scores
    and pickle the best pipeline. The saved model is the grid search's best
    estimator, i.e. it has been fitted on the 80% training split only.
    
    Parameters:
    -----------
    data_path : str
        Path to training data (.npy file, shape: [n_samples_per_clip, n_clips])
    labels_path : str
        Path to training labels (.npy file, shape: [n_clips])
    model_save_path : str
        Path to save trained model (.pkl file)
    sample_rate : int
        Audio sample rate (default: 48000 Hz)
    
    Returns:
    --------
    model : sklearn.pipeline.Pipeline
        Trained SVM model
    train_f1 : float
        Training set F1-score
    
    Raises:
    -------
    ValueError
        If the data array is not 2-D, contains no clips, or the number of
        labels does not match the number of clips. Raised before feature
        extraction so a bad input does not cost minutes of processing.
    """
    print("=" * 60)
    print("TRAINING AMBIENT SOUND CLASSIFICATION MODEL")
    print("=" * 60)
    
    print("\n1. Loading data...")
    data_training = np.load(data_path)
    labels_training = np.load(labels_path)
    
    # Fail fast: feature extraction below is by far the slowest step, so
    # validate the inputs before starting it rather than letting a mismatch
    # surface later inside train_test_split.
    if data_training.ndim != 2:
        raise ValueError(
            "Training data must be 2-D with shape (n_samples_per_clip, n_clips); "
            f"got shape {data_training.shape}"
        )
    
    n_samples = data_training.shape[1]  # clips are stored column-wise
    if n_samples == 0:
        raise ValueError("Training data contains no audio clips")
    if labels_training.ndim == 0 or labels_training.shape[0] != n_samples:
        raise ValueError(
            f"Number of labels {labels_training.shape} does not match "
            f"number of audio clips ({n_samples})"
        )
    
    print(f"   Loaded {n_samples} audio clips")
    print(f"   Audio shape: {data_training.shape}")
    
    print("\n2. Extracting features from all audio clips...")
    X = []
    for i in range(n_samples):
        if i % 100 == 0:
            print(f"   Processing clip {i}/{n_samples}...")
        
        audio = data_training[:, i]
        features = extract_advanced_audio_features(audio, sample_rate)
        X.append(features)
    
    X = np.array(X)
    # Shift labels down by one so the classifier works with 0-based classes
    # (assumes the label file is 1-based -- confirm against the dataset).
    t = labels_training.astype(int) - 1
    
    print("\n   Feature extraction complete!")
    print(f"   Feature matrix shape: {X.shape}")
    print(f"   Number of features per sample: {X.shape[1]}")
    
    print("\n3. Splitting data (80/20 train/test)...")
    X_train, X_test, t_train, t_test = train_test_split(
        X, t, test_size=0.2, stratify=t, random_state=42
    )
    print(f"   Training set: {X_train.shape[0]} samples")
    print(f"   Test set: {X_test.shape[0]} samples")
    
    print("\n4. Building SVM pipeline...")
    # The scaler lives inside the pipeline so it is re-fitted on each CV
    # training fold and never sees validation or test data.
    svm_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(kernel='rbf', random_state=42, probability=True))
    ])
    
    param_grid = {
        'classifier__C': [1, 5, 10, 50],
        'classifier__gamma': [0.0001, 0.0005, 0.001, 0.005]
    }
    
    print("\n5. Training with GridSearchCV (5-fold CV)...")
    print("   This may take several minutes...")
    grid_search = GridSearchCV(
        estimator=svm_pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, t_train)
    
    print(f"\n   Best parameters: {grid_search.best_params_}")
    print(f"   Best CV F1-score: {grid_search.best_score_:.4f}")
    
    print("\n6. Evaluating on test set...")
    best_model = grid_search.best_estimator_
    
    y_train = best_model.predict(X_train)
    y_test = best_model.predict(X_test)
    
    train_f1 = f1_score(t_train, y_train, average='weighted')
    test_f1 = f1_score(t_test, y_test, average='weighted')
    
    print(f"   Training F1-score: {train_f1:.4f}")
    print(f"   Test F1-score: {test_f1:.4f}")
    
    print("\n7. Saving model...")
    with open(model_save_path, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"   Model saved to: {model_save_path}")
    
    print("\n" + "=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print(f"Final Model: SVM with C={grid_search.best_params_['classifier__C']}, "
          f"gamma={grid_search.best_params_['classifier__gamma']}")
    print(f"Test F1-score: {test_f1:.4f}")
    print("=" * 60)
    
    # Only the training-set score is returned; the test score is printed above.
    return best_model, train_f1

## Run Training

In [None]:
if __name__ == "__main__":
    # Run the full pipeline on the project-provided arrays; the call loads the
    # data, fits the model and writes the pickled pipeline to disk.
    model, train_f1 = train(
        data_path='training_data_projectB.npy', labels_path='training_labels_projectB.npy',
        model_save_path='final_model.pkl', sample_rate=48000,
    )

TRAINING AMBIENT SOUND CLASSIFICATION MODEL

1. Loading data...
   Loaded 1210 audio clips
   Audio shape: (240000, 1210)

2. Extracting features from all audio clips...
   Processing clip 0/1210...
   Processing clip 100/1210...
   Processing clip 200/1210...
   Processing clip 300/1210...
   Processing clip 400/1210...


  return pitch_tuning(


   Processing clip 500/1210...
