### Exercise 8 - Training

This notebook reads your CSV (tab- or comma-separated),

builds paths as `audio/<filename>`,

extracts compact features (MFCC stats + a few spectral stats),

reduces with TruncatedSVD,

trains a simple SVM,

prints accuracy + report + confusion matrix, and saves the fitted pipeline to `sound_model.pkl`.

In [26]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

In [27]:
# simple_us8k_3class_svd_svm.py
# Small classical-ML pipeline (scale -> SVD -> SVM) that separates three sound
# classes -- dog_bark, street_music, drilling -- using clips kept in one flat folder.

# --- Configuration: everything a reader might want to tune ------------------

# Where the data lives
CSV_PATH = "../UrbanSound_Filtered/filtered_metadata.csv"  # metadata table (CSV or TSV)
AUDIO_DIR = "../UrbanSound_Filtered/audio/"  # flat folder containing wav files
CSV_SEP = ","  # switch to "\t" when the metadata file is tab-separated

# Column names are looked up in priority order; the first one present wins.
FILENAME_COL_CANDIDATES = [
    "slice_file_name",
    "filename",
    "file",
    "filepath",
    "path",
]
LABEL_COL_CANDIDATES = [
    "class",
    "label",
    "Class",
    "Label",
]

# Only rows carrying one of these labels are kept.
TARGET_CLASSES = {"dog_bark", "street_music", "drilling"}

SR = 22050  # sample rate requested from librosa.load for every clip
RANDOM_STATE = 42  # seed handed to the train/test split, the SVD and the SVM

# Utils
def find_column(df, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are tried in the order given, so earlier names take priority
    when several of them are present.

    Raises:
        ValueError: if none of the candidates is a column of ``df``.
    """
    not_found = object()  # sentinel, so any real column name can be returned
    hit = next((name for name in candidates if name in df.columns), not_found)
    if hit is not_found:
        raise ValueError(f"None of the expected columns found: {candidates}")
    return hit

def feature_vector(y, sr):
    """Summarise one audio signal as a fixed-length 1-D float32 vector.

    Every frame-level feature matrix is collapsed along axis 1 into a per-row
    mean followed by a per-row standard deviation. The pieces are concatenated
    in this order:

      1. MFCCs (``n_mfcc=20``): 20 means, then 20 standard deviations
      2. spectral centroid: mean, std
      3. spectral bandwidth: mean, std
      4. spectral roll-off: mean, std
      5. zero-crossing rate: mean, std
      6. RMS energy: mean, std
      7. clip duration as returned by ``librosa.get_duration``

    Each spectral descriptor is expected to be a single row (1 x T), which
    gives 51 values in total. No delta features are computed.

    Args:
        y: audio samples, as returned by ``librosa.load``.
        sr: sample rate of ``y``.

    Returns:
        1-D ``np.float32`` array with the layout described above.
    """
    def mean_and_std(frames):
        # (k, T) -> 2k values: the k row means followed by the k row stds
        return np.hstack([frames.mean(axis=1), frames.std(axis=1)])

    # Order matters: it fixes the column layout the model is trained on.
    frame_level = (
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20),
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.spectral_bandwidth(y=y, sr=sr),
        librosa.feature.spectral_rolloff(y=y, sr=sr),
        librosa.feature.zero_crossing_rate(y),
        librosa.feature.rms(y=y),
    )
    duration = np.array([librosa.get_duration(y=y, sr=sr)], dtype=np.float32)

    pieces = [mean_and_std(frames) for frames in frame_level]
    pieces.append(duration)
    return np.hstack(pieces).astype(np.float32)

def load_features(paths):
    """Load each audio file and stack the resulting feature vectors.

    Files are loaded with ``librosa.load`` at ``SR`` Hz, mono. A file is
    skipped (with a ``[WARN]`` line printed) when loading or feature
    extraction raises, or when it holds fewer than 32 samples. Skipping is
    deliberate best-effort: one bad clip must not abort the whole run.

    Args:
        paths: iterable of audio file paths.

    Returns:
        ``(X, keep_idx)`` where ``X`` is a float32 matrix with one row per
        successfully processed file and ``keep_idx`` is an int array holding
        the positions (within ``paths``) of those files, so callers can align
        labels with the surviving rows.

    Raises:
        ValueError: if no file produced a feature vector (including when
            ``paths`` is empty).
    """
    rows = []
    keep_idx = []
    n_seen = 0  # counted manually so ``paths`` may be any iterable
    for i, p in enumerate(paths):
        n_seen += 1
        try:
            y, sr = librosa.load(p, sr=SR, mono=True)
            if y is None or len(y) < 32:  # skip tiny/broken files
                print(f"[WARN] skipping {p}: fewer than 32 samples")
                continue
            rows.append(feature_vector(y, sr))
            keep_idx.append(i)
        except Exception as e:
            print(f"[WARN] failed on {p}: {e}")

    if not rows:
        # np.vstack([]) would fail with an opaque "need at least one array"
        # message; say what actually went wrong instead.
        raise ValueError(
            f"No usable audio files: could not extract features from any of "
            f"the {n_seen} path(s) given."
        )
    return (np.vstack(rows).astype(np.float32), np.array(keep_idx, dtype=int))

In [28]:
# -------------- Main --------------
def main(model_path="sound_model.pkl", test_size=0.2, n_components=20):
    """Train, evaluate and save the scale -> SVD -> SVM sound classifier.

    Steps:
      1. Read the metadata table at ``CSV_PATH`` (a header row is assumed).
      2. Keep rows whose label is in ``TARGET_CLASSES`` and whose audio file
         exists under ``AUDIO_DIR``.
      3. Extract one feature vector per file with ``load_features``.
      4. Fit the pipeline on a stratified train split and print accuracy,
         the classification report and the confusion matrix for the
         held-out split.
      5. Dump the fitted pipeline to ``model_path`` with joblib.

    Args:
        model_path: file the fitted pipeline is written to.
        test_size: passed to ``train_test_split`` as the held-out share.
        n_components: number of components kept by ``TruncatedSVD``.

    Raises:
        ValueError: if no row survives the class filter or the
            file-existence filter; ``find_column`` raises it too when an
            expected column is missing.
    """
    # Read table (header assumed; set header=None and pass names=... if yours has no header)
    df = pd.read_csv(CSV_PATH, sep=CSV_SEP)

    # Locate the filename and label columns
    fname_col = find_column(df, FILENAME_COL_CANDIDATES)
    label_col = find_column(df, LABEL_COL_CANDIDATES)

    # Keep only the target classes (safe even if already filtered)
    df[label_col] = df[label_col].astype(str).str.strip()
    df = df[df[label_col].isin(TARGET_CLASSES)].copy()
    if df.empty:
        raise ValueError(
            f"No rows in {CSV_PATH} have a label in {sorted(TARGET_CLASSES)}."
        )

    # Build file paths inside the flat audio folder (relative when AUDIO_DIR is)
    df["__path__"] = df[fname_col].apply(lambda s: str(Path(AUDIO_DIR) / str(s)))

    # Drop rows whose files don't exist
    exists_mask = df["__path__"].apply(lambda p: Path(p).exists())
    n_missing = len(df) - int(exists_mask.sum())
    if n_missing:
        print(f"[INFO] Skipping {n_missing} rows with missing files.")
        df = df[exists_mask].copy()
    if df.empty:
        raise ValueError(f"None of the listed audio files exist under {AUDIO_DIR}.")

    # Extract features; keep_idx holds positions within the path list, which
    # are also row positions in df, so labels stay aligned with X.
    X, keep_idx = load_features(df["__path__"].tolist())
    y = df.iloc[keep_idx][label_col].values

    # Train/test split (stratified so both splits keep the class proportions)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE, stratify=y
    )

    # Pipeline: scale -> SVD -> SVM
    pipe = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("svd", TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)),  # SVD-based reduction
        ("clf", SVC(kernel="rbf", C=10.0, gamma="scale", probability=False, random_state=RANDOM_STATE)),
    ])

    pipe.fit(X_tr, y_tr)
    y_pred = pipe.predict(X_te)

    print("\nAccuracy:", round(accuracy_score(y_te, y_pred), 4))
    print("\nClassification report:\n", classification_report(y_te, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_te, y_pred, labels=sorted(np.unique(y))))

    # joblib writes a pickle: only load this file back from a trusted location.
    joblib.dump(pipe, model_path)
    

In [29]:
# Run the whole pipeline: load the data, extract features, train the model,
# print the evaluation metrics and write the fitted pipeline to sound_model.pkl.
main()


Accuracy: 0.955

Classification report:
               precision    recall  f1-score   support

    dog_bark     0.9538    0.9300    0.9418       200
    drilling     0.9559    0.9750    0.9653       200
street_music     0.9552    0.9600    0.9576       200

    accuracy                         0.9550       600
   macro avg     0.9550    0.9550    0.9549       600
weighted avg     0.9550    0.9550    0.9549       600

Confusion matrix:
 [[186   8   6]
 [  2 195   3]
 [  7   1 192]]
