In [1]:
# Install required packages
!pip install --quiet numpy pandas matplotlib scikit-learn torch torchvision torchaudio pytorch-lightning wandb rich ipywidgets tabulate tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
    RichProgressBar
)
from pytorch_lightning.loggers import WandbLogger
from tqdm import tqdm
from huggingface_hub import snapshot_download, hf_hub_download
import zipfile
import shutil

In [3]:
# Download the compute_cost.py helper module from the dataset repository
pyfile_path = hf_hub_download(
    repo_id="fschmid56/mlpc2025_dataset",
    filename="compute_cost.py",
    repo_type="dataset"
)

# Copy (not move) the file into the current working directory (/content on Colab)
# so that the `from compute_cost import ...` statements that follow can find it.
# os.path.join is used for consistency with every other path built in this notebook.
shutil.copy(pyfile_path, os.path.join(os.getcwd(), "compute_cost.py"))

# import required functions
from compute_cost import CLASSES as TARGET_CLASSES
from compute_cost import (
    aggregate_targets,
    get_ground_truth_df,
    get_segment_prediction_df,
    check_dataframe,
    total_cost
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


compute_cost.py:   0%|          | 0.00/9.95k [00:00<?, ?B/s]

In [4]:
# Step 1: Fetch the dataset archive from the Hugging Face Hub
zip_path = hf_hub_download(
    "fschmid56/mlpc2025_dataset",  # dataset repository id
    "mlpc2025_dataset.zip",        # archive file stored in that repository
    repo_type="dataset",           # the repository is a dataset repo, not a model repo
)
print(f"✅ ZIP downloaded: {zip_path}")

mlpc2025_dataset.zip:   0%|          | 0.00/8.71G [00:00<?, ?B/s]

✅ ZIP downloaded: /root/.cache/huggingface/hub/datasets--fschmid56--mlpc2025_dataset/snapshots/5ecbfd8531c18fbb4fa60b79eacdf585b1f1aac4/mlpc2025_dataset.zip


In [5]:
# Step 2: Unpack the archive (skipped when an earlier run already produced 'data/')
extract_path = "/content/mlpc2025_dataset"
os.makedirs(extract_path, exist_ok=True)

# The presence of a 'data' folder is used as the marker for "already extracted"
# (this assumes the archive contains a top-level 'data/' directory).
if os.path.exists(os.path.join(extract_path, "data")):
    print(f"✅ Dataset already extracted at {extract_path}")
else:
    with zipfile.ZipFile(zip_path) as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"✅ Dataset extracted to {extract_path}")

✅ Dataset extracted to /content/mlpc2025_dataset


In [6]:
# Step 3: Point DATASET_PATH at the extracted 'data' folder, the root for all dataset files used below
DATASET_PATH = os.path.join(extract_path, "data")  # the archive's content lives in a top-level 'data' folder
print(f"✅ DATASET_PATH set to {DATASET_PATH}")

# Quick check: list the top-level entries to confirm the extraction produced the expected layout
print("Files in DATASET_PATH:", os.listdir(DATASET_PATH))

✅ DATASET_PATH set to /content/mlpc2025_dataset/data
Files in DATASET_PATH: ['metadata.csv', '.cache', 'audio_features', 'customer_test_data', 'labels', 'annotations.csv', 'audio']


In [7]:
# Locations of the development-set files inside DATASET_PATH
METADATA_CSV = os.path.join(DATASET_PATH, 'metadata.csv')
ANNOTATIONS_CSV = os.path.join(DATASET_PATH, 'annotations.csv')
AUDIO_DIR = os.path.join(DATASET_PATH, 'audio')
AUDIO_FEATURES_DIR = os.path.join(DATASET_PATH, 'audio_features')
LABELS_DIR = os.path.join(DATASET_PATH, 'labels')

# Development-set metadata; its 'filename' column lists the files used for the train/val/test split
METADATA = pd.read_csv(METADATA_CSV)
DEV_SET_FILES = METADATA['filename']

# Locations of the customer test data, stored in the 'customer_test_data' subfolder
CUSTOMER_DATASET_PATH = os.path.join(DATASET_PATH, 'customer_test_data')
CUSTOMER_AUDIO_DIR = os.path.join(CUSTOMER_DATASET_PATH, 'audio')
CUSTOMER_AUDIO_FEATURES_DIR = os.path.join(CUSTOMER_DATASET_PATH, 'audio_features')
CUSTOMER_METADATA_CSV = os.path.join(CUSTOMER_DATASET_PATH, 'metadata.csv')
CUSTOMER_METADATA = pd.read_csv(CUSTOMER_METADATA_CSV)

# Maximum number of training files that are loaded (the training file list is truncated to this length)
DATA_SUBSAMPLE = 3000  # chosen so the loaded data fits into the RAM available in Colab

In [8]:
def read_files(file_names, classes, features_dir=AUDIO_FEATURES_DIR, labels_dir=LABELS_DIR):
    """
    Loads features and binary frame labels for a list of files.

    For every file name the extension is stripped; features are read from
    '<features_dir>/<base>.npz' (array 'embeddings') and, if requested, labels
    from '<labels_dir>/<base>_labels.npz' (one array per class name).

    Args:
        file_names: Iterable of file names (with extension).
        classes: Class names; each must be a key in every label archive.
        features_dir: Directory containing the feature archives.
        labels_dir: Directory containing the label archives, or None to load
            features only.

    Returns:
        X: list of np.ndarrays, one per file, each of shape (num_frames, num_features)
        Y: dict mapping each class to a list of np.ndarrays of shape (num_frames,),
           one per file; None if labels_dir is None.
    """
    X = []
    Y = {c: [] for c in classes} if labels_dir is not None else None

    for fname in file_names:
        base = os.path.splitext(fname)[0]

        # Load features. The archive is opened as a context manager so its file
        # handle is released right away instead of lingering once per file.
        feat_path = os.path.join(features_dir, base + '.npz')
        with np.load(feat_path) as feature_archive:
            X.append(feature_archive['embeddings'])  # shape: (T, D)

        if labels_dir is None:
            continue

        # Load labels (same handle-closing pattern as for the features)
        label_path = os.path.join(labels_dir, base + '_labels.npz')
        with np.load(label_path) as label_archive:
            for c in classes:
                label_array = label_archive[c]  # shape: (T, num_annotators)
                # A frame is positive as soon as any annotator marked it (> 0)
                binary_labels = (np.max(label_array, axis=1) > 0).astype(int)
                Y[c].append(binary_labels)  # shape: (T,)

    return X, Y

In [17]:
# Split at file level, so all frames of one file end up in the same subset
all_files = DEV_SET_FILES.unique()

# 60% train / 20% validation / 20% test via two consecutive seeded splits
# (train_test_split shuffles by default)
train_files, temp_files = train_test_split(all_files, test_size=0.4, random_state=42)
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

# Keep at most DATA_SUBSAMPLE training files
train_files = train_files[:DATA_SUBSAMPLE]

print(f"Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")

# Load features and labels for each subset
X_train, Y_train = read_files(file_names=train_files, classes=TARGET_CLASSES)
X_val, Y_val = read_files(file_names=val_files, classes=TARGET_CLASSES)
X_test, Y_test = read_files(file_names=test_files, classes=TARGET_CLASSES)

Train: 3000, Val: 1646, Test: 1646


In [11]:
# Frame-level view: every frame becomes one sample
def flatten_for_framewise_classification(X, Y_class):
    """
    Stacks per-file arrays along the first axis.

    Args:
        X: Sequence of feature arrays, one per file.
        Y_class: Sequence of label arrays for one class, one per file.

    Returns:
        Tuple (X_flat, Y_flat): the concatenated features and the concatenated labels.
    """
    return np.concatenate(X, axis=0), np.concatenate(Y_class, axis=0)

In [12]:
def evaluate_classifiers(
    classes: list[str],
    Y_val: dict[str, list[np.ndarray]],
    X_val: list[np.ndarray] = None,
    inference_funcs: dict[str, callable] = None,
    Y_pred: dict[str, list[np.ndarray]] = None
) -> dict[str, dict[str, float]]:
    """
    Evaluates per-frame binary classifiers and computes metrics per class.
    Uses precomputed predictions where available, otherwise the given inference functions.

    Args:
        classes: List of class names to evaluate.
        Y_val: Dict mapping class names to lists of ground-truth (T,) binary arrays.
        X_val: List of input feature arrays, one per validation file. Required if Y_pred not given.
        inference_funcs: Dict mapping class names to binary inference functions.
        Y_pred: Dict with precomputed predictions (same format as Y_val).

    Returns:
        metrics: Dict[class → {'balanced_accuracy', 'precision', 'recall', 'f1'}].

    Raises:
        AssertionError: If neither 'Y_pred' nor both 'inference_funcs' and 'X_val' are given.
        ValueError: If a class is missing from 'Y_pred' and cannot be inferred because
            'inference_funcs' or 'X_val' is missing.
        KeyError: If a class has to be inferred but has no entry in 'inference_funcs'.
    """

    if Y_pred is None:
        # Parenthesised literal instead of a backslash continuation inside the string,
        # which used to embed a long run of spaces in the message.
        assert inference_funcs is not None and X_val is not None, (
            "If 'Y_pred' is not given, 'inference_funcs' and 'X_val' must be given."
        )

    metrics = {}

    for cls in classes:
        # use predictions if given, else infer
        if Y_pred and cls in Y_pred:
            preds_per_file = Y_pred[cls]
        else:
            if inference_funcs is None or X_val is None:
                raise ValueError(
                    f"No predictions for class '{cls}' in 'Y_pred'; "
                    "'inference_funcs' and 'X_val' are required to infer them."
                )
            infer = inference_funcs[cls]
            preds_per_file = [infer(x_file) for x_file in X_val]

        # flatten all files into one long frame sequence to compute metrics
        y_true = np.concatenate(Y_val[cls])
        y_pred = np.concatenate(preds_per_file)

        metrics[cls] = {
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            # zero_division=0: report 0 instead of warning when a metric is undefined
            "precision":         precision_score(y_true, y_pred, zero_division=0),
            "recall":            recall_score(y_true, y_pred, zero_division=0),
            "f1":                f1_score(y_true, y_pred, zero_division=0),
        }

    return metrics

In [13]:
def evaluate_cost(
    val_files: list[str],
    dataset_path: str,
    classes: list[str],
    X_val: list[np.ndarray] = None,
    inference_funcs: dict[str, callable] = None,
    Y_pred: dict[str, list[np.ndarray]] = None
):
    """
    Computes segment-level cost based on predictions and ground truth.
    Uses either precomputed predictions or the given inference functions.

    Predictions are matched to files by position: the i-th prediction array of
    every class is assigned to val_files[i].

    Args:
        val_files: List of filenames corresponding to X_val / the entries of Y_pred.
        dataset_path: Path to dataset root (used for loading ground truth).
        classes: List of class names to evaluate.
        X_val: List of input feature arrays, one per validation file. Required if Y_pred not given.
        inference_funcs: Dict mapping class names to binary inference functions.
        Y_pred: Dict with precomputed predictions (class → list of (T,) arrays).

    Returns:
        total: Total cost as returned by compute_cost.total_cost.
        breakdown: Per-class breakdown as returned by compute_cost.total_cost.

    Raises:
        AssertionError: If neither 'Y_pred' nor both 'inference_funcs' and 'X_val' are given.
        KeyError: If a requested class has no predictions / no inference function.
    """

    if Y_pred is None:
        # Parenthesised literal instead of a backslash continuation inside the string,
        # which used to embed a long run of spaces in the message.
        assert inference_funcs is not None and X_val is not None, (
            "If 'Y_pred' is not given, 'inference_funcs' and 'X_val' must be given."
        )

        # 0) frame-wise predictions, computed only for the requested classes
        Y_pred = {
            cls: [inference_funcs[cls](x_file) for x_file in X_val]
            for cls in classes
        }

    # 1) restructure to filename -> class -> (T,) array
    preds_by_file = {
        fname: {cls: Y_pred[cls][i] for cls in classes}
        for i, fname in enumerate(val_files)
    }

    # 2) segment-level aggregation using compute_cost
    pred_df = get_segment_prediction_df(
        predictions=preds_by_file,
        class_names=classes
    )

    # 3) load & aggregate ground truth using compute_cost
    gt_df = get_ground_truth_df(val_files, dataset_path)

    # 4) sanity checks from compute_cost
    check_dataframe(pred_df, dataset_path)
    check_dataframe(gt_df, dataset_path)

    # 5) compute cost
    total, breakdown = total_cost(pred_df, gt_df)

    return total, breakdown

In [15]:
def majority_vote_baseline(Y_train_dict, classes, threshold=0.5):
    """
    Builds a constant-prediction baseline per class from the training labels.

    For each class the fraction of positive frames is computed over all training
    files and printed. The resulting predictor always outputs 1 if that fraction
    is strictly greater than `threshold`, otherwise always 0.

    Returns:
        inference_funcs: Dict[class → function mapping a (T, ...) array to a
        constant integer array of length T]
    """
    inference_funcs = {}

    for cls in classes:
        # Share of positive frames across all training files of this class
        freq = np.concatenate(Y_train_dict[cls]).mean()

        print(f"Class: {cls}, Pos. Freq: {freq:.4f}")

        prediction = int(freq > threshold)

        # The constant is bound as a default argument so each predictor keeps
        # its own value instead of the one from the last loop iteration.
        def constant_predictor(x_seq, val=prediction):
            return np.full(x_seq.shape[0], val, dtype=int)

        inference_funcs[cls] = constant_predictor

    return inference_funcs

# Build the constant-prediction baseline from the training labels (prints the positive frequency per class)
inference_funcs = majority_vote_baseline(Y_train, TARGET_CLASSES)

Class: Speech, Pos. Freq: 0.0862
Class: Shout, Pos. Freq: 0.0173
Class: Chainsaw, Pos. Freq: 0.0092
Class: Jackhammer, Pos. Freq: 0.0081
Class: Lawn Mower, Pos. Freq: 0.0086
Class: Power Drill, Pos. Freq: 0.0226
Class: Dog Bark, Pos. Freq: 0.0202
Class: Rooster Crow, Pos. Freq: 0.0023
Class: Horn Honk, Pos. Freq: 0.0149
Class: Siren, Pos. Freq: 0.0253


In [18]:
# metrics for most-frequent label baseline
val_metrics = evaluate_classifiers(
    TARGET_CLASSES,
    Y_val,
    X_val=X_val,
    inference_funcs=inference_funcs,
)

# One row per class; the four metric columns get short display names
df = (
    pd.DataFrame(val_metrics)
    .T
    .round(3)
    .set_axis(["BAcc", "Precision", "Recall", "F1"], axis=1)
)
print(tabulate(df, headers='keys', tablefmt='github'))

|              |   BAcc |   Precision |   Recall |   F1 |
|--------------|--------|-------------|----------|------|
| Speech       |    0.5 |           0 |        0 |    0 |
| Shout        |    0.5 |           0 |        0 |    0 |
| Chainsaw     |    0.5 |           0 |        0 |    0 |
| Jackhammer   |    0.5 |           0 |        0 |    0 |
| Lawn Mower   |    0.5 |           0 |        0 |    0 |
| Power Drill  |    0.5 |           0 |        0 |    0 |
| Dog Bark     |    0.5 |           0 |        0 |    0 |
| Rooster Crow |    0.5 |           0 |        0 |    0 |
| Horn Honk    |    0.5 |           0 |        0 |    0 |
| Siren        |    0.5 |           0 |        0 |    0 |


In [19]:
# cost for most-frequent label baseline
total, breakdown = evaluate_cost(
    val_files,
    DATASET_PATH,
    TARGET_CLASSES,
    X_val=X_val,
    inference_funcs=inference_funcs,
)

# One row per class holding its rounded 'cost' entry from the breakdown
df = pd.DataFrame.from_dict(
    {cls: {"Avg. Cost per minute": round(m["cost"], 4)} for cls, m in breakdown.items()},
    orient="index",
)
print(f"Total average cost per minute: {total:.4f}\n")
print(tabulate(df, headers="keys", tablefmt="github"))

Total average cost per minute: 108.8553

|              |   Avg. Cost per minute |
|--------------|------------------------|
| Speech       |                24.8092 |
| Shout        |                 9.4118 |
| Chainsaw     |                 6.6057 |
| Jackhammer   |                 7.4642 |
| Lawn Mower   |                 7.7266 |
| Power Drill  |                14.2369 |
| Dog Bark     |                 7.0986 |
| Rooster Crow |                 0.3816 |
| Horn Honk    |                12.7107 |
| Siren        |                18.4102 |
