In [61]:
# Install required packages
!pip install --quiet numpy pandas matplotlib scikit-learn torch torchvision torchaudio pytorch-lightning wandb rich ipywidgets tabulate tqdm

In [62]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
    RichProgressBar
)
from pytorch_lightning.loggers import WandbLogger
from tqdm import tqdm
from huggingface_hub import snapshot_download, hf_hub_download
import zipfile
import shutil

In [63]:
# download the compute_cost.py file
pyfile_path = hf_hub_download(
    repo_id="fschmid56/mlpc2025_dataset",
    filename="compute_cost.py",
    repo_type="dataset"
)

# move to current working directory (/content)
shutil.copy(pyfile_path, os.getcwd() + "/compute_cost.py")

# import required functions
from compute_cost import CLASSES as TARGET_CLASSES
from compute_cost import (
    aggregate_targets,
    get_ground_truth_df,
    get_segment_prediction_df,
    check_dataframe,
    total_cost
)

In [64]:
# Step 1: Download the ZIP file from HF Hub
zip_path = hf_hub_download(
    repo_id="fschmid56/mlpc2025_dataset",   # your dataset repo
    filename="mlpc2025_dataset.zip",        # your uploaded ZIP file
    repo_type="dataset"                     # specify that it's a dataset repo
)

print(f"✅ ZIP downloaded: {zip_path}")

✅ ZIP downloaded: /root/.cache/huggingface/hub/datasets--fschmid56--mlpc2025_dataset/snapshots/5ecbfd8531c18fbb4fa60b79eacdf585b1f1aac4/mlpc2025_dataset.zip


In [65]:
# Step 2: Extract the ZIP
extract_path = "/content/mlpc2025_dataset"
os.makedirs(extract_path, exist_ok=True)

# Check if already extracted
if not os.path.exists(os.path.join(extract_path, "data")):  # assuming 'data/' is inside the zip
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"✅ Dataset extracted to {extract_path}")
else:
    print(f"✅ Dataset already extracted at {extract_path}")

✅ Dataset already extracted at /content/mlpc2025_dataset


In [66]:
# Step 3: Set your DATASET_PATH
DATASET_PATH = os.path.join(extract_path, "data")  # because you zipped the 'data' folder
print(f"✅ DATASET_PATH set to {DATASET_PATH}")

# Quick check
print("Files in DATASET_PATH:", os.listdir(DATASET_PATH))

✅ DATASET_PATH set to /content/mlpc2025_dataset/data
Files in DATASET_PATH: ['metadata.csv', '.cache', 'audio_features', 'customer_test_data', 'labels', 'annotations.csv', 'audio']


In [67]:
METADATA_CSV = os.path.join(DATASET_PATH, 'metadata.csv')
ANNOTATIONS_CSV = os.path.join(DATASET_PATH, 'annotations.csv')
AUDIO_DIR = os.path.join(DATASET_PATH, 'audio')
AUDIO_FEATURES_DIR = os.path.join(DATASET_PATH, 'audio_features')
LABELS_DIR = os.path.join(DATASET_PATH, 'labels')

METADATA = pd.read_csv(METADATA_CSV)
DEV_SET_FILES = METADATA['filename']

CUSTOMER_DATASET_PATH = os.path.join(DATASET_PATH, 'customer_test_data')
CUSTOMER_AUDIO_DIR = os.path.join(CUSTOMER_DATASET_PATH, 'audio')
CUSTOMER_AUDIO_FEATURES_DIR = os.path.join(CUSTOMER_DATASET_PATH, 'audio_features')
CUSTOMER_METADATA_CSV = os.path.join(CUSTOMER_DATASET_PATH, 'metadata.csv')
CUSTOMER_METADATA = pd.read_csv(CUSTOMER_METADATA_CSV)

In [68]:
def read_files(file_names, classes, features_dir=AUDIO_FEATURES_DIR, labels_dir=LABELS_DIR):
    """
    Loads features and binary labels for a list of files.

    Returns:
        X: list of np.ndarrays, each of shape (num_frames, num_features)
        Y: dict of lists of np.ndarrays, each of shape (num_frames,)
    """
    X = []
    Y = {c: [] for c in classes} if labels_dir is not None else None

    for fname in file_names:
        base = os.path.splitext(fname)[0]

        # Load features
        feat_path = os.path.join(features_dir, base + '.npz')
        features = np.load(feat_path)['embeddings']  # shape: (T, D)
        X.append(features)

        if labels_dir is not None:
            # Load labels
            label_path = os.path.join(labels_dir, base + '_labels.npz')
            labels = np.load(label_path)

            for c in classes:
                label_array = labels[c]  # shape: (T, num_annotators)
                binary_labels = (np.max(label_array, axis=1) > 0).astype(int)
                Y[c].append(binary_labels)  # shape: (T,)

    return X, Y

In [69]:
# Get filenames for split based on filenames
all_files = DEV_SET_FILES.unique()

# First split: 60% train, 40% temp (val + test)
train_files, temp_files = train_test_split(
    all_files, test_size=0.4, random_state=42, shuffle=True
)

# Second split: 50% val, 50% test from the remaining 40%
val_files, test_files = train_test_split(
    temp_files, test_size=0.5, random_state=42, shuffle=True
)


print(f"Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")

# Load features and labels
X_train, Y_train = read_files(train_files, TARGET_CLASSES)
X_val, Y_val = read_files(val_files, TARGET_CLASSES)
X_test, Y_test = read_files(test_files, TARGET_CLASSES)

Train: 4938, Val: 1646, Test: 1646


In [70]:
class_list = ['Speech','Dog Bark','Rooster Crow','Shout','Lawn Mower','Chainsaw','Jackhammer','Power Drill','Horn Honk','Siren']

FP = [1,1,2,3,3,3,3,3,3,3]
FN = [5,5,10,10,20,15,20,15,20,15] # increased lawn mover and jackhammer cost since their loss hasnot been decreasing
pos_weights = np.array(FN)/np.array(FP )

#pos_weights=[1,1,1,2,4,3,4,3,3,3]


In [71]:
def evaluate_classifiers(
    classes: list[str],
    Y_val: dict[str, list[np.ndarray]],
    X_val: list[np.ndarray] = None,
    inference_funcs: dict[str, callable] = None,
    Y_pred: dict[str, list[np.ndarray]] = None
) -> tuple[dict[str, list[np.ndarray]], dict[str, dict]]:
    """
    Evaluates per-frame binary classifiers and computes metrics per class.
    Uses either computed predictions or given inference functions.

    Args:
        classes: List of class names to evaluate.
        Y_val: Dict mapping class names to lists of ground-truth (T,) binary arrays.
        X_val: List of input feature arrays, one per validation file. Required if Y_pred not given.
        inference_funcs: Dict mapping class names to binary inference functions.
        Y_pred: Dict with precomputed predictions (same format as Y_val).

    Returns:
        metrics: Dict[class → {'balanced_accuracy', 'precision', 'recall', 'f1'}].
    """

    if Y_pred is None:
        assert inference_funcs is not None and X_val is not None, "If 'Y_pred' is not given, 'inference_funcs' \
                                                                    and 'X_val' must be given."

    Y_val_preds = {}
    metrics     = {}

    for cls in classes:
        # use predictions if given, else infer
        if Y_pred and cls in Y_pred:
            preds_per_file = Y_pred[cls]
        else:
            infer = inference_funcs[cls]
            preds_per_file = [infer(x_file) for x_file in X_val]
        Y_val_preds[cls] = preds_per_file

        # flatten to compute metrics
        y_true = np.concatenate(Y_val[cls])
        y_pred = np.concatenate(preds_per_file)

        metrics[cls] = {
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "precision":         precision_score(y_true, y_pred, zero_division=0),
            "recall":            recall_score(y_true, y_pred, zero_division=0),
            "f1":                f1_score(y_true, y_pred, zero_division=0),
        }

    return metrics

In [72]:
def evaluate_cost(
    val_files: list[str],
    dataset_path: str,
    classes: list[str],
    X_val: list[np.ndarray] = None,
    inference_funcs: dict[str, callable] = None,
    Y_pred: dict[str, list[np.ndarray]] = None
):
    """
    Computes segment-level cost based on predictions and ground truth.
    Uses either computed predictions or given inference functions.

    Args:
        val_files: List of filenames corresponding to X_val.
        dataset_path: Path to dataset root (used for loading ground truth).
        classes: List of class names to evaluate.
        X_val: List of input feature arrays, one per validation file. Required if Y_pred not given.
        inference_funcs: Dict mapping class names to binary inference functions.
        Y_pred: Dict with precomputed predictions (class → list of (T,) arrays).

    Returns:
        total: Total cost across all validation files.
        breakdown: Dict[class → segment-level cost].
    """

    if Y_pred is None:
        assert inference_funcs is not None and X_val is not None, "If 'Y_pred' is not given, 'inference_funcs' \
                                                                    and 'X_val' must be given."

    # 0) frame-wise predictions (per class)
    if Y_pred is None:
        Y_pred = {
            cls: [infer(x_file) for x_file in X_val]
            for cls, infer in inference_funcs.items()
        }

    # 1) restructure to filename -> class -> (T,) array
    preds_by_file = {}
    for i, fname in enumerate(val_files):
        preds_by_file[fname] = {
            cls: Y_pred[cls][i] for cls in classes
        }

    # 2) segment-level aggregation using compute_cost
    pred_df = get_segment_prediction_df(
        predictions=preds_by_file,
        class_names=classes
    )

    # 3) load & aggregate ground truth using compute_cost
    gt_df = get_ground_truth_df(val_files, dataset_path)

    # 4) sanity checks from compute_cost
    check_dataframe(pred_df, dataset_path)
    check_dataframe(gt_df, dataset_path)

    # 5) compute cost
    total, breakdown = total_cost(pred_df, gt_df)

    return total, breakdown

In [73]:
class SequenceDataset(Dataset):
    def __init__(self, X, Y, classes, filenames, apply_log_mel=False, eps=1e-6,
                 mean=None, std=None):
        self.X = X
        self.Y = Y
        self.classes = classes
        self.filenames = filenames
        self.apply_log_mel = apply_log_mel
        self.eps = eps
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        x_tensor = torch.tensor(self.X[idx], dtype=torch.float32)
        if self.apply_log_mel:
            x_tensor = torch.log10(x_tensor + self.eps)



        if self.Y is not None:
            y_tensor = torch.stack([
                torch.tensor(self.Y[c][idx], dtype=torch.long) for c in self.classes
            ], dim=1)
            return x_tensor, y_tensor, self.filenames[idx]
        else:
            return x_tensor, self.filenames[idx]


In [74]:
# collate_fn used to create batches from the individual dataset items
def collate_fn(batch):
    if len(batch[0]) == 3:
        Xs, Ys, filenames = zip(*batch)
        lengths = torch.tensor([x.size(0) for x in Xs], dtype=torch.long)
        X_padded = pad_sequence(Xs, batch_first=True)
        Y_padded = pad_sequence(Ys, batch_first=True)
        return X_padded, Y_padded, lengths, list(filenames)
    elif len(batch[0]) == 2:
        Xs, filenames = zip(*batch)
        lengths = torch.tensor([x.size(0) for x in Xs], dtype=torch.long)
        X_padded = pad_sequence(Xs, batch_first=True)
        return X_padded, lengths, list(filenames)
    else:
        raise ValueError("Unexpected batch format: expected 2 or 3 elements per item.")

In [75]:
import torch

def compute_dataset_mean_std(X, eps=1e-6):
    """
    Compute global mean and std across all time steps and samples.

    Args:
        X: list of np.ndarray or torch.Tensor, each of shape (T_i, D)

    Returns:
        mean: torch.Tensor of shape (D,)
        std: torch.Tensor of shape (D,)
    """
    if isinstance(X[0], np.ndarray):
        X = [torch.tensor(x, dtype=torch.float32) for x in X]

    # Concatenate along time dimension
    all_data = torch.cat(X, dim=0)  # shape: (sum(T_i), D)

    mean = all_data.mean(dim=0)     # shape: (D,)
    std = all_data.std(dim=0) + eps  # shape: (D,)
    return mean, std


In [76]:
mu,std =compute_dataset_mean_std(X_train)

In [77]:
ds = SequenceDataset(X_train, Y_train, TARGET_CLASSES, train_files, apply_log_mel=True, eps=1e-6)

In [78]:
# collate_fn used to create batches from the individual dataset items
def collate_fn(batch):
    if len(batch[0]) == 3:
        Xs, Ys, filenames = zip(*batch)
        lengths = torch.tensor([x.size(0) for x in Xs], dtype=torch.long)
        X_padded = pad_sequence(Xs, batch_first=True)
        Y_padded = pad_sequence(Ys, batch_first=True)
        return X_padded, Y_padded, lengths, list(filenames)
    elif len(batch[0]) == 2:
        Xs, filenames = zip(*batch)
        lengths = torch.tensor([x.size(0) for x in Xs], dtype=torch.long)
        X_padded = pad_sequence(Xs, batch_first=True)
        return X_padded, lengths, list(filenames)
    else:
        raise ValueError("Unexpected batch format: expected 2 or 3 elements per item.")

In [79]:
batch = [ds[i] for i in range(32)]
X_pad, Y_pad, lengths, filenames = collate_fn(batch)

In [80]:
# DataModule is used by pytorch lightning
class SEDDataModule(pl.LightningDataModule):
    def __init__(self,
                 X_train, Y_train, train_files,
                 X_val,   Y_val,   val_files,
                 X_test,  Y_test,  test_files,
                 classes,
                 batch_size=32,
                 num_workers=4):
        super().__init__()
        self.X_train, self.Y_train, self.train_files = X_train, Y_train, train_files
        self.X_val,   self.Y_val,   self.val_files   = X_val,   Y_val,   val_files
        self.X_test,  self.Y_test,  self.test_files  = X_test,  Y_test,  test_files
        self.classes     = classes
        self.batch_size  = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        self.train_ds = SequenceDataset(self.X_train, self.Y_train, self.classes, self.train_files, apply_log_mel=False)
        self.val_ds   = SequenceDataset(self.X_val,   self.Y_val,   self.classes, self.val_files, apply_log_mel=False)
        self.test_ds  = SequenceDataset(self.X_test,  self.Y_test,  self.classes, self.test_files, apply_log_mel=False)

    def train_dataloader(self):
        return DataLoader(self.train_ds,
                          batch_size=self.batch_size,
                          shuffle=True,
                          collate_fn=collate_fn,
                          num_workers=self.num_workers)
    def val_dataloader(self):
        return DataLoader(self.val_ds,
                          batch_size=self.batch_size,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=self.num_workers)


    def test_dataloader(self):
        return DataLoader(self.test_ds,
                          batch_size=self.batch_size,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=self.num_workers)

In [81]:
dm = SEDDataModule(
    X_train=X_train, Y_train=Y_train, train_files=train_files,
    X_val=X_val,     Y_val=Y_val,     val_files=val_files,
    X_test=X_test,   Y_test=Y_test,   test_files=test_files,
    classes=TARGET_CLASSES,
    batch_size=32,
    num_workers=2
)

dm.setup()
loader = dm.train_dataloader()
X_batch, Y_batch, len_batch, filenames = next(iter(loader))
print("DataModule batch -> X:", X_batch.shape,
      "\nY:", Y_batch.shape,
      "\nlengths:", len_batch,
      "\nfilenames:", filenames[:3], "...")

DataModule batch -> X: torch.Size([32, 249, 768]) 
Y: torch.Size([32, 249, 10]) 
lengths: tensor([167, 135, 209, 161, 226, 185, 143, 249, 149, 173, 210, 226, 220, 210,
        242, 152, 164, 230, 138, 173, 239, 188, 140, 193, 139, 184, 201, 183,
        207, 155, 205, 234]) 
filenames: ['250924.mp3', '28127.mp3', '646030.mp3'] ...


In [82]:
import torch
import torch.nn as nn

class LearnablePositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        # Create learnable positional embeddings
        self.pe = nn.Parameter(torch.zeros(max_len, d_model))  # Trainable parameter
        nn.init.normal_(self.pe, mean=0.0, std=0.02)  # Small random initialization

    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        seq_len = x.size(1)

        # Add positional encoding to each sequence element
        return x + self.pe[:seq_len, :]  # (B, T, d_model) + (T, d_model)

In [83]:
class Transformer(nn.Module):
    def __init__(self, num_classes=10, em_dim=768, reduced_dim=256, n_heads=4, n_layers=4, max_len=5000):
        super().__init__()

        self.pos_encoder = LearnablePositionalEncoding(d_model=1024, max_len=max_len)

        # Convolutional feature extractor on full em_dim
        # self.conv = nn.Sequential(
        #     nn.Conv1d(em_dim, em_dim, kernel_size=7, padding="same", groups=em_dim),
        #     nn.BatchNorm1d(em_dim),
        #     nn.ReLU(),

        #     nn.Conv1d(em_dim, em_dim, kernel_size=5, padding="same", groups=em_dim),
        #     nn.BatchNorm1d(em_dim),
        #     nn.ReLU(),

        #     nn.Conv1d(em_dim, em_dim, kernel_size=3, padding="same", groups=em_dim),
        #     nn.BatchNorm1d(em_dim),
        #     nn.ReLU(),
        # )

        self.proj = nn.Sequential(
                nn.Linear(em_dim, 1024),
                nn.GELU(),
                nn.Linear(1024, 1024)
            )

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=1024,
            nhead=n_heads,
            dropout=0.2,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(1024, reduced_dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(reduced_dim, num_classes)
        )

    def forward(self, x, lengths=None):
        # x: (B, T, em_dim)
        # x = x.transpose(1, 2)      # (B, em_dim, T)
        # x = self.conv(x)           # (B, em_dim, T)
        # x = x.transpose(1, 2)      # (B, T, em_dim)

        x = self.proj(x)        # (B, T, reduced_dim)
        x = self.pos_encoder(x)    # (B, T, reduced_dim)
        x = self.transformer(x)    # (B, T, reduced_dim)
        return self.classifier(x)  # (B, T, num_classes)

In [84]:
class SEDLightningModule(pl.LightningModule):
    def __init__(self, classes, lr=1e-4, threshold=0.5, dropout=0.3, pos_weight=None):
        super().__init__()
        # Core model
        self.model = Transformer()

        self.classes = classes
        self.lr = lr
        self.threshold = threshold

        # Store pos_weight as a buffer if it's a tensor
        if pos_weight is not None and isinstance(pos_weight, torch.Tensor):
            self.register_buffer("pos_weight_tensor", pos_weight)
        else:
            self.pos_weight_tensor = pos_weight

        # Initialize criterion without pos_weight (we'll handle it in training_step)
        self.criterion = nn.BCEWithLogitsLoss(reduction='none')

        self._val_preds = {c: [] for c in self.classes}
        self._val_targets = {c: [] for c in self.classes}
        self._val_filenames = []

    def forward(self, x, lengths):
        return self.model(x, lengths)

    def predict_step(self, batch, batch_idx):
        # unpack batch (with or without labels)
        if len(batch) == 4:
            X, _, lengths, filenames = batch
        else:
            X, lengths, filenames = batch

        # 1) raw logits → probs → binary preds
        logits = self.model(X, lengths)
        probs = torch.sigmoid(logits)
        preds = (probs > self.threshold).int()  # (B, T_max, C)

        # 2) remove padding
        batch_preds = [preds[b, :lengths[b]].cpu()
                      for b in range(X.size(0))]

        return {"filenames": filenames, "preds": batch_preds}

    def training_step(self, batch, batch_idx):
        X, Y, lengths, _ = batch
        logits = self(X, lengths)

        # Create criterion with current pos_weight
        criterion = nn.BCEWithLogitsLoss(
            pos_weight=self.pos_weight_tensor.to(logits.device) if self.pos_weight_tensor is not None else None,
            reduction='none'
        )

        loss_raw = criterion(logits, Y.float())
        mask = torch.arange(logits.size(1), device=logits.device)[None, :] < lengths[:, None]
        mask = mask.unsqueeze(-1).float()
        loss = (loss_raw * mask).sum() / mask.sum()

        self.log('train/loss', loss, prog_bar=True, on_step=True, on_epoch=True, batch_size=X.size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        return self.process_validation_step(batch, batch_idx)

    def on_validation_epoch_end(self):
        return self.process_validation_epoch_end()

    def test_step(self, batch, batch_idx):
        return self.process_test_step(batch, batch_idx)

    def on_test_epoch_end(self):
        return self.process_test_epoch_end()

    def configure_optimizers(self):
        decay = []
        no_decay = []

        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'bias' in name or 'bn' in name or 'norm' in name:
                    no_decay.append(param)
                else:
                    decay.append(param)

        optimizer_grouped = [
            {'params': decay, 'weight_decay': 1e-4},
            {'params': no_decay, 'weight_decay': 0.0}
        ]

        return torch.optim.AdamW(optimizer_grouped, lr=self.lr)

    @classmethod
    def load_from_checkpoint(cls, checkpoint_path, **kwargs):
        # First load the checkpoint without strict state_dict checking
        model = super().load_from_checkpoint(checkpoint_path, strict=False, **kwargs)

        # Manually handle the pos_weight if it exists in the checkpoint
        checkpoint = torch.load(checkpoint_path)
        if 'criterion.pos_weight' in checkpoint['state_dict']:
            pos_weight = checkpoint['state_dict']['criterion.pos_weight']
            if isinstance(pos_weight, torch.Tensor):
                model.register_buffer("pos_weight_tensor", pos_weight)
            else:
                model.pos_weight_tensor = pos_weight

        return model

In [85]:
def process_training_step(self, batch, batch_idx):
    X, Y, lengths, _ = batch      # X: (B, T, D), Y: (B, T, C), lengths: (B,)
    logits = self(X,lengths)     # calls self.forward, results in logits of shape (B, T, C)


    # raw per-element loss
    loss_raw = self.criterion(logits, Y.float())  # (B, T, C)

    # build mask to zero out padded frames
    mask = torch.arange(logits.size(1), device=logits.device)[None, :] < lengths[:, None]
    mask = mask.unsqueeze(-1).float()     # (B, T, 1)

    # apply mask and average
    loss = (loss_raw * mask).sum() / mask.sum()

    self.log('train/loss', loss, prog_bar=True, on_step=True, on_epoch=True, batch_size=X.size(0))
    return loss

# Bind it to the LightningModule
SEDLightningModule.process_training_step = process_training_step

In [86]:
def process_validation_step(self, batch, batch_idx):
    X, Y, lengths, filenames = batch      # X: (B, T, D), Y: (B, T, C), lengths: (B,)
    logits = self(X,lengths)             # calls self.forward, results in logits of shape (B, T, C)

    # Determine logging prefix
    prefix = "test" if self.trainer.testing else "val"

    # compute masked BCE loss
    loss_raw = self.criterion(logits, Y.float())     # (B, T, C)
    mask = torch.arange(logits.size(1), device=logits.device)[None, :] < lengths[:, None]
    mask = mask.unsqueeze(-1).float()                # (B, T, 1)
    loss = (loss_raw * mask).sum() / mask.sum()

    self.log(f'{prefix}/loss', loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=X.size(0))

    # store frame-wise preds & targets for epoch_end
    # frame-wise logits are thresholded here
    preds = (torch.sigmoid(logits) > self.threshold).long()     # (B, T, C)
    self._val_filenames.extend(filenames)

    for i, c in enumerate(self.classes):
        for b in range(X.size(0)):
            T = lengths[b]
            self._val_preds[c].append(preds[b, :T, i])
            self._val_targets[c].append(Y[b, :T, i])

    return loss

# Bind it to the LightningModule
SEDLightningModule.process_validation_step = process_validation_step

In [87]:
def process_validation_epoch_end(self):
    # Determine current mode
    prefix = "test" if self.trainer.testing else "val"

    # --- 1) Convert buffered tensors to NumPy arrays ---
    preds_numpy = {
        cls: [p.cpu().numpy() for p in self._val_preds[cls]]
        for cls in self.classes
    }
    targets_numpy = {
        cls: [t.cpu().numpy() for t in self._val_targets[cls]]
        for cls in self.classes
    }

    # --- 2) Frame‐level metrics ---
    frame_metrics = evaluate_classifiers(
        classes=self.classes,
        Y_val=targets_numpy,
        Y_pred=preds_numpy
    )

    for cls, m in frame_metrics.items():
        self.log(f'{prefix}/{cls}_bacc',     m['balanced_accuracy'])
        self.log(f'{prefix}/{cls}_precision',m['precision'])
        self.log(f'{prefix}/{cls}_recall',   m['recall'])
        self.log(f'{prefix}/{cls}_f1',       m['f1'])

    # --- 3) Segment‐level cost ---
    total_cost, cost_breakdown = evaluate_cost(
        val_files=self._val_filenames,
        dataset_path=DATASET_PATH,
        classes=self.classes,
        Y_pred=preds_numpy
    )
    self.log(f'{prefix}/total_cost', total_cost, prog_bar=True)
    for cls, cls_cost in cost_breakdown.items():
        self.log(f"{prefix}/cost/{cls}", cls_cost["cost"], prog_bar=False)

    # --- 4) Clear buffers ---
    self._val_preds     = {c: [] for c in self.classes}
    self._val_targets   = {c: [] for c in self.classes}
    self._val_filenames = []


SEDLightningModule.process_validation_epoch_end = process_validation_epoch_end

In [88]:
# After you’ve attached the validation logic, simply reuse it for testing:

# Reuse the same step‐logic
SEDLightningModule.process_test_step = SEDLightningModule.process_validation_step

# Reuse the same epoch‐end logic
SEDLightningModule.process_test_epoch_end = SEDLightningModule.process_validation_epoch_end

In [89]:
hparams = dict(
    input_dim      = X_batch.shape[-1],   # Feature dimension (e.g., 64 for Mel)
    num_classes    = Y_batch.shape[-1],   # Number of output classes
    dropout        = 0.3,                 # Dropout
    lr             = 1e-5,                # Learning rate
    pos_weight     = torch.tensor(pos_weights),          # To handle class imbalance
    batch_size     = 32,
    max_epochs     = 100,
    threshold      = 0.5,
    patience       = 10                    # For early stopping
)

In [90]:
checkpoint_cb = ModelCheckpoint(
    monitor    = "val/total_cost",   # minimize cost
    mode       = "min",
    save_top_k = 1,                  # save top model on validation data
    filename   = "best-{epoch:02d}"
)

early_stop_cb = EarlyStopping(
    monitor  = "val/total_cost",
    mode     = "min",
    patience = hparams["patience"],
    verbose  = True
)

lr_monitor_cb = LearningRateMonitor(logging_interval="epoch")

# RichProgressBar generates minimal output compared to 'tqdm'
progress_bar_cb = RichProgressBar()

callbacks = [checkpoint_cb, early_stop_cb, lr_monitor_cb, progress_bar_cb]

In [91]:
wandb_logger = WandbLogger(
    project     = "mlpc2025",
    name        = "1l-projectiontransformer-786 -FINAL VERSION(2)",
    config      = hparams
)

In [92]:
dm = SEDDataModule(
    X_train=X_train, Y_train=Y_train, train_files=train_files,
    X_val=X_val,     Y_val=Y_val,     val_files=val_files,
    X_test=X_test,   Y_test=Y_test,   test_files=test_files,
    classes=TARGET_CLASSES,
    batch_size=hparams["batch_size"],
    num_workers=2
)

model = SEDLightningModule(
    #input_dim    = hparams["input_dim"],
    classes      = TARGET_CLASSES,
    #cnn_channels = hparams["cnn_channels"],

    dropout      = hparams["dropout"],
    lr           = hparams["lr"],
    threshold    = hparams["threshold"],
    pos_weight   = hparams["pos_weight"]
)

trainer = pl.Trainer(
    accelerator             = "gpu",
    devices                 = 1,
    max_epochs              = hparams["max_epochs"],
    callbacks               = callbacks,
    logger                  = wandb_logger,
    log_every_n_steps       = 10,
    deterministic           = True,
    check_val_every_n_epoch = 1,
    num_sanity_val_steps    = 0,
    precision               = 16
)

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [93]:
trainer.fit(model, datamodule=dm)

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./mlpc2025/23ib56xo/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved. New best score: 61.784
INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved by 15.952 >= min_delta = 0.0. New best score: 45.831
INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved by 5.997 >= min_delta = 0.0. New best score: 39.835
INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved by 1.002 >= min_delta = 0.0. New best score: 38.833
INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved by 0.312 >= min_delta = 0.0. New best score: 38.521
INFO:pytorch_lightning.callbacks.early_stopping:Metric val/total_cost improved by 1.223 >= min_delta = 0.0. New best score: 37.299
INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val/total_cost did not improve in the last 10 records. Best score: 37.299. Signaling Trainer to stop.


In [94]:
 test_results = trainer.test(model, datamodule=dm, ckpt_path="best")

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at ./mlpc2025/23ib56xo/checkpoints/best-epoch=05-v1.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at ./mlpc2025/23ib56xo/checkpoints/best-epoch=05-v1.ckpt


Output()

**MODEL PREDICTIONS ON UNSEEN DATASET**

In [95]:
# First make sure you have these variables defined (from your training code)
classes = TARGET_CLASSES
threshold = 0.5  # Your threshold value
lr = 1e-5  # Your learning rate

# Load the checkpoint with the updated class
checkpoint_path = trainer.checkpoint_callback.best_model_path
if os.path.exists(checkpoint_path):
    model = SEDLightningModule.load_from_checkpoint(
        checkpoint_path,
        classes=classes,
        lr=lr,
        threshold=threshold,
        pos_weight=torch.tensor(pos_weights)  # Make sure to pass the same pos_weights
    )
else:
    print("Checkpoint not found at:", checkpoint_path)

In [96]:
def predict_dataset(
    model: pl.LightningModule,
    loader: DataLoader
) -> dict[str, dict[str, np.ndarray]]:
    """
    Runs trainer.predict() on `loader` and returns:
      preds_by_file[filename][class] = 1D NumPy array of frame‐wise {0,1}.
    """
    trainer = pl.Trainer(accelerator="auto", devices=1)
    outputs = trainer.predict(model, dataloaders=loader)

    # flatten into lists
    all_preds = {c: [] for c in model.classes}
    all_files = []
    for batch_out in outputs:
        for fname, pred in zip(batch_out["filenames"], batch_out["preds"]):
            all_files.append(fname)
            arr = pred.numpy()  # shape (T_i, C)
            for i, cls in enumerate(model.classes):
                all_preds[cls].append(arr[:, i])

    # repackage into preds_by_file
    preds_by_file: dict[str, dict[str, np.ndarray]] = {}
    for idx, fname in enumerate(all_files):
        preds_by_file.setdefault(fname, {})
        for cls in model.classes:
            preds_by_file[fname][cls] = all_preds[cls][idx]

    return preds_by_file

In [97]:
def segment_and_save(
    preds_by_file: dict[str, dict[str, np.ndarray]],
    class_names: list[str],
    dataset_path: str,
    out_csv: str,
    compute_cost: bool = False,
    test_files: list[str] = None,
) -> pd.DataFrame:
    """
    1) Build segment‐level DataFrame
    2) Sanity‐check with check_dataframe()
    3) (optional) compute & print cost if val_files is provided
    4) save CSV to out_csv
    """
    # 1) aggregate predictions using the function provided in compute_cost.py
    pred_df = get_segment_prediction_df(
        predictions = preds_by_file,
        class_names = class_names
    )

    # 2) sanity‐check (from compute_cost.py)
    check_dataframe(pred_df, dataset_path)

    # 3) cost (optional), for sanity check on our custom test split
    if compute_cost and test_files is not None:
        gt_df = get_ground_truth_df(test_files, dataset_path) # from compute_cost.py
        total, breakdown = total_cost(pred_df, gt_df) # from compute_cost.py
        print(f"\nTotal cost: {total:.4f}")

        gt_csv = os.path.splitext(out_csv)[0] + "_ground_truth.csv"
        gt_df.to_csv(gt_csv, index=False)
        print(f"Saved ground truth segments to {gt_csv}")

    # 4) save
    pred_df.to_csv(out_csv, index=False)
    print(f"Saved segment predictions to {out_csv}")

    return pred_df

In [98]:
# 2) CUSTOMER SET (no labels → compute_cost=False)
customer_files = CUSTOMER_METADATA["filename"].unique()
X_cust, _ = read_files(customer_files, TARGET_CLASSES,
                       features_dir=CUSTOMER_AUDIO_FEATURES_DIR,
                       labels_dir=None)
cust_dataset = SequenceDataset(X_cust, None, TARGET_CLASSES, customer_files)
cust_loader  = DataLoader(cust_dataset, batch_size=8, collate_fn=collate_fn)

cust_preds = predict_dataset(model, cust_loader)
segment_and_save(
    preds_by_file = cust_preds,
    class_names   = TARGET_CLASSES,
    dataset_path  = CUSTOMER_DATASET_PATH,
    out_csv       = "customer_predictions.csv",
    compute_cost  = False,  # can't compute on customer's secret test set
)

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

Saved segment predictions to customer_predictions.csv


Unnamed: 0,filename,onset,Speech,Shout,Chainsaw,Jackhammer,Lawn Mower,Power Drill,Dog Bark,Rooster Crow,Horn Honk,Siren
0,386984.mp3,0.0,1,1,0,0,0,0,0,0,0,0
1,386984.mp3,1.2,1,1,0,0,0,0,0,0,0,0
2,386984.mp3,2.4,1,1,0,0,0,0,0,0,0,0
3,386984.mp3,3.6,1,1,0,0,0,0,0,0,0,0
4,386984.mp3,4.8,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
52186,507531.mp3,21.6,0,0,0,0,0,0,0,0,0,0
52187,507531.mp3,22.8,0,0,0,0,0,0,0,0,0,0
52188,507531.mp3,24.0,0,0,0,0,0,0,0,0,0,0
52189,507531.mp3,25.2,0,0,0,0,0,0,0,0,0,0
