In [17]:
import pathlib
from pathlib import Path
import os
import re

import pandas as pd
import numpy as np

import tensorflow as tf
import keras
from keras import layers, Model, optimizers
import librosa


import scipy
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [18]:
sample_rate = 16000
print(type(sample_rate))

<class 'int'>


In [19]:
# mention absolute import path w.r.t the current directory

base_dir = Path.cwd().parent
base_dir

PosixPath('/Users/AnanyaPal1/Documents/TUD/5th semester/resampling and simulations/anomaly-detection-in-sound-data')

In [20]:
# 3) Parameter (alle als Integer, wo nötig)

sample_rate = 16000
n_fft = 1024
hop_length = 512
n_mels = 128

P = 2                       # Kontext links/rechts
n_context = 2 * P + 1          # 5 Frames
input_dim = n_mels * n_context    # 128 * 5 = 640

train_dir = base_dir / "data" / "fan" / "train"
test_dir  = base_dir / "data" / "fan" / "test"

epochs_ae = 100
batch_size_ae = 512

Compute context stacked log-mel features for the audio files.

In [21]:
def compute_logmel_context(file_path,
                           sample_rate=16000,
                           n_fft=1024,
                           hop_length=512,
                           n_mels=128,
                           P=2):
    """
    Compute context-stacked log-mel features for a single audio file.

    Returns
    -------
    out : np.ndarray of shape (T_eff, n_mels * (2P+1))
        Each row is a flattened (n_mels x (2P+1)) log-mel context window.
        Returns None if the file is too short to form one full context window.
    """
    n_context = 2 * P + 1
    input_dim = n_mels * n_context

    # 1) Load waveform (mono) at target sample rate
    y, sr = librosa.load(file_path, sr=sample_rate, mono=True)

    # 2) Mel spectrogram (power)
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )

    # 3) Convert power -> dB (log scale), reference to max (like ref=np.max)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Ensure shape is (n_mels, T)
    M = np.asarray(mel_db)
    if M.ndim != 2:
        M = M.reshape(n_mels, -1)

    n_frames = M.shape[1]

    # 4) Discard too-short files
    if n_frames < n_context:
        return None

    # 5) Build context-stacked features
    T_eff = n_frames - 2 * P
    out = np.zeros((T_eff, input_dim), dtype=np.float32)

    idx = 0
    for t in range(P, n_frames - P):  # 0-based: centers P .. n_frames-P-1
        win = M[:, (t - P):(t + P + 1)]      # shape: (n_mels, n_context)
        out[idx, :] = win.reshape(-1, order="F")  # match R as.numeric() column-major flatten
        idx += 1

    return out


Load Data and ID

In [22]:
train_files = list(train_dir.rglob("*.wav"))
test_files  = list(test_dir.rglob("*.wav"))

display(f"number of train files:{len(train_files)}")
display(f"number of test files: {len(test_files)}")

'number of train files:3675'

'number of test files: 1875'

In [23]:
type(train_files)

list

In [24]:
def get_id(x):
    return re.sub(r".*id_(..)_.*", r"\1", os.path.basename(x))

train_ids  =  [get_id(p) for p in train_files]
test_ids   = [get_id(p) for p in test_files]
unique_ids = sorted(set(train_ids))

Implement an AE.
Remark: Settings specific to the structure of the AE (hyperparameters, structure) are taken from the benchmark AE method from the following research paper.


In [25]:
def build_model(input_dim):

    input = layers.Input(shape=(input_dim,))

    # Encoder

    x = layers.Dense(128)(input)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    # Bottleneck 8D
    x = layers.Dense(8)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    # Decoder
    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    output = layers.Dense(input_dim)(x)

    model = keras.Model(input, output)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss = "mse"
    )

    return model


In [26]:
print(type(train_ids), np.shape(train_ids))
print(type(test_ids), np.shape(test_ids))
print(train_ids)
print(test_ids)


<class 'list'> (3675,)
<class 'list'> (1875,)
['04', '00', '06', '00', '06', '02', '04', '04', '06', '02', '04', '04', '02', '00', '00', '04', '02', '02', '04', '06', '06', '00', '04', '00', '06', '00', '00', '06', '04', '06', '00', '00', '06', '02', '04', '06', '00', '06', '00', '02', '04', '04', '02', '04', '02', '02', '04', '06', '00', '00', '06', '00', '06', '04', '02', '06', '00', '00', '06', '06', '00', '06', '00', '04', '02', '04', '02', '02', '04', '02', '04', '04', '02', '00', '06', '06', '00', '06', '00', '00', '06', '06', '00', '04', '02', '06', '00', '06', '02', '02', '00', '06', '06', '04', '02', '02', '04', '00', '02', '04', '04', '02', '06', '00', '06', '04', '02', '06', '00', '06', '00', '02', '02', '04', '00', '06', '06', '00', '02', '02', '00', '06', '04', '02', '02', '00', '06', '06', '00', '04', '06', '00', '00', '06', '02', '04', '02', '06', '02', '04', '02', '04', '06', '00', '04', '02', '02', '06', '00', '06', '00', '06', '02', '04', '02', '00', '06', '00', '06',

Per-ID Training and Evaluation

In [27]:
results = []

train_ids = np.array(train_ids)
test_ids = np.array(test_ids)

for id in unique_ids:

    # data per ID
    train_idx = np.where(train_ids == id)[0]
    test_idx = np.where(test_ids == id)[0]

    train_feat = [compute_logmel_context(f) for f in tqdm([train_files[i] for i in train_idx])]
    test_feat  = [compute_logmel_context(f) for f in tqdm([test_files[i]  for i in test_idx])]

    train_feat = [f for f in train_feat if f is not None]
    test_feat = [f for f in test_feat if f is not None]

    if len(train_feat) == 0 or len(test_feat) ==0:
        print("No valid features for ID", id, "will be skipped.")
        continue

    # stack (frames x 640)
    X_train = np.vstack(train_feat)
    X_test = np.vstack(test_feat)
 
    # Clip labels (1 = anomaly, 0 = normal)
    y_test_clip = np.array([
    1 if "anomaly" in os.path.basename(test_files[i]).lower() else 0
    for i in test_idx
    ], dtype=int)

    # number of frames per clip
    frame_counts = [feat.shape[0] for feat in test_feat]   # each feat = frames × 640

    # frame → clip group assignment
    test_groups = np.repeat(np.arange(1, len(frame_counts) + 1), frame_counts)
    
    # build the model and training

    model = build_model(input_dim=input_dim)

    history = model.fit(
        X_train, X_train,
        epochs = epochs_ae,
        batch_size = batch_size_ae,
        shuffle = True,
        verbose = 1
    )

    # reconstruction and scores

    X_pred = model.predict(X_test, batch_size = batch_size_ae)

    frame_scores = np.mean((X_test - X_pred)**2, axis=1)

    # clip-level score = average of frames per clip
    groups = np.unique(test_groups)
    clip_scores = np.array([frame_scores[test_groups==g].mean() for g in groups])


    # AUC - pAUC (DCASE)
    auc_val = roc_auc_score(y_test_clip, clip_scores)

    pauc_val = roc_auc_score(y_test_clip, clip_scores, max_fpr=0.1)

    print("AUC: ", float(auc_val))
    print("pAUC:", float(pauc_val))

    results.append({
        "auc": float(auc_val),
        "pauc": float(pauc_val),
        "scores": clip_scores,
        "labels": y_test_clip
    })

all_aucs = np.array([x["auc"] for x in results])
print("All scores: ", all_aucs)

mean = np.mean(all_aucs)
print("Mean of all AUCs", mean)


100%|██████████| 911/911 [00:03<00:00, 245.32it/s]
100%|██████████| 507/507 [00:02<00:00, 195.38it/s]


Epoch 1/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 235.3242
Epoch 2/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 13.9418
Epoch 3/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 12.2383
Epoch 4/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 11.4044
Epoch 5/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 11.0160
Epoch 6/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.7639
Epoch 7/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 10.5552
Epoch 8/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 10.3723
Epoch 9/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 10.2464
Epoch 10/100
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

100%|██████████| 916/916 [00:03<00:00, 248.56it/s]
100%|██████████| 459/459 [00:02<00:00, 163.86it/s]


Epoch 1/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 178.7704
Epoch 2/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 12.4796
Epoch 3/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 11.5818
Epoch 4/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.9042
Epoch 5/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.4917
Epoch 6/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.2550
Epoch 7/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.0784
Epoch 8/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 9.9442
Epoch 9/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 9.8170
Epoch 10/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s

100%|██████████| 933/933 [00:03<00:00, 251.56it/s]
100%|██████████| 448/448 [00:01<00:00, 238.09it/s]


Epoch 1/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 204.0138
Epoch 2/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 13.3082
Epoch 3/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 12.0776
Epoch 4/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 11.4081
Epoch 5/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.7987
Epoch 6/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.4680
Epoch 7/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.2442
Epoch 8/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 10.1304
Epoch 9/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 10.0274
Epoch 10/100
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

100%|██████████| 915/915 [00:03<00:00, 271.67it/s]
100%|██████████| 461/461 [15:59<00:00,  2.08s/it] 


Epoch 1/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m707s[0m 1s/step - loss: 206.5579
Epoch 2/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 13.3964
Epoch 3/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 12.1185
Epoch 4/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 11.3942
Epoch 5/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.7577
Epoch 6/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.3904
Epoch 7/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 10.1303
Epoch 8/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 9.9340
Epoch 9/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 9.7798
Epoch 10/100
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4