In [2]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import sklearn
import ast

In [41]:
# Root directory of the data set. It can be overridden with the
# MLPC_DATA_SET_PATH environment variable so the notebook is not tied to one
# machine; the original local path is kept as the fallback.
DATA_SET_PATH = os.environ.get(
    "MLPC_DATA_SET_PATH",
    "C:/Users/simon/Downloads/MLPC2025_classification",
)

# The "filename" column of metadata.csv provides the file names used below.
metadata = pd.read_csv(os.path.join(DATA_SET_PATH, 'metadata.csv'))
files = metadata["filename"]

# Sub-directories of DATA_SET_PATH that hold the per-file .npz archives.
features_dir = 'audio_features'
labels_dir = 'labels'
# Class names; they are also used as keys when reading the label archives.
categories = ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow',
        'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip',
        'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh',
        'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill',
        'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat',
        'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck',
        'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']
print(f"We have {len(files)} training files")
print(f"We have {len(categories)} labels")

We have 8230 training files
We have 58 labels


In [8]:
# Function that reduces the labels of each frame to a single label per frame
def aggregate_labels(file_labels, rng=None):
    """Collapse the label values of every frame into one label per frame.

    Parameters
    ----------
    file_labels : iterable of sequences
        One sequence of label values per frame (presumably one value per
        annotator -- TODO confirm against the label files).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness used when the values of a frame disagree. Pass a
        seeded generator to get reproducible labels. When omitted, NumPy's
        global random state is used, which is the original behavior.

    Returns
    -------
    list of list
        One single-element list per frame: ``[0]`` when the values sum to
        zero, ``[1]`` when every value is non-zero, otherwise a list holding
        one value drawn at random from the frame's values.
    """
    chooser = np.random if rng is None else rng
    aggregated = []
    for frame_labels in file_labels:
        if sum(frame_labels) == 0:
            aggregated.append([0])
        elif np.count_nonzero(frame_labels) == len(frame_labels):
            aggregated.append([1])
        else:
            # The annotators don't agree on the label: pick one of their votes
            # at random, so the majority opinion is the most likely outcome.
            aggregated.append([chooser.choice(frame_labels)])
    return aggregated

In [9]:
import itertools
def read_files(file_names, num_to_read=1000):
    """Load embeddings and aggregated labels for the first files of a split.

    Parameters
    ----------
    file_names : sequence of str
        File names to load. The part before the first '.' of each name is
        used to locate ``<name>.npz`` in the features directory and
        ``<name>_labels.npz`` in the labels directory.
    num_to_read : int, optional
        Only the first ``num_to_read`` entries of ``file_names`` are
        considered, to keep processing time manageable.

    Returns
    -------
    X_train : numpy.ndarray
        The "embeddings" arrays of all loaded files, concatenated with
        ``np.concatenate`` (i.e. along the first axis).
    Y_train : dict
        Maps every entry of ``categories`` to a flat list of the labels
        produced by ``aggregate_labels`` for all loaded files.

    Raises
    ------
    ValueError
        If none of the considered files has a feature archive on disk.
    """
    X_train = []
    Y_train = {c: [] for c in categories}
    for f in file_names[:num_to_read]: #we are not loading the entire dataset due to processing time
        stem = f.split('.')[0]
        features_path = os.path.join(DATA_SET_PATH, features_dir, stem + '.npz')
        if not os.path.exists(features_path):
            # Files without a feature archive are skipped.
            continue
        # np.load returns an open archive for .npz files; the context manager
        # closes it so file handles are not leaked across thousands of files.
        with np.load(features_path) as feature_archive:
            X_train.append(feature_archive["embeddings"])
        labels_path = os.path.join(DATA_SET_PATH, labels_dir, stem + '_labels.npz')
        with np.load(labels_path) as label_archive:
            for c in categories:
                frame_labels = aggregate_labels(label_archive[c])
                # aggregate_labels yields single-element lists; flatten them.
                Y_train[c].extend(itertools.chain.from_iterable(frame_labels))
    if not X_train:
        raise ValueError("No feature files were found for the requested file names")
    X_train = np.concatenate(X_train)
    return X_train, Y_train

In [22]:
# Splitting at the frame level would distribute highly correlated frames
# across all splits. To avoid this kind of data leakage, the split is
# performed at the file level instead.

nf = len(files)
# Seeded shuffle of all file names so the split is the same on every run.
sampled_files = files.sample(n=nf, random_state=0)

# First 70% of the shuffled files -> train, next 20% -> validation, rest -> test.
train_files, val_files, test_files = (
    sampled_files.iloc[:int(nf*0.7)],
    sampled_files.iloc[int(nf*0.7):int(nf*0.9)],
    sampled_files.iloc[int(nf*0.9):],
)

# X_train, Y_train = read_files(train_files, 500)

In [25]:
# Calculate the class distribution of every split: the sum of all label values
# per class, normalized by the total over all classes.
all_counts = []
for name, split in zip(["Train", "Val", "Test"],[train_files, val_files, test_files]):
    frame_counts = np.zeros(len(categories), dtype=np.int64)
    for f in split:
        labels_path = os.path.join(DATA_SET_PATH, labels_dir, f.split('.')[0] + '_labels.npz')
        # Close each archive after reading so file handles are not leaked.
        with np.load(labels_path) as data:
            # Read the arrays by category name so the counts line up with the
            # `categories` index below regardless of the order of keys stored
            # in the archive.
            frame_counts += np.array([data[label].sum() for label in categories], dtype=np.int64)
    frame_counts = pd.Series(frame_counts, index=categories)
    normalized_counts = frame_counts / frame_counts.sum()
    all_counts.append(normalized_counts.copy())
    # plt.bar(categories, normalized_counts)
    # plt.show()
    print(f"Split {name} has the following class distributions:\n{normalized_counts} ")
    print("-" * 100)

# Per-class differences between the normalized distributions of the splits.
diff_train_val = np.array(all_counts[0]) - np.array(all_counts[1])
diff_train_test = np.array(all_counts[0]) - np.array(all_counts[2])
diff_val_test = np.array(all_counts[1]) - np.array(all_counts[2])
print("-" * 100)
# Each line reports its own difference (previously all three printed diff_train_val).
print(f"Difference Train-Val {diff_train_val}")
print(f"Difference Train-Test {diff_train_test}")
print(f"Difference Val-Test {diff_val_test}")

Split Train has the following class distributions:
Airplane            0.022927
Alarm               0.008698
Beep/Bleep          0.012637
Bell                0.027412
Bicycle             0.008830
Bird Chirp          0.066897
Bus                 0.015019
Car                 0.049737
Cat Meow            0.003383
Chainsaw            0.007219
Clapping            0.016834
Cough               0.002985
Cow Moo             0.003697
Cowbell             0.004202
Crying              0.009846
Dog Bark            0.016947
Doorbell            0.000915
Drip                0.015854
Drums               0.020464
Fire                0.016984
Footsteps           0.023168
Guitar              0.029611
Hammer              0.009156
Helicopter          0.009925
Hiccup              0.000519
Horn Honk           0.012135
Horse Neigh         0.001357
Insect Buzz         0.021828
Jackhammer          0.007493
Laughter            0.015968
Lawn Mower          0.006745
Motorcycle          0.021571
Piano               0

In [39]:
# Largest per-class difference between the validation and test distributions.
print(max(diff_val_test))
# NOTE(review): the maximum above is taken over the val-test differences while
# this index is taken over the train-test differences -- confirm this is intended.
print(np.argmax(diff_train_test))
# Least frequent class in the test split. The index is computed here instead of
# being hard-coded, so the printed name stays correct if the split changes.
rarest_test_idx = int(np.argmin(all_counts[2]))
print(categories[rarest_test_idx])
print(min(all_counts[2]))
print(rarest_test_idx)

0.01695079248460398
5
Sneeze
0.0002297233942803563
45
