In [31]:
import numpy as np
import pandas as pd
import os
import ast

DATA_SET_PATH = "C:/Users/simon/Downloads/MLPC2025_classification"

# Annotation table shipped with the data set.
annotations = pd.read_csv(DATA_SET_PATH + "/annotations.csv")

# Event-level class counts.
# The "categories" cell of each annotation is read from the CSV as text that
# spells out a Python list, e.g. '["class1", "class2"]'. Parse it into a real
# list first, then give every class of an annotation its own row so that
# multi-class annotations contribute one count to each of their classes.
annotations["categories"] = annotations["categories"].map(ast.literal_eval)
exploded_categories = annotations.explode("categories")
labels_count = exploded_categories["categories"].value_counts().sort_index()

# One example label archive and one example feature archive; the key names
# stored in them are reported further below as the label / feature names.
example_label_file = np.load(DATA_SET_PATH + "/labels/14_labels.npz")
example_feature_file = np.load(DATA_SET_PATH + "/audio_features/14.npz")
labels_unique = example_label_file.files
features_unique = example_feature_file.files

# Count frame-level occurrences of every class across all label archives.
#
# Totals are accumulated per label *name* instead of by position, so the
# result does not rely on every archive listing its keys in the same order as
# the example file, and the number of classes comes from labels_unique rather
# than a hard-coded 58.
labels_dir = f"{DATA_SET_PATH}/labels"
frame_totals = dict.fromkeys(labels_unique, 0)
for npz in os.listdir(labels_dir):
    if not npz.endswith(".npz"):
        # Only .npz archives are label files; skip anything else in the folder.
        continue
    # np.load keeps an .npz archive open until it is closed; the context
    # manager releases the handle as soon as this file has been summed.
    with np.load(f"{labels_dir}/{npz}") as data:
        for label in data.files:
            # int() truncates a float sum exactly like the previous int64
            # cast did. A label that is missing from labels_unique raises
            # KeyError instead of being silently added to the wrong class.
            frame_totals[label] += int(data[label].sum())
# dict insertion order is preserved, so the index follows labels_unique.
frame_counts = pd.Series(frame_totals, dtype=np.int64)


# Summary statistics of the per-class counts, first at event (annotation)
# level and then at frame level. Series.std() is the sample standard
# deviation, i.e. the square root of Series.var().
(
    median_samples,
    mean_samples,
    min_samples,
    max_samples,
    std_samples,
) = (
    labels_count.median(),
    labels_count.mean(),
    labels_count.min(),
    labels_count.max(),
    labels_count.std(),
)

(
    median_samples_frames,
    mean_samples_frames,
    min_samples_frames,
    max_samples_frames,
    std_samples_frames,
) = (
    frame_counts.median(),
    frame_counts.mean(),
    frame_counts.min(),
    frame_counts.max(),
    frame_counts.std(),
)


# Print a plain-text report of everything computed above. Sections are
# separated by a 100-character dashed rule. Strings without placeholders are
# ordinary literals (no needless f-prefix); the printed text is unchanged.

# --- columns of the annotation table ---
print(f"Annotations has the following columns:\n{annotations.columns}")
print("-" * 100)

# --- label names found in the example label archive ---
print("In the Classification task description it says the dataset has 54 labels but we actually have 58")
print("The .npz label files have the following labels as keys (=classes):\n", labels_unique)
print("-" * 100)

# --- feature names found in the example feature archive ---
print("Regarding the features I think they are the same as in the data exploration task, so for dimensionality of the features look at the corresponding slides or the notebook i uploaded to github")
print("The .npz files have the following features as keys:\n", features_unique)
print("-" * 100)

# --- event-level class distribution ---
print("Some summary stats for class distributions: ")
print("Median samples:", median_samples)
print("Mean samples:", mean_samples)
print("Min samples:", min_samples)
print("Max samples:", max_samples)
print("Std of samples:", std_samples)
print("\n")
print("Events per class:\n", labels_count)
print("-" * 100)

# --- frame-level class distribution ---
print("Some summary stats for class distributions per frames: ")
print("Median samples:", median_samples_frames)
print("Mean samples:", mean_samples_frames)
print("Min samples:", min_samples_frames)
print("Max samples:", max_samples_frames)
print("Std of samples:", std_samples_frames)
print("\n")
print(f"Frame level samples per class:\n{frame_counts}")

Annotations has the following columns:
Index(['task_id', 'filename', 'annotator', 'text', 'onset', 'offset', 'time',
       'original_caption', 'categories'],
      dtype='object')
----------------------------------------------------------------------------------------------------
In the Classification task description it says the dataset has 54 labels but we actually have 58
The .npz label files have the following labels as keys (=classes):
 ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow', 'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip', 'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh', 'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill', 'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat', 'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Sp