# Label counting to determine dataset size

## Label mapping from ontology to datasets


In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


2025-12-08 22:47:17.523840: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Setting up embeddings

In [3]:
model = SentenceTransformer("google/embeddinggemma-300m")

def embed_texts(texts)-> np.ndarray:
    vectors = model.encode(
        texts, normalize_embeddings=False
    ).astype('float32')
    # Normalize the vectors to unit length
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    
    return vectors

def build_index(texts: list[str]) -> faiss.IndexFlatIP:
    vectors = embed_texts(texts)
    dimension = vectors.shape[1]
    
    index = faiss.IndexFlatIP(dimension)#
    
    label_vectors = vectors.astype('float32')
    index.add(label_vectors)
    
    return index

Load in ontology labels

In [4]:
from rdflib.namespace import OWL, RDFS
import rdflib

g = rdflib.Graph()
g.parse("../../anthropogenic_ontology.ttl", format="turtle")

BASE = ("http://www.semanticweb.org/dbotteld/ontologies/2025/6/sound_ontology#")

labels = []

query = """
SELECT ?cls ?label
WHERE {
  ?cls a owl:Class .
  FILTER NOT EXISTS { ?sub rdfs:subClassOf ?cls . }
  OPTIONAL { ?cls rdfs:label ?label . }
}
"""

no_label_list = ["AudioSet"]

for row in g.query(query, initNs={"owl": OWL, "rdfs": RDFS, "base": BASE}):
    label = row.label if row.label else row.cls
    if str(label) not in no_label_list:  
      labels.append(str(label))

label_vectors = embed_texts(labels)
labels

['measurement vehicle noise',
 'shout and scream',
 'rooster',
 'amplified music and electronic playback',
 'laughter',
 'boat',
 'Acoustical Vehicle Alerting System',
 'tram',
 'explosion',
 'motorcycle',
 'bird call',
 'bus',
 'bell',
 'sawing, ginding',
 'dawn chorus',
 'life instrumental music, few instruments',
 'jackhammer, breaker',
 'noise policy field sound classification',
 'dog bark',
 'vehicle horn',
 'drone',
 'bird whistle',
 'roll',
 'footsteps',
 'crane, buldozer',
 'impact',
 'scrape',
 'reverse beeper',
 'drill',
 'insect chorus',
 'children playing',
 'alarm',
 'caugh',
 'spraying water',
 'buzzing bees and flies',
 'siren',
 'truck',
 'plane',
 'train',
 'vocal music',
 'horeshoes',
 'aeolian tones',
 'poultry',
 'car',
 'speech',
 'songbird song',
 'pile-driver',
 'helicopter',
 'chainsaw',
 'hammer',
 'machete',
 'motor',
 'siren',
 'tractor',
 'turbine',
 'whistling',
 'thunder',
 'flowing water',
 'rustling leaves',
 'microphone wind noise',
 'industry hum',
 'c

### Similarity functions

In [5]:
def build_similarity_dataframe(
    query_labels: list[str],
    dataset_labels: list[str],
    top_k: int = 3
) -> pd.DataFrame:
    """
    Build a DataFrame with similarity scores between ontology labels and dataset labels.
    
    Args:
        query_labels: List of ontology labels to query
        dataset_labels: List of dataset labels to build the index from
        top_k: Number of top similar labels to retrieve
    
    Returns:
        DataFrame with columns: ontology_label, dataset_labels, dataset_indices, similarity_scores
    """
    # Build index from dataset labels
    dataset_vectors = embed_texts(dataset_labels)
    dimension = dataset_vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(dataset_vectors)
    
    mapping_data = []
    
    for ontology_label in query_labels:
        # Embed and normalize the ontology label
        query_vector = embed_texts([ontology_label])
        
        # Search for top-k similar dataset labels
        scores, indices = index.search(query_vector, top_k)
        
        # Get matched labels and round scores
        matched_labels = [dataset_labels[i] for i in indices[0]]
        rounded_scores = [float(round(float(score), 4)) for score in scores[0]]
        
        mapping_data.append({
            'ontology_label': ontology_label,
            'dataset_labels': matched_labels,
            'dataset_indices': indices.tolist()[0],
            'similarity_scores': rounded_scores
        })
    
    return pd.DataFrame(mapping_data)

## Class of interest 1: Plane

### audioset

In [6]:
mid_to_label_map = pd.read_csv(
    "../../data/metadata/audioset/mid_to_display_name.tsv",
    sep="\t",
    header=None,
    names=["mid", "display_name"],
)
mid_to_label_dict = dict(zip(mid_to_label_map["mid"], mid_to_label_map["display_name"]))

mid_to_label_map.head()

Unnamed: 0,mid,display_name
0,/g/11b630rrvh,Kettle whistle
1,/g/122z_qxw,Firecracker
2,/m/01280g,Wild animals
3,/m/012f08,Motor vehicle (road)
4,/m/012n7d,Ambulance (siren)


In [7]:
audioset_labels = pd.read_csv(
    "../../data/metadata/audioset/audioset_train_strong.tsv", sep="\t"
)
audioset_labels['label'] = audioset_labels['label'].apply(lambda x: mid_to_label_dict[x])
audioset_labels.head()
audioset_labels

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label
0,b0RFKhbpFJA_30000,0.000,10.000,Wind
1,b0RFKhbpFJA_30000,4.753,5.720,"Male speech, man speaking"
2,b0RFKhbpFJA_30000,0.000,10.000,Buzz
3,b0RFKhbpFJA_30000,6.899,7.010,Tick
4,b0RFKhbpFJA_30000,8.534,9.156,Wind noise (microphone)
...,...,...,...,...
934816,cq-vfngNXMc_70000,7.836,8.015,Tick
934817,cq-vfngNXMc_70000,8.226,8.511,Generic impact sounds
934818,cq-vfngNXMc_70000,8.503,8.868,"Male speech, man speaking"
934819,cq-vfngNXMc_70000,9.217,9.624,Generic impact sounds


In [8]:
audioset_label_list = audioset_labels['label'].unique().tolist()

similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=audioset_label_list,
    top_k=3
)
similarity_df

Unnamed: 0,ontology_label,dataset_labels,dataset_indices,similarity_scores
0,measurement vehicle noise,"[Traffic noise, roadway noise, Environmental n...","[131, 39, 80]","[0.6874, 0.6822, 0.6782]"
1,shout and scream,"[Screaming, Shout, Yell]","[138, 15, 406]","[0.8618, 0.8449, 0.8105]"
2,rooster,"[Chicken, rooster, Cluck, Rattle]","[165, 149, 182]","[0.7468, 0.741, 0.7265]"
3,amplified music and electronic playback,"[Music, Electronic tuner, Sound reproduction]","[14, 361, 438]","[0.6163, 0.5818, 0.5673]"
4,laughter,"[Laughter, Giggle, Belly laugh]","[27, 49, 29]","[0.9721, 0.9079, 0.8285]"
...,...,...,...,...
65,rain,"[Rain, Rain on surface, Raindrop]","[70, 237, 371]","[0.9536, 0.785, 0.7682]"
66,road traffic,"[Traffic noise, roadway noise, Motor vehicle (...","[131, 57, 170]","[0.7944, 0.7838, 0.6875]"
67,breaking waves,"[Waves, surf, Breaking, Ocean]","[198, 71, 76]","[0.8071, 0.7745, 0.7326]"
68,falling water,"[Waterfall, Water, Raindrop]","[206, 65, 371]","[0.8236, 0.6955, 0.6695]"


In [9]:
#list all audioset labels found for similarity_df
all_found_labels = set()
for label in similarity_df['dataset_labels']:
    all_found_labels.update(label)
list(all_found_labels)

['Environmental noise',
 'Motor vehicle (road)',
 'Bird',
 'Boat, Water vehicle',
 'Bee, wasp, etc.',
 'Pulleys',
 'Whistling',
 'Human sounds',
 'Waves, surf',
 'Fowl',
 'Boom',
 'Water',
 'Pump (liquid)',
 'Whistle',
 'Hum',
 'Jackhammer',
 'Power tool',
 'Speech',
 'Hammer',
 'Cluck',
 'Dog',
 'Power saw, circular saw, table saw',
 'Chant',
 'Electronic tuner',
 'Car',
 'Cheering',
 'Traffic noise, roadway noise',
 'Chicken, rooster',
 'Yodeling',
 'Ship',
 'Ocean',
 'Noise',
 'Growling',
 'Breaking',
 'Vehicle horn, car horn, honking, toot',
 'Sawing',
 'Vehicle',
 'Engine',
 'Siren',
 'Crow',
 'Choir',
 'Chainsaw',
 'Child speech, kid speaking',
 'Ringing tone, ringback tone',
 'Alarm',
 'Lawn mower',
 'Bicycle, tricycle',
 'Truck',
 'Fly, housefly',
 'Knife',
 'Chain',
 'Buzz',
 'Walk, footsteps',
 'Raindrop',
 'Alarm clock',
 'Helicopter',
 'Stream, river',
 'Screaming',
 'Church bell',
 'Rattle',
 'Doorbell',
 'Chop',
 'Rail transport',
 'Race car, auto racing',
 'Water tap, fa

#### count all labels similar to "Plane"

Labels most similar to "plane":

In [10]:
planes = similarity_df[similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]
planes


['Aircraft', 'Fixed-wing aircraft, airplane', 'Aircraft engine']

In [11]:
strong_audioset_labels_train = pd.read_csv("../../data/metadata/audioset/audioset_train_strong.tsv", sep="\t",header=None,names=["filename", "start_time", "end_time", "label","split", "caption"])
strong_audioset_labels_eval = pd.read_csv("../../data/metadata/audioset/audioset_eval_strong.tsv", sep="\t",header=None,names=["filename", "start_time", "end_time", "label", "split", "caption"])

# Merge the two dataframes into one, keeping source split info and resetting the index
strong_audioset_labels = pd.concat(
    [
        strong_audioset_labels_train.assign(split="train"),
        strong_audioset_labels_eval.assign(split="eval"),
    ],
    ignore_index=True,
)

# quick check
strong_audioset_labels.shape, strong_audioset_labels.head()


  strong_audioset_labels_train = pd.read_csv("../../data/metadata/audioset/audioset_train_strong.tsv", sep="\t",header=None,names=["filename", "start_time", "end_time", "label","split", "caption"])
  strong_audioset_labels_eval = pd.read_csv("../../data/metadata/audioset/audioset_eval_strong.tsv", sep="\t",header=None,names=["filename", "start_time", "end_time", "label", "split", "caption"])


((1074361, 6),
             filename          start_time          end_time       label  split  \
 0         segment_id  start_time_seconds  end_time_seconds       label  train   
 1  b0RFKhbpFJA_30000               0.000            10.000  /m/03m9d0z  train   
 2  b0RFKhbpFJA_30000               4.753             5.720   /m/05zppz  train   
 3  b0RFKhbpFJA_30000               0.000            10.000  /m/07pjwq1  train   
 4  b0RFKhbpFJA_30000               6.899             7.010  /m/07qjznt  train   
 
    caption  
 0      NaN  
 1      NaN  
 2      NaN  
 3      NaN  
 4      NaN  )

In [12]:
# replace the MID codes in 'label' with display names, keeping original MID if no mapping exists
strong_audioset_labels['label'] = strong_audioset_labels['label'].map(mid_to_label_dict).fillna(strong_audioset_labels['label'])
strong_audioset_labels.head()


Unnamed: 0,filename,start_time,end_time,label,split,caption
0,segment_id,start_time_seconds,end_time_seconds,label,train,
1,b0RFKhbpFJA_30000,0.000,10.000,Wind,train,
2,b0RFKhbpFJA_30000,4.753,5.720,"Male speech, man speaking",train,
3,b0RFKhbpFJA_30000,0.000,10.000,Buzz,train,
4,b0RFKhbpFJA_30000,6.899,7.010,Tick,train,


In [13]:
# Count exact matches for each plane-related label
plane_counts = strong_audioset_labels['label'].isin(planes).value_counts().to_dict()
print(plane_counts)

# Also show counts broken down by split (train / eval)
strong_audioset_labels[strong_audioset_labels['label'].isin(planes)].groupby(['label', 'split']).size().unstack(fill_value=0)

{False: 1073249, True: 1112}


split,eval,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Aircraft,84,355
Aircraft engine,87,290
"Fixed-wing aircraft, airplane",53,243


### urban_sound_8k

(no airplane sounds in this dataset, contains only 10 classes)

### Esc-50

Load in labels from ESC-50

In [14]:
esc50_labels = pd.read_csv("../../data/metadata/esc50/esc50.csv")
esc50_labels.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [15]:
unique_esc50_labels = esc50_labels['category'].unique().tolist()
unique_esc50_labels

['dog',
 'chirping_birds',
 'vacuum_cleaner',
 'thunderstorm',
 'door_wood_knock',
 'can_opening',
 'crow',
 'clapping',
 'fireworks',
 'chainsaw',
 'airplane',
 'mouse_click',
 'pouring_water',
 'train',
 'sheep',
 'water_drops',
 'church_bells',
 'clock_alarm',
 'keyboard_typing',
 'wind',
 'footsteps',
 'frog',
 'cow',
 'brushing_teeth',
 'car_horn',
 'crackling_fire',
 'helicopter',
 'drinking_sipping',
 'rain',
 'insects',
 'laughing',
 'hen',
 'engine',
 'breathing',
 'crying_baby',
 'hand_saw',
 'coughing',
 'glass_breaking',
 'snoring',
 'toilet_flush',
 'pig',
 'washing_machine',
 'clock_tick',
 'sneezing',
 'rooster',
 'sea_waves',
 'siren',
 'cat',
 'door_wood_creaks',
 'crickets']

In [16]:
esc_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=unique_esc50_labels,
    top_k=3
)
esc_similarity_df

KeyboardInterrupt: 

Now we have all the information needed to count the number of samples in ESC-50 similar to "plane"

In [None]:
esc_similarity_df[esc_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]

Only "airplane" is similar enough to "plane" since we already have helicopter and engine in our ontology labels.

In [None]:
# count 'airplane' occurrences per fold
esc50_airplane_counts = esc50_labels[esc50_labels['category'] == 'airplane'].groupby('fold').size().sort_index()
esc50_airplane_counts

Unique airplane samples in ESC-50:

In [None]:
# All unique airplane recordings in ESC-50
airplane_rows = esc50_labels[esc50_labels['category'] == 'airplane']
unique_airplane_filenames = airplane_rows['filename'].unique()
len(unique_airplane_filenames)

In [None]:
# get all overlapping airplane samples
esc50_labels[esc50_labels["category"] == "airplane"].shape[0] - len(unique_airplane_filenames)

### FSD50K

In [None]:
fsd_vocab_labels = pd.read_csv("../../data/metadata/fsd50k_labels/vocabulary.csv",header =None,names =['label','mid'])
fsd_vocab_labels.head()

In [None]:
fsd_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=fsd_vocab_labels['label'].tolist(),
    top_k=3
)
fsd_similarity_df

In [None]:
plane_labels = fsd_similarity_df[fsd_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0][0:2]
plane_labels

In [None]:
# concat train and eval labels
fsd_labels_train = pd.read_csv("../../data/metadata/fsd50k_labels/dev.csv")
fsd_labels_dev = pd.read_csv("../../data/metadata/fsd50k_labels/eval.csv")
fsd_labels = pd.concat([fsd_labels_train.assign(split="train"), fsd_labels_dev.assign(split="eval")], ignore_index=True)
fsd_labels.head()

In [None]:
# count 'plane' occurrences in FSD50K by split when any label matches plane_labels
plane_set = set(plane_labels)
fsd_labels['is_plane'] = fsd_labels['labels'].str.split(',').apply(lambda labs: any(l in plane_set for l in labs))

plane_counts_fsd = fsd_labels[fsd_labels['is_plane']].groupby('split').size().sort_index()
plane_counts_fsd

### Captdure (captioned sounds dataset)

(No planes, single sound sources mostly indoor sounds)

### Sounddesc (captioned sounds dataset bbc sound effects)

In [None]:
sounddescs_categories = pd.read_pickle("../../data/metadata/sounddesc/sounddescs_categories.pkl")
sounddescs_categories

Captions for later, specifically for clap embedding models

In [None]:
sounddescs_descriptions = pd.read_pickle("../../data/metadata/sounddesc/sounddescs_descriptions.pkl")
sounddescs_descriptions

In [None]:
# list all unique categories found in values of the dictionary
unique_sounddesc = list(set([item for sublist in sounddescs_categories.values() for item in sublist]))


sounddescs_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=unique_sounddesc,
    top_k=3
)
sounddescs_similarity_df

In [None]:
# similarity for 'plane'
sounddescs_similarity_df[sounddescs_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]
# only first one is really relevant: "Aircraft"

Count number of samples similar to "Aircraft" in Sounddescs

Load in the sounddesc cleaned/grouped splits

In [None]:
grouped_sounddesc_split = pd.DataFrame()
with open("./metadata/splits_sounddesc/group_filtered_split01/test_list.txt") as f:
    test_list = f.read().splitlines()
    grouped_sounddesc_split = pd.DataFrame({
        'filename': test_list,
        'split': 'test'
    })

with open("./metadata/splits_sounddesc/group_filtered_split01/train_list.txt") as f:
    train_list = f.read().splitlines()
    train_df = pd.DataFrame({
        'filename': train_list,
        'split': 'train'
    })
    grouped_sounddesc_split = pd.concat([grouped_sounddesc_split, train_df], ignore_index=True)

with open("./metadata/splits_sounddesc/group_filtered_split01/val_list.txt") as f:
    val_list = f.read().splitlines()
    val_df = pd.DataFrame({
        'filename': val_list,
        'split': 'val'
    })
    grouped_sounddesc_split = pd.concat([grouped_sounddesc_split, val_df], ignore_index=True)

grouped_sounddesc_split.head()

attach the categories to the filenames in the proper grouped split

In [None]:
grouped_sounddesc_split['categories'] = grouped_sounddesc_split['filename'].map(sounddescs_categories)
grouped_sounddesc_split.head()

In [None]:
# Count the number of samples that has "Aircraft" as their category grouped by split

aircraft_counts = grouped_sounddesc_split.explode('categories')
aircraft_counts = aircraft_counts[aircraft_counts['categories'] == 'Aircraft']
aircraft_counts = aircraft_counts.groupby('split').size().sort_index()
aircraft_counts



In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Collect data from each dataset with consistent train/test splits

# AudioSet - combine 'eval' into 'test'
audioset_plane_df = strong_audioset_labels[strong_audioset_labels["label"].isin(planes)]
audioset_split_mapping = {'train': 'train', 'eval': 'test'}
audioset_plane_df['unified_split'] = audioset_plane_df['split'].map(audioset_split_mapping)
audioset_counts = audioset_plane_df.groupby("unified_split").size().to_dict()

# ESC-50 - folds 1-4 as train, fold 5 as test
esc50_airplane_df = esc50_labels[esc50_labels['category'] == 'airplane'].copy()
esc50_airplane_df['unified_split'] = esc50_airplane_df['fold'].apply(lambda x: 'train' if x <= 4 else 'test')
esc50_counts = esc50_airplane_df.groupby('unified_split').size().to_dict()

# FSD50K - 'eval' becomes 'test', 'train' stays 'train'
fsd_plane_df = fsd_labels[fsd_labels['is_plane']].copy()
fsd_split_mapping = {'train': 'train', 'eval': 'test'}
fsd_plane_df['unified_split'] = fsd_plane_df['split'].map(fsd_split_mapping)
fsd_plane_counts = fsd_plane_df.groupby('unified_split').size().to_dict()

# SoundDescs - combine 'val' with 'train', 'test' stays 'test'
sounddesc_aircraft_df = grouped_sounddesc_split.explode('categories')
sounddesc_aircraft_df = sounddesc_aircraft_df[sounddesc_aircraft_df['categories'] == 'Aircraft'].copy()
sounddesc_split_mapping = {'train': 'train', 'val': 'train', 'test': 'test'}
sounddesc_aircraft_df['unified_split'] = sounddesc_aircraft_df['split'].map(sounddesc_split_mapping)
sounddesc_counts = sounddesc_aircraft_df.groupby('unified_split').size().to_dict()

# Create individual bar charts for each dataset
fig_audioset = px.bar(
    x=list(audioset_counts.keys()),
    y=list(audioset_counts.values()),
    title="AudioSet - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(audioset_counts.values()),
)
fig_audioset.update_traces(textposition="inside", textangle=0)
fig_audioset.update_layout(width=600, height=500)
fig_audioset.show()

# ESC-50
fig_esc50 = px.bar(
    x=list(esc50_counts.keys()),
    y=list(esc50_counts.values()),
    title="ESC-50 - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(esc50_counts.values()),
)
fig_esc50.update_layout(width=600, height=500)
fig_esc50.update_traces(textposition="inside", textangle=0)
fig_esc50.show()

# FSD50K
fig_fsd = px.bar(
    x=list(fsd_plane_counts.keys()),
    y=list(fsd_plane_counts.values()),
    title="FSD50K - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(fsd_plane_counts.values()),
)
fig_fsd.update_traces(textposition="inside", textangle=0)
fig_fsd.update_layout(width=600, height=500)
fig_fsd.show()

# SoundDescs
fig_sounddesc = px.bar(
    x=list(sounddesc_counts.keys()),
    y=list(sounddesc_counts.values()),
    title="SoundDescs - Aircraft Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(sounddesc_counts.values()),
)
fig_sounddesc.update_traces(textposition="inside", textangle=0)
fig_sounddesc.update_layout(width=600, height=500)
fig_sounddesc.show()

# Combined comparison - Total samples per dataset
total_samples = {
    "AudioSet": sum(audioset_counts.values()),
    "ESC-50": sum(esc50_counts.values()),
    "FSD50K": sum(fsd_plane_counts.values()),
    "SoundDescs": sum(sounddesc_counts.values()),
}

fig_total = px.bar(
    x=list(total_samples.keys()),
    y=list(total_samples.values()),
    title="Total Airplane/Aircraft Samples Across All Datasets",
    labels={"x": "Dataset", "y": "Total Count"},
    text=list(total_samples.values()),
    color=list(total_samples.keys()),
)
fig_total.update_traces(textposition="inside", textangle=0)
fig_total.update_layout(showlegend=False, width=600, height=500)
fig_total.show()

# Grand total
grand_total = sum(total_samples.values())
print(f"\nGrand Total Airplane Samples: {grand_total}")

# Create a stacked bar chart showing splits across datasets
split_data = []
for dataset, counts_dict in [
    ("AudioSet", audioset_counts),
    ("ESC-50", esc50_counts),
    ("FSD50K", fsd_plane_counts),
    ("SoundDescs", sounddesc_counts),
]:
    for split, count in counts_dict.items():
        split_data.append({"Dataset": dataset, "Split": split, "Count": count})

split_df = pd.DataFrame(split_data)

fig_stacked = px.bar(
    split_df,
    x="Dataset",
    y="Count",
    color="Split",
    title="Airplane Samples Distribution by Dataset and Split",
    barmode="stack",
    text="Count",
)
fig_stacked.update_traces(textposition="inside", textangle=0)
fig_stacked.update_layout(width=500, height=600)
fig_stacked.show()

In [None]:
# Create a pie chart showing overall train/test distribution across all datasets
overall_train = sum([counts_dict.get('train', 0) for counts_dict in [audioset_counts, esc50_counts, fsd_plane_counts, sounddesc_counts]])
overall_test = sum([counts_dict.get('test', 0) for counts_dict in [audioset_counts, esc50_counts, fsd_plane_counts, sounddesc_counts]])

fig_pie = go.Figure(data=[go.Pie(
    labels=['Train', 'Test'],
    values=[overall_train, overall_test],
    text=[overall_train, overall_test],
    textposition='inside',
    textinfo='label+value+percent',
    marker=dict(colors=['#636EFA', '#EF553B'])
)])

fig_pie.update_layout(
    title="Overall Train/Test Distribution Across All Datasets",
    width=600,
    height=500
)
fig_pie.show()

print(f"\nTotal Train Samples: {overall_train}")
print(f"Total Test Samples: {overall_test}")
print(f"Train/Test Ratio: {overall_train/overall_test:.2f}")