# Label counting to determine dataset size

## Label mapping from ontology to datasets


In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

2025-11-25 22:34:08.233918: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Setting up embeddings

In [3]:
model = SentenceTransformer("google/embeddinggemma-300m")

def embed_texts(texts)-> np.ndarray:
    vectors = model.encode(
        texts, normalize_embeddings=False
    ).astype('float32')
    # Normalize the vectors to unit length
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    
    return vectors

def build_index(texts: list[str]) -> faiss.IndexFlatIP:
    vectors = embed_texts(texts)
    dimension = vectors.shape[1]
    
    index = faiss.IndexFlatIP(dimension)#
    
    label_vectors = vectors.astype('float32')
    index.add(label_vectors)
    
    return index

Load in ontology labels

In [4]:
from rdflib.namespace import OWL, RDFS
import rdflib

g = rdflib.Graph()
g.parse("../../anthropogenic_ontology.ttl", format="turtle")

BASE = ("http://www.semanticweb.org/dbotteld/ontologies/2025/6/sound_ontology#")

labels = []

query = """
SELECT ?cls ?label
WHERE {
  ?cls a owl:Class .
  FILTER NOT EXISTS { ?sub rdfs:subClassOf ?cls . }
  OPTIONAL { ?cls rdfs:label ?label . }
}
"""

no_label_list = ["AudioSet"]

for row in g.query(query, initNs={"owl": OWL, "rdfs": RDFS, "base": BASE}):
    label = row.label if row.label else row.cls
    if str(label) not in no_label_list:  
      labels.append(str(label))

label_vectors = embed_texts(labels)
labels

['measurement vehicle noise',
 'shout and scream',
 'rooster',
 'amplified music and electronic playback',
 'laughter',
 'boat',
 'Acoustical Vehicle Alerting System',
 'tram',
 'explosion',
 'motorcycle',
 'bird call',
 'bus',
 'bell',
 'sawing, ginding',
 'dawn chorus',
 'life instrumental music, few instruments',
 'jackhammer, breaker',
 'noise policy field sound classification',
 'dog bark',
 'vehicle horn',
 'drone',
 'bird whistle',
 'roll',
 'footsteps',
 'crane, buldozer',
 'impact',
 'scrape',
 'reverse beeper',
 'drill',
 'insect chorus',
 'children playing',
 'alarm',
 'caugh',
 'spraying water',
 'buzzing bees and flies',
 'siren',
 'truck',
 'plane',
 'train',
 'vocal music',
 'horeshoes',
 'aeolian tones',
 'poultry',
 'car',
 'speech',
 'songbird song',
 'pile-driver',
 'helicopter',
 'chainsaw',
 'hammer',
 'machete',
 'motor',
 'siren',
 'tractor',
 'turbine',
 'whistling',
 'thunder',
 'flowing water',
 'rustling leaves',
 'microphone wind noise',
 'industry hum',
 'c

### Similarity functions

In [5]:
def build_similarity_dataframe(
    query_labels: list[str],
    dataset_labels: list[str],
    top_k: int = 3
) -> pd.DataFrame:
    """
    Build a DataFrame with similarity scores between ontology labels and dataset labels.
    
    Args:
        query_labels: List of ontology labels to query
        dataset_labels: List of dataset labels to build the index from
        top_k: Number of top similar labels to retrieve
    
    Returns:
        DataFrame with columns: ontology_label, dataset_labels, dataset_indices, similarity_scores
    """
    # Build index from dataset labels
    dataset_vectors = embed_texts(dataset_labels)
    dimension = dataset_vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(dataset_vectors)
    
    mapping_data = []
    
    for ontology_label in query_labels:
        # Embed and normalize the ontology label
        query_vector = embed_texts([ontology_label])
        
        # Search for top-k similar dataset labels
        scores, indices = index.search(query_vector, top_k)
        
        # Get matched labels and round scores
        matched_labels = [dataset_labels[i] for i in indices[0]]
        rounded_scores = [float(round(float(score), 4)) for score in scores[0]]
        
        mapping_data.append({
            'ontology_label': ontology_label,
            'dataset_labels': matched_labels,
            'dataset_indices': indices.tolist()[0],
            'similarity_scores': rounded_scores
        })
    
    return pd.DataFrame(mapping_data)

## Class of interest 1: Plane

### audioset

In [6]:
audioset_labels = pd.read_csv("./metadata/class_labels_audioset.csv")
audioset_label_list = audioset_labels['display_name'].tolist()

similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=audioset_label_list,
    top_k=3
)
similarity_df

Unnamed: 0,ontology_label,dataset_labels,dataset_indices,similarity_scores
0,measurement vehicle noise,"[Traffic noise, roadway noise, Environmental n...","[327, 514, 513]","[0.6874, 0.6822, 0.6782]"
1,shout and scream,"[Screaming, Shout, Yell]","[14, 8, 11]","[0.8618, 0.8449, 0.8105]"
2,rooster,"[Chicken, rooster, Cluck, Rattle]","[99, 100, 135]","[0.7468, 0.741, 0.7265]"
3,amplified music and electronic playback,"[Electronic music, Electronic dance music, Mus...","[239, 245, 138]","[0.6968, 0.6422, 0.6175]"
4,laughter,"[Laughter, Giggle, Belly laugh]","[16, 18, 20]","[0.9721, 0.9079, 0.8285]"
...,...,...,...,...
65,rain,"[Rain, Rain on surface, Raindrop]","[289, 291, 290]","[0.9536, 0.785, 0.7682]"
66,road traffic,"[Traffic noise, roadway noise, Motor vehicle (...","[327, 306, 300]","[0.7944, 0.7838, 0.6875]"
67,breaking waves,"[Waves, surf, Breaking, Ocean]","[295, 470, 294]","[0.8071, 0.7745, 0.7326]"
68,falling water,"[Waterfall, Water, Raindrop]","[293, 288, 290]","[0.8236, 0.6955, 0.6695]"


#### count all labels similar to "Plane"

Labels most similar to "plane":

In [7]:
planes = similarity_df[similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]
planes

['Aircraft', 'Fixed-wing aircraft, airplane', 'Aircraft engine']

In [8]:
strong_audioset_labels_train = pd.read_csv("./metadata/audioset_train_strong.tsv", sep="\t")
strong_audioset_labels_eval = pd.read_csv("./metadata/audioset_eval_strong.tsv", sep="\t")

# Merge the two dataframes into one, keeping source split info and resetting the index
strong_audioset_labels = pd.concat(
    [
        strong_audioset_labels_train.assign(split="train"),
        strong_audioset_labels_eval.assign(split="eval"),
    ],
    ignore_index=True
)

# quick check
strong_audioset_labels.shape, strong_audioset_labels.head()



((1074359, 5),
           segment_id  start_time_seconds  end_time_seconds       label  split
 0  b0RFKhbpFJA_30000               0.000            10.000  /m/03m9d0z  train
 1  b0RFKhbpFJA_30000               4.753             5.720   /m/05zppz  train
 2  b0RFKhbpFJA_30000               0.000            10.000  /m/07pjwq1  train
 3  b0RFKhbpFJA_30000               6.899             7.010  /m/07qjznt  train
 4  b0RFKhbpFJA_30000               8.534             9.156  /t/dd00092  train)

In [9]:
mid_to_label_map = pd.read_csv("./metadata/mid_to_display_name.tsv", sep="\t",header=None, names=['mid','display_name'])
mid_to_label_dict = dict(zip(mid_to_label_map['mid'], mid_to_label_map['display_name']))

mid_to_label_map.head()

Unnamed: 0,mid,display_name
0,/g/11b630rrvh,Kettle whistle
1,/g/122z_qxw,Firecracker
2,/m/01280g,Wild animals
3,/m/012f08,Motor vehicle (road)
4,/m/012n7d,Ambulance (siren)


In [10]:
# replace the MID codes in 'label' with display names, keeping original MID if no mapping exists
strong_audioset_labels['label'] = strong_audioset_labels['label'].map(mid_to_label_dict).fillna(strong_audioset_labels['label'])
strong_audioset_labels.head()


Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,split
0,b0RFKhbpFJA_30000,0.0,10.0,Wind,train
1,b0RFKhbpFJA_30000,4.753,5.72,"Male speech, man speaking",train
2,b0RFKhbpFJA_30000,0.0,10.0,Buzz,train
3,b0RFKhbpFJA_30000,6.899,7.01,Tick,train
4,b0RFKhbpFJA_30000,8.534,9.156,Wind noise (microphone),train


In [11]:
# Count exact matches for each plane-related label
plane_counts = strong_audioset_labels['label'].isin(planes).value_counts().to_dict()
print(plane_counts)

# Also show counts broken down by split (train / eval)
strong_audioset_labels[strong_audioset_labels['label'].isin(planes)].groupby(['label', 'split']).size().unstack(fill_value=0)

{False: 1073247, True: 1112}


split,eval,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Aircraft,84,355
Aircraft engine,87,290
"Fixed-wing aircraft, airplane",53,243


### urban_sound_8k

(no airplane sounds in this dataset, contains only 10 classes)

### Esc-50

Load in labels from ESC-50

In [13]:
esc50_labels = pd.read_csv("./metadata/esc50.csv")
esc50_labels.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [14]:
unique_esc50_labels = esc50_labels['category'].unique().tolist()
unique_esc50_labels

['dog',
 'chirping_birds',
 'vacuum_cleaner',
 'thunderstorm',
 'door_wood_knock',
 'can_opening',
 'crow',
 'clapping',
 'fireworks',
 'chainsaw',
 'airplane',
 'mouse_click',
 'pouring_water',
 'train',
 'sheep',
 'water_drops',
 'church_bells',
 'clock_alarm',
 'keyboard_typing',
 'wind',
 'footsteps',
 'frog',
 'cow',
 'brushing_teeth',
 'car_horn',
 'crackling_fire',
 'helicopter',
 'drinking_sipping',
 'rain',
 'insects',
 'laughing',
 'hen',
 'engine',
 'breathing',
 'crying_baby',
 'hand_saw',
 'coughing',
 'glass_breaking',
 'snoring',
 'toilet_flush',
 'pig',
 'washing_machine',
 'clock_tick',
 'sneezing',
 'rooster',
 'sea_waves',
 'siren',
 'cat',
 'door_wood_creaks',
 'crickets']

In [15]:
esc_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=unique_esc50_labels,
    top_k=3
)
esc_similarity_df

Unnamed: 0,ontology_label,dataset_labels,dataset_indices,similarity_scores
0,measurement vehicle noise,"[snoring, engine, wind]","[38, 32, 19]","[0.5315, 0.522, 0.4759]"
1,shout and scream,"[chainsaw, crow, siren]","[9, 6, 46]","[0.6372, 0.6114, 0.6002]"
2,rooster,"[rooster, cow, hen]","[44, 22, 31]","[1.0, 0.7184, 0.7066]"
3,amplified music and electronic playback,"[clapping, engine, airplane]","[7, 32, 10]","[0.4623, 0.4399, 0.4388]"
4,laughter,"[laughing, clapping, hen]","[30, 7, 31]","[0.9681, 0.655, 0.5979]"
...,...,...,...,...
65,rain,"[rain, thunderstorm, wind]","[28, 3, 19]","[1.0, 0.7473, 0.6108]"
66,road traffic,"[train, helicopter, engine]","[13, 26, 32]","[0.582, 0.5641, 0.5533]"
67,breaking waves,"[sea_waves, glass_breaking, wind]","[45, 37, 19]","[0.7164, 0.685, 0.6777]"
68,falling water,"[water_drops, pouring_water, rain]","[15, 12, 28]","[0.7041, 0.6545, 0.6451]"


Now we have all the information needed to count the number of samples in ESC-50 similar to "plane"

In [16]:
esc_similarity_df[esc_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]

['airplane', 'helicopter', 'engine']

Only "airplane" is similar enough to "plane" since we already have helicopter and engine in our ontology labels.

In [17]:
# count 'airplane' occurrences per fold
esc50_airplane_counts = esc50_labels[esc50_labels['category'] == 'airplane'].groupby('fold').size().sort_index()
esc50_airplane_counts

fold
1    8
2    8
3    8
4    8
5    8
dtype: int64

Unique airplane samples in ESC-50:

In [18]:
# All unique airplane recordings in ESC-50
airplane_rows = esc50_labels[esc50_labels['category'] == 'airplane']
unique_airplane_filenames = airplane_rows['filename'].unique()
len(unique_airplane_filenames)

40

In [19]:
# get all overlapping airplane samples
esc50_labels[esc50_labels["category"] == "airplane"].shape[0] - len(unique_airplane_filenames)

0

### FSD50K

In [20]:
fsd_vocab_labels = pd.read_csv("./metadata/fsd50k_labels/vocabulary.csv",header =None,names =['label','mid'])
fsd_vocab_labels.head()

Unnamed: 0,label,mid
0,Accelerating_and_revving_and_vroom,/m/07q2z82
1,Accordion,/m/0mkg
2,Acoustic_guitar,/m/042v_gx
3,Aircraft,/m/0k5j
4,Alarm,/m/07pp_mv


In [21]:
fsd_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=fsd_vocab_labels['label'].tolist(),
    top_k=3
)
fsd_similarity_df

Unnamed: 0,ontology_label,dataset_labels,dataset_indices,similarity_scores
0,measurement vehicle noise,"[Vehicle, Traffic_noise_and_roadway_noise, Car]","[184, 177, 26]","[0.668, 0.6627, 0.5796]"
1,shout and scream,"[Screaming, Shout, Yell]","[145, 148, 198]","[0.8618, 0.8449, 0.8105]"
2,rooster,"[Chicken_and_rooster, Rattle, Fowl]","[32, 137, 81]","[0.7439, 0.7265, 0.7237]"
3,amplified music and electronic playback,"[Music, Musical_instrument, Glockenspiel]","[120, 121, 87]","[0.6163, 0.5863, 0.5463]"
4,laughter,"[Laughter, Giggle, Clapping]","[107, 85, 39]","[0.9721, 0.9079, 0.6644]"
...,...,...,...,...
65,rain,"[Rain, Raindrop, Thunderstorm]","[134, 135, 172]","[0.9536, 0.7682, 0.7377]"
66,road traffic,"[Motor_vehicle_(road), Traffic_noise_and_roadw...","[118, 177, 184]","[0.7918, 0.7115, 0.6875]"
67,breaking waves,"[Ocean, Crack, Waves_and_surf]","[122, 46, 189]","[0.7326, 0.6859, 0.6808]"
68,falling water,"[Water, Raindrop, Rain]","[187, 135, 134]","[0.6955, 0.6695, 0.6427]"


In [22]:
plane_labels = fsd_similarity_df[fsd_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0][0:2]
plane_labels

['Aircraft', 'Fixed-wing_aircraft_and_airplane']

In [23]:
# concat train and eval labels
fsd_labels_train = pd.read_csv("./metadata/fsd50k_labels/dev.csv")
fsd_labels_dev = pd.read_csv("./metadata/fsd50k_labels/eval.csv")
fsd_labels = pd.concat([fsd_labels_train.assign(split="train"), fsd_labels_dev.assign(split="eval")], ignore_index=True)
fsd_labels.head()

Unnamed: 0,fname,labels,mids,split
0,64760,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train
1,16399,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train
2,16401,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train
3,16402,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train
4,16404,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train


In [24]:
# count 'plane' occurrences in FSD50K by split when any label matches plane_labels
plane_set = set(plane_labels)
fsd_labels['is_plane'] = fsd_labels['labels'].str.split(',').apply(lambda labs: any(l in plane_set for l in labs))

plane_counts_fsd = fsd_labels[fsd_labels['is_plane']].groupby('split').size().sort_index()
plane_counts_fsd

split
eval      88
train    184
dtype: int64

### Captdure (captioned sounds dataset)

(No planes, single sound sources mostly indoor sounds)

### Sounddesc (captioned sounds dataset bbc sound effects)

In [25]:
sounddescs_categories = pd.read_pickle("./metadata/sounddescs_categories.pkl")
sounddescs_categories

defaultdict(list,
            {'nhu05030045': ['Nature'],
             '07033129': ['Daily Life'],
             '07018018': ['Crowds'],
             'nhu05062176': ['Nature'],
             'nhu05047152': ['Nature'],
             'nhu05090185': ['Nature'],
             'nhu05015061': ['Nature'],
             'nhu9322764': ['Nature'],
             'nhu05100198': ['Nature'],
             '07025147': ['Fire'],
             'nhu05074118': ['Nature'],
             '07000163': ['Aircraft', 'Military'],
             '07016099': ['Clocks'],
             'nhu05090011': ['Nature'],
             '07052050': ['Daily Life'],
             'nhu05003078': ['Daily Life'],
             'nhu05090066': ['Nature'],
             'nhu05015182': ['Nature'],
             '07052027': ['Daily Life'],
             'nhu05028149': ['Nature'],
             'nhu10347006': ['Nature'],
             '07039279': ['Transport'],
             '07000114': ['Aircraft', 'Military'],
             '07025130': ['Atmosphere', 'Dail

Captions for later, specifically for clap embedding models

In [26]:
sounddescs_descriptions = pd.read_pickle("./metadata/sounddescs_descriptions.pkl")
sounddescs_descriptions

defaultdict(None,
            {'nhu05051026': 'Greengrocer Cicada (Cyclochila Virens) - close-up stridulation, almost reaching full burst. Dog in distance. Recorded in captivity.',
             '07066104': "Timber & Wood - Rip saw, carpenters' workshop.",
             '07039376': 'Traffic actuality. (Recorded in Bristol.)',
             'nhu05010034': 'Lesser Black-backed Gull (Larus Fuscus) - close-up mew calls leading to long calls from several birds.',
             'nhu05063096': 'Common Bulbul (Pycnonotus Barbatus) - close-up-distant song and calls, with quail, corn bunting and nightingale.',
             '07027116': 'Household - Household: Sash Window, open and close.',
             '07055053': 'Power Stations - Atmosphere in oil pump house.',
             '07059034': 'Household 2 - Old fashioned mangle moved on stone floor.',
             'nhu05062171': 'Green Sandpiper (Tringa Ochropus) - Flight alarm calls close-up - distant. Flies mp. Water dripping medium close-up. Windy back

In [27]:
# list all unique categories found in values of the dictionary
unique_sounddesc = list(set([item for sublist in sounddescs_categories.values() for item in sublist]))


sounddescs_similarity_df = build_similarity_dataframe(
    query_labels=labels,
    dataset_labels=unique_sounddesc,
    top_k=3
)
sounddescs_similarity_df

Unnamed: 0,ontology_label,dataset_labels,dataset_indices,similarity_scores
0,measurement vehicle noise,"[Transport, Military, Aircraft]","[18, 11, 10]","[0.5402, 0.5148, 0.492]"
1,shout and scream,"[Fire, Applause, Birds]","[6, 20, 0]","[0.6099, 0.5868, 0.5623]"
2,rooster,"[Animals, Birds, Toys]","[21, 0, 23]","[0.6722, 0.6667, 0.6368]"
3,amplified music and electronic playback,"[Electronics, Toys, Applause]","[4, 23, 20]","[0.5595, 0.48, 0.4718]"
4,laughter,"[Comedy, Animals, Toys]","[22, 21, 23]","[0.8151, 0.571, 0.5597]"
...,...,...,...,...
65,rain,"[Atmosphere, Nature, Birds]","[19, 15, 0]","[0.5851, 0.5538, 0.5506]"
66,road traffic,"[Transport, Crowds, Daily Life]","[18, 7, 17]","[0.7373, 0.5953, 0.558]"
67,breaking waves,"[Destruction, Sport, Military]","[16, 9, 11]","[0.6166, 0.5819, 0.5768]"
68,falling water,"[Nature, Daily Life, Birds]","[15, 17, 0]","[0.5283, 0.5269, 0.5219]"


In [28]:
# similarity for 'plane'
sounddescs_similarity_df[sounddescs_similarity_df['ontology_label'] == 'plane']['dataset_labels'].values[0]
# only first one is really relevant: "Aircraft"

['Aircraft', 'Military', 'Toys']

Count number of samples similar to "Aircraft" in Sounddescs

Load in the sounddesc cleaned/grouped splits

In [29]:
grouped_sounddesc_split = pd.DataFrame()
with open("./metadata/splits_sounddesc/group_filtered_split01/test_list.txt") as f:
    test_list = f.read().splitlines()
    grouped_sounddesc_split = pd.DataFrame({
        'filename': test_list,
        'split': 'test'
    })

with open("./metadata/splits_sounddesc/group_filtered_split01/train_list.txt") as f:
    train_list = f.read().splitlines()
    train_df = pd.DataFrame({
        'filename': train_list,
        'split': 'train'
    })
    grouped_sounddesc_split = pd.concat([grouped_sounddesc_split, train_df], ignore_index=True)

with open("./metadata/splits_sounddesc/group_filtered_split01/val_list.txt") as f:
    val_list = f.read().splitlines()
    val_df = pd.DataFrame({
        'filename': val_list,
        'split': 'val'
    })
    grouped_sounddesc_split = pd.concat([grouped_sounddesc_split, val_df], ignore_index=True)

grouped_sounddesc_split.head()

Unnamed: 0,filename,split
0,07037581,test
1,07016013,test
2,07074143,test
3,nhu05054161,test
4,nhu10352128,test


attach the categories to the filenames in the proper grouped split

In [30]:
grouped_sounddesc_split['categories'] = grouped_sounddesc_split['filename'].map(sounddescs_categories)
grouped_sounddesc_split.head()

Unnamed: 0,filename,split,categories
0,07037581,test,[None]
1,07016013,test,[Clocks]
2,07074143,test,[Comedy]
3,nhu05054161,test,[Nature]
4,nhu10352128,test,[Nature]


In [31]:
# Count the number of samples that has "Aircraft" as their category grouped by split

aircraft_counts = grouped_sounddesc_split.explode('categories')
aircraft_counts = aircraft_counts[aircraft_counts['categories'] == 'Aircraft']
aircraft_counts = aircraft_counts.groupby('split').size().sort_index()
aircraft_counts



split
test      87
train    408
val      106
dtype: int64

In [32]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Collect data from each dataset with consistent train/test splits

# AudioSet - combine 'eval' into 'test'
audioset_plane_df = strong_audioset_labels[strong_audioset_labels["label"].isin(planes)]
audioset_split_mapping = {'train': 'train', 'eval': 'test'}
audioset_plane_df['unified_split'] = audioset_plane_df['split'].map(audioset_split_mapping)
audioset_counts = audioset_plane_df.groupby("unified_split").size().to_dict()

# ESC-50 - folds 1-4 as train, fold 5 as test
esc50_airplane_df = esc50_labels[esc50_labels['category'] == 'airplane'].copy()
esc50_airplane_df['unified_split'] = esc50_airplane_df['fold'].apply(lambda x: 'train' if x <= 4 else 'test')
esc50_counts = esc50_airplane_df.groupby('unified_split').size().to_dict()

# FSD50K - 'eval' becomes 'test', 'train' stays 'train'
fsd_plane_df = fsd_labels[fsd_labels['is_plane']].copy()
fsd_split_mapping = {'train': 'train', 'eval': 'test'}
fsd_plane_df['unified_split'] = fsd_plane_df['split'].map(fsd_split_mapping)
fsd_plane_counts = fsd_plane_df.groupby('unified_split').size().to_dict()

# SoundDescs - combine 'val' with 'train', 'test' stays 'test'
sounddesc_aircraft_df = grouped_sounddesc_split.explode('categories')
sounddesc_aircraft_df = sounddesc_aircraft_df[sounddesc_aircraft_df['categories'] == 'Aircraft'].copy()
sounddesc_split_mapping = {'train': 'train', 'val': 'train', 'test': 'test'}
sounddesc_aircraft_df['unified_split'] = sounddesc_aircraft_df['split'].map(sounddesc_split_mapping)
sounddesc_counts = sounddesc_aircraft_df.groupby('unified_split').size().to_dict()

# Create individual bar charts for each dataset
fig_audioset = px.bar(
    x=list(audioset_counts.keys()),
    y=list(audioset_counts.values()),
    title="AudioSet - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(audioset_counts.values()),
)
fig_audioset.update_traces(textposition="inside", textangle=0)
fig_audioset.update_layout(width=600, height=500)
fig_audioset.show()

# ESC-50
fig_esc50 = px.bar(
    x=list(esc50_counts.keys()),
    y=list(esc50_counts.values()),
    title="ESC-50 - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(esc50_counts.values()),
)
fig_esc50.update_layout(width=600, height=500)
fig_esc50.update_traces(textposition="inside", textangle=0)
fig_esc50.show()

# FSD50K
fig_fsd = px.bar(
    x=list(fsd_plane_counts.keys()),
    y=list(fsd_plane_counts.values()),
    title="FSD50K - Airplane Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(fsd_plane_counts.values()),
)
fig_fsd.update_traces(textposition="inside", textangle=0)
fig_fsd.update_layout(width=600, height=500)
fig_fsd.show()

# SoundDescs
fig_sounddesc = px.bar(
    x=list(sounddesc_counts.keys()),
    y=list(sounddesc_counts.values()),
    title="SoundDescs - Aircraft Samples by Split",
    labels={"x": "Split", "y": "Count"},
    text=list(sounddesc_counts.values()),
)
fig_sounddesc.update_traces(textposition="inside", textangle=0)
fig_sounddesc.update_layout(width=600, height=500)
fig_sounddesc.show()

# Combined comparison - Total samples per dataset
total_samples = {
    "AudioSet": sum(audioset_counts.values()),
    "ESC-50": sum(esc50_counts.values()),
    "FSD50K": sum(fsd_plane_counts.values()),
    "SoundDescs": sum(sounddesc_counts.values()),
}

fig_total = px.bar(
    x=list(total_samples.keys()),
    y=list(total_samples.values()),
    title="Total Airplane/Aircraft Samples Across All Datasets",
    labels={"x": "Dataset", "y": "Total Count"},
    text=list(total_samples.values()),
    color=list(total_samples.keys()),
)
fig_total.update_traces(textposition="inside", textangle=0)
fig_total.update_layout(showlegend=False, width=600, height=500)
fig_total.show()

# Grand total
grand_total = sum(total_samples.values())
print(f"\nGrand Total Airplane Samples: {grand_total}")

# Create a stacked bar chart showing splits across datasets
split_data = []
for dataset, counts_dict in [
    ("AudioSet", audioset_counts),
    ("ESC-50", esc50_counts),
    ("FSD50K", fsd_plane_counts),
    ("SoundDescs", sounddesc_counts),
]:
    for split, count in counts_dict.items():
        split_data.append({"Dataset": dataset, "Split": split, "Count": count})

split_df = pd.DataFrame(split_data)

fig_stacked = px.bar(
    split_df,
    x="Dataset",
    y="Count",
    color="Split",
    title="Airplane Samples Distribution by Dataset and Split",
    barmode="stack",
    text="Count",
)
fig_stacked.update_traces(textposition="inside", textangle=0)
fig_stacked.update_layout(width=500, height=600)
fig_stacked.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  audioset_plane_df['unified_split'] = audioset_plane_df['split'].map(audioset_split_mapping)



Grand Total Airplane Samples: 2025


In [33]:
# Create a pie chart showing overall train/test distribution across all datasets
overall_train = sum([counts_dict.get('train', 0) for counts_dict in [audioset_counts, esc50_counts, fsd_plane_counts, sounddesc_counts]])
overall_test = sum([counts_dict.get('test', 0) for counts_dict in [audioset_counts, esc50_counts, fsd_plane_counts, sounddesc_counts]])

fig_pie = go.Figure(data=[go.Pie(
    labels=['Train', 'Test'],
    values=[overall_train, overall_test],
    text=[overall_train, overall_test],
    textposition='inside',
    textinfo='label+value+percent',
    marker=dict(colors=['#636EFA', '#EF553B'])
)])

fig_pie.update_layout(
    title="Overall Train/Test Distribution Across All Datasets",
    width=600,
    height=500
)
fig_pie.show()

print(f"\nTotal Train Samples: {overall_train}")
print(f"Total Test Samples: {overall_test}")
print(f"Train/Test Ratio: {overall_train/overall_test:.2f}")


Total Train Samples: 1618
Total Test Samples: 407
Train/Test Ratio: 3.98
