In [1]:
from datasets import IterableDataset

def print_dataset_info(name: str, dataset: IterableDataset):
    """Print label statistics for the ``"train"`` split of ``dataset``.

    Prints the number of examples, the number of distinct ``ebird_code``,
    ``short_call_type`` and ``ebird_code_and_call`` values, a table of example
    counts per call type, and the ten most frequent ``ebird_code`` values.

    Args:
        name: Label printed in the ``Dataset: ...`` header line.
        dataset: Mapping with a ``"train"`` entry. That split must support
            ``len()``, column access by name, ``unique(column)`` and
            ``filter(fn)``.
            NOTE(review): the ``IterableDataset`` hint does not match this
            usage; presumably callers pass a ``DatasetDict`` -- confirm and
            adjust the annotation.

    Raises:
        ValueError: If any example has ``None`` as ``short_call_type``. The
            offending examples are printed before the error is raised.
    """
    from collections import Counter  # local import keeps this cell self-contained

    trainset = dataset["train"]

    # Count every label in one pass per column instead of running a full
    # ``filter`` pass for each distinct label. Counter keeps first-seen order.
    call_type_counts = Counter(trainset["short_call_type"])
    ebird_code_counts = Counter(trainset["ebird_code"])

    print("")
    print(f"Dataset: {name}")
    print(f"Number of examples in the dataset: {len(trainset)}")
    print(f"Number of ebird_code labels: {len(ebird_code_counts)}")
    print(f"Number of calltype labels: {len(call_type_counts)}")
    print(f"Number of combined labels: {len(trainset.unique('ebird_code_and_call'))}")

    # A None call type means the call-type mapping missed a label: show the
    # affected examples and abort so the mapping gets fixed.
    if None in call_type_counts:
        none_calltypes = trainset.filter(lambda x: x["short_call_type"] is None)
        print(f"Examples with None as calltype label: {len(none_calltypes)}")
        for example in none_calltypes:
            print(example)
        raise ValueError("There are examples with None as calltype label. Please fix the mapping.")

    # Table of counts for each short_call_type label (first-seen order).
    print("\nNumber of examples per calltype label:")
    print("{:<20} {:<10}".format("Call Type", "Count"))
    print("-" * 30)
    for call_type, count in call_type_counts.items():
        # Same label width as the header so the columns line up.
        print(f"{call_type!s:<20} {count}")

    # Table of counts for the ten most common ebird_code labels. Sorting
    # happens once, after counting, so an empty split simply prints no rows.
    top_ebird_codes = sorted(
        ebird_code_counts.items(), key=lambda item: item[1], reverse=True
    )[:10]
    print("\nTop 10 most common ebird_code labels:")
    print("{:<10} {:<10}".format("Ebird Code", "Count"))
    print("-" * 20)
    for ebird_code, count in top_ebird_codes:
        print(f"{ebird_code!s:<10} {count}")
    

In [2]:
from datasets import load_dataset, Features, Value

def _na_if_missing(column):
    """Return a map function that substitutes "NA" when `column` is None."""
    def _fill(row):
        value = row[column]
        return {column: value if value is not None else "NA"}
    return _fill


def _copy_to_short_call_type(source_column):
    """Return a map function that copies `source_column` into "short_call_type"."""
    def _copy(row):
        return {"short_call_type": row[source_column]}
    return _copy


def _combine_labels(row):
    """Join ebird_code and short_call_type into the "ebird_code_and_call" label."""
    return {"ebird_code_and_call": f"{row['ebird_code']}_{row['short_call_type']}"}


def _fill_test_ebird_code(row):
    """Replace a missing test ebird_code: "UNKNOWN" for common_name "Bird", else "NA"."""
    code = row["ebird_code"]
    if code is None:
        code = "UNKNOWN" if row["common_name"] == "Bird" else "NA"
    return {"ebird_code": code}


# ---- Original test labels (comma-separated CSV files) ----
# TODO: Add all features available in BirdSet
test_label_features = Features({
    "ebird_code": Value("string"),
    "common_name": Value("string"),
    "vocalization_type": Value("string"),
    "start_time": Value("float"),
    "end_time": Value("float"),
    "audio_filename": Value("string"),
})

original_test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    features=test_label_features,
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # Not needed for local datasets; kept for consistency
)

# The pipeline cannot handle None labels, so they are replaced with
# placeholder strings before the derived label columns are built.
# TODO: Check if NA is an existing code
original_test_dataset = (
    original_test_dataset
    .map(_fill_test_ebird_code)
    .map(_na_if_missing("vocalization_type"))
    .map(_copy_to_short_call_type("vocalization_type"))
    .map(_combine_labels)
)

# ---- Original train labels (semicolon-separated CSV files) ----
# TODO: Add all features available in BirdSet
train_label_features = Features({
    "ebird_code": Value("string"),
    "call_type": Value("string"),
    "start_sample [s]": Value("float"),
    "end_sample [s]": Value("float"),
    "actual_filename": Value("string"),
})

original_train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    features=train_label_features,
    delimiter=";",
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # Not needed for local datasets; kept for consistency
)

# Same None -> "NA" replacement as for the test labels (no "UNKNOWN" case here).
original_train_dataset = (
    original_train_dataset
    .map(_na_if_missing("ebird_code"))
    .map(_na_if_missing("call_type"))
    .map(_copy_to_short_call_type("call_type"))
    .map(_combine_labels)
)

print_dataset_info("Original Train", original_train_dataset)
print_dataset_info("Original Test", original_test_dataset)

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Original Train
Number of examples in the dataset: 91771
Number of ebird_code labels: 56
Number of calltype labels: 15
Number of combined labels: 212

Number of examples per calltype label:
Call Type            Count     
------------------------------
f (Flugruf) 6790
c (Kontaktruf) 10972
s (Gesang) 34755
t (Trommeln) 5266
b (Bettelruf) 232
e (Erregungsruf) 172
ic (Interaktionsrufe) 10
ac (Alarmruf) 722
s2 (Zweitgesang) 47
wb (Fluegelschlag) 5
nfc (Nachtzugruf) 1768
rs (Rufe am Schlafplatz) 704
ac_b (Alarmruf_Bodenfeinde) 11
ac_f (Alarmruf Luftfeinde) 1
NA 30316

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
NA 30316
grswoo 6535
eurbla 5579
sonthr1 5527
comcha 3214
blawoo1 3119
eurrob1 2688
redcro 2462
tawowl1 2305
comchi1 2184

Dataset: Original Test
Number of examples in the dataset: 16020
Number of ebird_code labels: 61
Number of calltype labels: 13
Number of combined labels: 137

Number of examples per calltype label:
Call Type          

In [3]:
from callbird.src.datasets.load_test_dataset import load_test_dataset
from callbird.src.datasets.load_train_dataset import load_train_dataset

In [None]:

# Raw stage: identity_map call-type mapping, filter_naive=None and
# limit_samples=False, i.e. as close as possible to the original dataset.
print_dataset_info(
    "Raw Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/identity_map",
        None,
        filter_naive=None,
        limit_samples=False,
    ),
)
print_dataset_info(
    "Raw Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/identity_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

In [None]:
# Same-space stage: same_space_map call-type mapping, no filter_naive file,
# no sample limiting.
print_dataset_info(
    "Same Space Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        limit_samples=False,
    ),
)
print_dataset_info(
    "Same Space Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

In [4]:
# Same-space stage again, this time passing the per-split blacklist files
# (same_space_train.txt / same_space_test.txt) via filter_naive.
print_dataset_info(
    "Same Space Train Filtered",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/same_space_train.txt",
        limit_samples=False,
    ),
)
print_dataset_info(
    "Same Space Test Filtered",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/same_space_test.txt",
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Same Space Train Filtered
Number of examples in the dataset: 84933
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 5109
contact 9277
song 34529
excitement 167
alarm 556
alarm_ground 11
drumming 4968
NA 30316

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
NA 30316
grswoo 6507
eurbla 4757
sonthr1 4673
comcha 3214
blawoo1 2816
redcro 2420
eurrob1 2352
comchi1 2179
eugwoo2 1709


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Same Space Test Filtered
Number of examples in the dataset: 15552
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11017
contact 2841
flight 402
NA 636
drumming 82
alarm 532
alarm_ground 1
excitement 41

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
eurbla 2659
comcha 1875
eurrob1 1601
sonthr1 1594
blackc1 1051
comchi1 907
NA 636
blutit 563
gretit1 504
winwre4 462


In [None]:
# Limit-samples stage: same_space_map mapping with limit_samples=True on the
# train loader; no filter_naive file for either split.
print_dataset_info(
    "Limit Samples Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        limit_samples=True,
    ),
)
print_dataset_info(
    "Limit Samples Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

In [5]:
# Limit-samples stage with the limit_samples_train.txt / limit_samples_test.txt
# blacklist files passed via filter_naive; train uses limit_samples=True.
print_dataset_info(
    "Limit Samples Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/limit_samples_train.txt",
        limit_samples=True,
    ),
)
print_dataset_info(
    "Limit Samples Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/limit_samples_test.txt",
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Limit Samples Train
Number of examples in the dataset: 45427
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 5109
contact 9148
excitement 167
alarm 556
alarm_ground 11
drumming 4564
NA 6000
song 19872

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
NA 6000
grswoo 5974
eurbla 3067
sonthr1 2833
blawoo1 2625
redcro 2390
comcha 2374
eurrob1 1554
comchi1 1344
gretit1 1068


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Limit Samples Test
Number of examples in the dataset: 15552
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11017
contact 2841
flight 402
NA 636
drumming 82
alarm 532
alarm_ground 1
excitement 41

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
eurbla 2659
comcha 1875
eurrob1 1601
sonthr1 1594
blackc1 1051
comchi1 907
NA 636
blutit 563
gretit1 504
winwre4 462


In [6]:
# Alarm-merge stage: alarm_merge_map mapping with the alarm_merge_train.txt /
# alarm_merge_test.txt blacklist files; train uses limit_samples=True.
print_dataset_info(
    "Alarm Umbrella Class Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/alarm_merge_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/alarm_merge_train.txt",
        limit_samples=True,
    ),
)
print_dataset_info(
    "Alarm Umbrella Class Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/alarm_merge_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/alarm_merge_test.txt",
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Alarm Umbrella Class Train
Number of examples in the dataset: 45427
Number of ebird_code labels: 54
Number of calltype labels: 7
Number of combined labels: 97

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 5109
contact 9148
excitement 167
alarm 567
drumming 4564
NA 6000
song 19872

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
NA 6000
grswoo 5974
eurbla 3067
sonthr1 2833
blawoo1 2625
redcro 2390
comcha 2374
eurrob1 1554
comchi1 1344
gretit1 1068


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Alarm Umbrella Class Test
Number of examples in the dataset: 15552
Number of ebird_code labels: 54
Number of calltype labels: 7
Number of combined labels: 97

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11017
contact 2841
flight 402
NA 636
drumming 82
alarm 533
excitement 41

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
eurbla 2659
comcha 1875
eurrob1 1601
sonthr1 1594
blackc1 1051
comchi1 907
NA 636
blutit 563
gretit1 504
winwre4 462


In [7]:
# Song-merge stage: song_merge_map mapping with the song_merge_train.txt /
# song_merge_test.txt blacklist files; train uses limit_samples=True.
print_dataset_info(
    "Song Merge Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/song_merge_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/song_merge_train.txt",
        limit_samples=True,
    ),
)
print_dataset_info(
    "Song Merge Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/song_merge_map",
        None,
        filter_naive="/workspace/projects/callbird/blacklists/song_merge_test.txt",
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Song Merge Train
Number of examples in the dataset: 45474
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 5109
contact 9148
excitement 167
alarm 556
song 19919
alarm_ground 11
drumming 4564
NA 6000

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
NA 6000
grswoo 5974
eurbla 3068
sonthr1 2833
blawoo1 2625
redcro 2390
comcha 2374
eurrob1 1555
comchi1 1344
gretit1 1073


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Song Merge Test
Number of examples in the dataset: 15593
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11058
contact 2841
flight 402
NA 636
drumming 82
alarm 532
alarm_ground 1
excitement 41

Top 10 most common ebird_code labels:
Ebird Code Count     
--------------------
eurbla 2659
comcha 1906
eurrob1 1601
sonthr1 1594
blackc1 1051
comchi1 907
NA 636
blutit 563
gretit1 504
winwre4 462
