In [6]:
from datasets import IterableDataset

def print_dataset_info(name: str, dataset: IterableDataset):
    """Print label statistics for the ``"train"`` split of ``dataset``.

    Prints the number of examples, the number of distinct ``ebird_code``,
    ``short_call_type`` and ``ebird_code_and_call`` values, and an aligned table
    with the number of examples per ``short_call_type`` label.

    NOTE(review): the parameter is annotated as ``IterableDataset``, but the body
    indexes it with ``"train"`` and then needs ``len()``, ``.unique()``,
    ``.filter()`` and column access on the result, i.e. a mapping of split name
    to a map-style dataset. The annotation is kept unchanged for interface
    stability -- confirm and correct it together with the import.

    Args:
        name: Human-readable label printed in the ``Dataset: ...`` line.
        dataset: Mapping that contains a ``"train"`` split with the columns
            ``ebird_code``, ``short_call_type`` and ``ebird_code_and_call``.

    Raises:
        ValueError: If at least one example has ``None`` as ``short_call_type``.
            All offending examples are printed before raising.
    """
    # Local import so the function does not depend on imports from other cells.
    from collections import Counter

    trainset = dataset["train"]
    # Computed once and reused for both the label count and the table rows.
    call_types = trainset.unique("short_call_type")

    print("")
    print(f"Dataset: {name}")
    print(f"Number of examples in the dataset: {len(trainset)}")
    print(f"Number of ebird_code labels: {len(trainset.unique('ebird_code'))}")
    print(f"Number of calltype labels: {len(call_types)}")
    print(f"Number of combined labels: {len(trainset.unique('ebird_code_and_call'))}")

    # If a calltype is None, print all features of that example
    none_calltypes = trainset.filter(lambda x: x["short_call_type"] is None)
    if len(none_calltypes) > 0:
        print(f"Examples with None as calltype label: {len(none_calltypes)}")
        for example in none_calltypes:
            print(example)
        raise ValueError("There are examples with None as calltype label. Please fix the mapping.")

    # Count every label in a single pass over the column instead of running one
    # full filter pass per distinct label.
    count_map = Counter(trainset["short_call_type"])

    # Size the label column to the longest label (at least the original 20) so
    # the rows actually line up with the header.
    label_width = max([20] + [len(str(call_type)) for call_type in call_types])

    print("\nNumber of examples per calltype label:")
    print("{:<{width}} {:<10}".format("Call Type", "Count", width=label_width))
    print("-" * (label_width + 10))
    # Rows follow the order returned by unique(), as before.
    for call_type in call_types:
        print(f"{str(call_type):<{label_width}} {count_map[call_type]}")
    
    

In [7]:
from datasets import load_dataset, Features, Value

def _with_test_labels(row):
    """Fill missing labels of one test-set row and derive the combined label columns.

    Returns the columns to write back: ``ebird_code`` and ``vocalization_type``
    with ``None`` replaced, plus the derived ``short_call_type`` and
    ``ebird_code_and_call``.
    """
    ebird_code = row["ebird_code"]
    if ebird_code is None:
        # Rows without a code whose common name is "Bird" get "UNKNOWN", all others "NA".
        # TODO: Check if NA is an existing code
        ebird_code = "UNKNOWN" if row["common_name"] == "Bird" else "NA"

    vocalization_type = row["vocalization_type"]
    if vocalization_type is None:
        vocalization_type = "NA"  # TODO: Check if NA is an existing code

    return {
        "ebird_code": ebird_code,
        "vocalization_type": vocalization_type,
        "short_call_type": vocalization_type,
        "ebird_code_and_call": f"{ebird_code}_{vocalization_type}",
    }


def _with_train_labels(row):
    """Fill missing labels of one train-set row and derive the combined label columns.

    Returns the columns to write back: ``ebird_code`` and ``call_type`` with
    ``None`` replaced by ``"NA"``, plus the derived ``short_call_type`` and
    ``ebird_code_and_call``.
    """
    ebird_code = row["ebird_code"] if row["ebird_code"] is not None else "NA"
    call_type = row["call_type"] if row["call_type"] is not None else "NA"

    return {
        "ebird_code": ebird_code,
        "call_type": call_type,
        "short_call_type": call_type,
        "ebird_code_and_call": f"{ebird_code}_{call_type}",
    }


original_test_dataset = load_dataset(
    "csv",
    data_files = "/workspace/oekofor/testset/labels/*.csv",
    features = Features({ # TODO: Add all features available in BirdSet
        "ebird_code": Value("string"),
        "common_name": Value("string"),
        "vocalization_type": Value("string"),
        "start_time": Value("float"),
        "end_time": Value("float"),
        "audio_filename": Value("string"),
    }),
    cache_dir = None,
    num_proc = 1,
    trust_remote_code = True, # While not needed for local datasets, it is kept for consistency
)

# We need to remove None values from the label columns since the pipeline cannot handle them.
# A single map pass fills them and adds 'short_call_type' and 'ebird_code_and_call'.
original_test_dataset = original_test_dataset.map(_with_test_labels)

original_train_dataset = load_dataset(
    "csv",
    data_files = "/workspace/oekofor/trainset/csvlabels/*.csv",
    features = Features({ # TODO: Add all features available in BirdSet
        "ebird_code": Value("string"),
        "call_type": Value("string"),
        "start_sample [s]": Value("float"),
        "end_sample [s]": Value("float"),
        "actual_filename": Value("string"),
    }),
    delimiter=";",
    cache_dir = None,
    num_proc = 1,
    trust_remote_code = True, # While not needed for local datasets, it is kept for consistency
)

# Same treatment for the train set: one pass fills None labels and adds the derived columns.
original_train_dataset = original_train_dataset.map(_with_train_labels)

print_dataset_info("Original Train", original_train_dataset)
print_dataset_info("Original Test", original_test_dataset)

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Original Train
Number of examples in the dataset: 91771
Number of ebird_code labels: 56
Number of calltype labels: 15
Number of combined labels: 212

Number of examples per calltype label:
Call Type            Count     
------------------------------
f (Flugruf) 6790
c (Kontaktruf) 10972
s (Gesang) 34755
t (Trommeln) 5266
b (Bettelruf) 232
e (Erregungsruf) 172
ic (Interaktionsrufe) 10
ac (Alarmruf) 722
s2 (Zweitgesang) 47
wb (Fluegelschlag) 5
nfc (Nachtzugruf) 1768
rs (Rufe am Schlafplatz) 704
ac_b (Alarmruf_Bodenfeinde) 11
ac_f (Alarmruf Luftfeinde) 1
NA 30316

Dataset: Original Test
Number of examples in the dataset: 16020
Number of ebird_code labels: 61
Number of calltype labels: 13
Number of combined labels: 137

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11078
contact call 3006
flight call 411
something 651
drumming 85
alarm call 592
vocal 1
ground alarm call 1
call 5
whistle 1
air alarm call 107
excitement

In [8]:
from callbird.src.datasets.load_test_dataset import load_test_dataset
from callbird.src.datasets.load_train_dataset import load_train_dataset

# Print the same statistics for the datasets produced by the project loaders,
# using the identity_map call-type mapping directory.
# NOTE(review): the meaning of the positional None / "NA" arguments is defined by
# the loaders, which are not visible here -- confirm against their signatures.
identity_map_dir = "/workspace/projects/callbird/call_type_mappings/identity_map"

raw_train_dataset = load_train_dataset(identity_map_dir, None, None)
print_dataset_info("Raw Train", raw_train_dataset)

raw_test_dataset = load_test_dataset(identity_map_dir, None, None, "NA")
print_dataset_info("Raw Test", raw_test_dataset)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Raw Train
Number of examples in the dataset: 91771
Number of ebird_code labels: 56
Number of calltype labels: 15
Number of combined labels: 212

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 6790
contact 10972
song 34755
drumming 5266
begging 232
excitement 172
interaction 10
alarm 722
sub_song 47
wing_flap 5
night_migration 1768
sleep_site 704
alarm_ground 11
alarm_air 1
NA 30316


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Raw Test
Number of examples in the dataset: 15950
Number of ebird_code labels: 56
Number of calltype labels: 13
Number of combined labels: 127

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11033
contact 2992
flight 404
something 650
drumming 85
NA 14
alarm 576
alarm_ground 1
call 5
whistle 1
alarm_air 107
excitement 41
sub_song 41


In [9]:
# Print the statistics again with the same_space_map call-type mapping directory;
# every other loader argument is the same as for the identity_map run.
same_space_map_dir = "/workspace/projects/callbird/call_type_mappings/same_space_map"

initial_train_dataset = load_train_dataset(same_space_map_dir, None, None)
print_dataset_info("Initial Train", initial_train_dataset)

initial_test_dataset = load_test_dataset(same_space_map_dir, None, None, "NA")
print_dataset_info("Initial Test", initial_test_dataset)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Initial Train
Number of examples in the dataset: 91771
Number of ebird_code labels: 56
Number of calltype labels: 11
Number of combined labels: 200

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 6790
contact 10972
song 34755
drumming 5266
other 2714
excitement 172
alarm 722
sub_song 47
NA 30321
alarm_ground 11
alarm_air 1


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Initial Test
Number of examples in the dataset: 15950
Number of ebird_code labels: 56
Number of calltype labels: 11
Number of combined labels: 126

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11033
contact 2992
flight 404
NA 669
drumming 85
alarm 576
alarm_ground 1
other 1
alarm_air 107
excitement 41
sub_song 41


In [10]:
# same_space_map mapping again, this time passing a blacklist file as the third
# loader argument for each split.
# NOTE(review): the printed labels are still "Initial Train" / "Initial Test",
# identical to the run without blacklists, so the two outputs can only be told
# apart by their position in the notebook -- consider distinct labels.
mapping_dir = "/workspace/projects/callbird/call_type_mappings/same_space_map"
train_blacklist_file = "/workspace/projects/callbird/blacklists/same_space_train.txt"
test_blacklist_file = "/workspace/projects/callbird/blacklists/same_space_test.txt"

blacklisted_train_dataset = load_train_dataset(mapping_dir, None, train_blacklist_file)
print_dataset_info("Initial Train", blacklisted_train_dataset)

blacklisted_test_dataset = load_test_dataset(mapping_dir, None, test_blacklist_file, "NA")
print_dataset_info("Initial Test", blacklisted_test_dataset)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


Dataset: Initial Train
Number of examples in the dataset: 84933
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
flight 5109
contact 9277
song 34529
excitement 167
alarm 556
alarm_ground 11
drumming 4968
NA 30316


Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]


Dataset: Initial Test
Number of examples in the dataset: 15552
Number of ebird_code labels: 54
Number of calltype labels: 8
Number of combined labels: 98

Number of examples per calltype label:
Call Type            Count     
------------------------------
song 11017
contact 2841
flight 402
NA 636
drumming 82
alarm 532
alarm_ground 1
excitement 41
