In [1]:
from datasets import IterableDataset

def print_dataset_info(name: str, dataset: IterableDataset):
    """Print summary statistics for the ``"train"`` split of ``dataset``.

    The report contains the number of examples, the number of distinct
    ``ebird_code``, ``short_call_type`` and ``ebird_code_and_call`` labels,
    a table with the example count per call type, and a table with the ten
    most frequent ebird codes.

    Args:
        name: Human-readable dataset name shown in the report header.
        dataset: Mapping that provides a ``"train"`` split. The split must
            support ``len()``, column access by name (``split["column"]``),
            ``unique(column)``, ``filter(fn)`` and iteration over examples.
            NOTE(review): the annotation says ``IterableDataset``, but the
            operations used here look like they need a ``DatasetDict`` of
            map-style datasets -- confirm and adjust the annotation.

    Raises:
        ValueError: If any example has ``None`` as ``short_call_type``. All
            offending examples are printed before the error is raised.
    """
    from collections import Counter

    trainset = dataset["train"]

    # Count every label in a single pass over its column instead of running
    # one full filter() pass per distinct label.
    call_type_counts = Counter(trainset["short_call_type"])
    ebird_code_counts = Counter(trainset["ebird_code"])

    print("")
    print(f"Dataset: {name}")
    print(f"Number of examples in the dataset: {len(trainset)}")
    print(f"Number of ebird_code labels: {len(ebird_code_counts)}")
    print(f"Number of calltype labels: {len(call_type_counts)}")
    print(f"Number of combined labels: {len(trainset.unique('ebird_code_and_call'))}")

    # If a calltype is None, print all features of those examples and abort.
    # The (expensive) filter only runs when there is something to report.
    if call_type_counts.get(None, 0) > 0:
        none_calltypes = trainset.filter(lambda x: x["short_call_type"] is None)
        print(f"Examples with None as calltype label: {len(none_calltypes)}")
        for example in none_calltypes:
            print(example)
        raise ValueError("There are examples with None as calltype label. Please fix the mapping.")

    # Table of counts for each short_call_type label (first-seen order).
    # Rows use the same column widths as the header so the table lines up.
    print("\nNumber of examples per calltype label:")
    print("{:<20} {:<10}".format("Call Type", "Count"))
    print("-" * 30)
    for call_type, count in call_type_counts.items():
        print("{:<20} {:<10}".format(str(call_type), count))

    # Table of counts for the ten most common ebird_code labels.
    # most_common() also copes with an empty split (it yields nothing).
    print("\nTop 10 most common ebird_code labels:")
    print("{:<10} {:<10}".format("Ebird Code", "Count"))
    print("-" * 20)
    for ebird_code, count in ebird_code_counts.most_common(10):
        # str() because a missing ebird_code (None) has no width formatting
        print("{:<10} {:<10}".format(str(ebird_code), count))
    

In [None]:
from datasets import load_dataset, Features, Value

# Read the label CSVs of the original test set with an explicit column schema.
original_test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    features=Features(
        {  # TODO: Add all features available in BirdSet
            "ebird_code": Value("string"),
            "common_name": Value("string"),
            "vocalization_type": Value("string"),
            "start_time": Value("float"),
            "end_time": Value("float"),
            "audio_filename": Value("string"),
        }
    ),
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)

# The pipeline cannot handle None labels, so they are replaced by placeholders
# before the derived label columns are added. The four steps run in order:
#   1. missing ebird_code   -> "UNKNOWN" for the generic "Bird" name, else "NA"
#   2. missing vocalization -> "NA"
#   3. short_call_type      := vocalization_type
#   4. ebird_code_and_call  := "<ebird_code>_<short_call_type>"
original_test_dataset = (
    original_test_dataset
    .map(
        lambda row: {
            "ebird_code": (
                ("UNKNOWN" if row["common_name"] == "Bird" else "NA")
                if row["ebird_code"] is None
                else row["ebird_code"]
            )
        }
    )  # TODO: Check if NA is an existing code
    .map(
        lambda row: {
            "vocalization_type": (
                "NA" if row["vocalization_type"] is None else row["vocalization_type"]
            )
        }
    )  # TODO: Check if NA is an existing code
    .map(lambda row: {"short_call_type": row["vocalization_type"]})
    .map(lambda row: {"ebird_code_and_call": f"{row['ebird_code']}_{row['short_call_type']}"})
)

# Read the label CSVs of the original train set (semicolon-separated) with an
# explicit column schema.
original_train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    features=Features(
        {  # TODO: Add all features available in BirdSet
            "ebird_code": Value("string"),
            "call_type": Value("string"),
            "start_sample [s]": Value("float"),
            "end_sample [s]": Value("float"),
            "actual_filename": Value("string"),
        }
    ),
    delimiter=";",
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)

# The pipeline cannot handle None labels, so they are replaced by "NA" before
# the derived label columns are added. The four steps run in order:
#   1. missing ebird_code  -> "NA"
#   2. missing call_type   -> "NA"
#   3. short_call_type     := call_type
#   4. ebird_code_and_call := "<ebird_code>_<short_call_type>"
original_train_dataset = (
    original_train_dataset
    .map(lambda row: {"ebird_code": "NA" if row["ebird_code"] is None else row["ebird_code"]})
    .map(lambda row: {"call_type": "NA" if row["call_type"] is None else row["call_type"]})
    .map(lambda row: {"short_call_type": row["call_type"]})
    .map(lambda row: {"ebird_code_and_call": f"{row['ebird_code']}_{row['short_call_type']}"})
)

# Report the statistics of both original datasets, train first.
print_dataset_info(name="Original Train", dataset=original_train_dataset)
print_dataset_info(name="Original Test", dataset=original_test_dataset)

In [2]:
from callbird.src.datasets.load_test_dataset import load_test_dataset
from callbird.src.datasets.load_train_dataset import load_train_dataset

In [None]:

# Raw stage: the identity call-type mapping keeps the labels as close as
# possible to the original dataset.
print_dataset_info(
    "Raw Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/identity_map",
        None,
        filter_naive=None,
        limit_samples=False,
    ),
)
print_dataset_info(
    "Raw Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/identity_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

In [None]:
# Same-space stage: both splits are loaded with the same_space_map call-type
# mapping, without limiting the number of train samples.
print_dataset_info(
    "Same Space Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        limit_samples=False,
    ),
)
print_dataset_info(
    "Same Space Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)

In [None]:
# Limit-samples stage: same_space_map mapping again, but the train split is
# loaded with limit_samples=True. The test split is loaded as in the
# same-space stage.
print_dataset_info(
    "Limit Samples Train",
    load_train_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        limit_samples=True,
    ),
)
print_dataset_info(
    "Limit Samples Test",
    load_test_dataset(
        "/workspace/projects/callbird/call_type_mappings/same_space_map",
        None,
        filter_naive=None,
        unknown_ebird_code="NA",
        filter_unspecified=False,
    ),
)