In [5]:
from datasets import concatenate_datasets, load_dataset, Features, Value

In [6]:
# Read every label CSV of the test set into one dataset. The loader places
# the rows under the "train" split, so that split is requested directly
# instead of indexing the returned dictionary afterwards.
test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    split="train",
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

In [7]:
# Columns of the training label CSVs, in the order they should appear in the
# loaded dataset.
# TODO: Add all features available in BirdSet
_train_columns = [
    "ebird_code",
    "call_type",
    "start_sample [s]",
    "end_sample [s]",
    "actual_filename",
    "call_center [s]",
    "randomised_call_center [s]",
    "low_freq [Hz]",
    "high_freq [Hz]",
    "common_name",
    "subspecies",
    "sex",
    "correct",
    "confidence",
    "original_file.name",
    "name_of_validator",
    "date_of_validation",
    "latitude [WGS84]",
    "longitude [WGS84]",
    "uncertainty of GPS data [km]",
    "date",
    "BirdNET Version",
    "notes",
]

# Only the clip boundaries are parsed as numbers; every other column is kept
# as a raw string.
_train_float_columns = {"start_sample [s]", "end_sample [s]"}

_train_features = Features({
    column: Value("float" if column in _train_float_columns else "string")
    for column in _train_columns
})

# The training CSVs are semicolon-separated. As with the test set, the rows
# land in the "train" split, which is requested directly here.
train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    split="train",
    delimiter=";",
    cache_dir=None,
    features=_train_features,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

In [12]:
def _is_missing(value):
    """Return True when a cell holds no usable value (None or an empty string)."""
    return value is None or value == ""


def _has_value(example, column):
    """Row predicate for `Dataset.filter`: True when `column` is filled in."""
    return not _is_missing(example[column])


def _feature_availability(dataset):
    """Summarise, per column, how many rows of `dataset` carry a value.

    Returns a dict mapping each feature name to the tuple
    `(available, num_available, num_total, num_missing)`, where `available`
    is True as soon as at least one row has a value for that feature.
    """
    num_total = len(dataset)
    availability = {}
    for feature in dataset.features:
        num_available = dataset.filter(_has_value, fn_kwargs={"column": feature}).num_rows
        # "Missing" is the exact complement of "available", so a second pass
        # over the dataset is unnecessary.
        num_missing = num_total - num_available
        availability[feature] = (num_available > 0, num_available, num_total, num_missing)
    return availability


def _print_latex_rows(availability_map):
    """Print one LaTeX table row per feature: name, check/cross, #available, #missing."""
    for feature, (available, num_available, _num_total, num_missing) in availability_map.items():
        # Underscores must be escaped to be valid in LaTeX text mode.
        latex_safe_feature = feature.replace("_", "\\_")
        mark = "\\checkmark" if available else "\\times"
        print(f"{latex_safe_feature} & ${mark}$ & {num_available} & {num_missing} \\\\")


# Rows lacking an eBird code. None and "" are both treated as missing so these
# counts agree with the per-feature availability table printed below.
test_no_bird_code = test_dataset.filter(lambda example: _is_missing(example["ebird_code"]))
train_no_bird_code = train_dataset.filter(lambda example: _is_missing(example["ebird_code"]))
print(f"Number of samples without ebird_code (test): {test_no_bird_code.num_rows} / {len(test_dataset)}")
print(f"Number of samples without ebird_code (train): {train_no_bird_code.num_rows} / {len(train_dataset)}")

# Which common names occur among the rows that have no eBird code.
test_no_bird_species_names = test_no_bird_code.unique("common_name")
train_no_bird_species_names = train_no_bird_code.unique("common_name")
print(f"Species without ebird_code (train): {train_no_bird_species_names}")
print(f"Species without ebird_code (test): {test_no_bird_species_names}")

# Test rows labelled only with the generic common name "Bird".
bird_unspecified = test_dataset.filter(lambda example: example["common_name"] == "Bird").num_rows
print(f"Number of test samples with common_name \"Bird\": {bird_unspecified} / {len(test_dataset)}")

test_availability_map = _feature_availability(test_dataset)
train_availability_map = _feature_availability(train_dataset)

_print_latex_rows(test_availability_map)

print("\n---\n")

_print_latex_rows(train_availability_map)

Number of samples without ebird_code (test): 955 / 16020
Number of samples without ebird_code (train): 30316 / 91771
Species without ebird_code (train): ['no bird']
Species without common_name: ['Bird', 'European Roe Deer']
Number of test samples with common_name "Bird": 941 / 16020
audio\_filename & $\checkmark$ & 16020 & 0 \\
start\_time & $\checkmark$ & 16020 & 0 \\
end\_time & $\checkmark$ & 16020 & 0 \\
low\_freq & $\checkmark$ & 16020 & 0 \\
high\_freq & $\checkmark$ & 16020 & 0 \\
lat & $\checkmark$ & 16020 & 0 \\
long & $\checkmark$ & 16020 & 0 \\
GPS\_uncertainty\_km & $\checkmark$ & 16020 & 0 \\
project\_code & $\checkmark$ & 16020 & 0 \\
recorder & $\checkmark$ & 16020 & 0 \\
pointofvocalization & $\times$ & 0 & 16020 \\
vocalization\_type & $\checkmark$ & 16020 & 0 \\
sex & $\times$ & 0 & 16020 \\
correct & $\checkmark$ & 16020 & 0 \\
time\_UTC & $\checkmark$ & 16020 & 0 \\
ebird\_code & $\checkmark$ & 15065 & 955 \\
tsn\_code & $\checkmark$ & 15079 & 941 \\
GBIF\_ID & $\ch