In [31]:
from projects.callbird.src.readUtils import readLabeledMapping
from os import path

# Load datasets and replace None values

In [32]:
from datasets import load_dataset, Features, Value

def _fill_missing(example, columns):
    """Return the given columns of `example` with None replaced by the string "NA"."""
    return {column: example[column] if example[column] is not None else "NA" for column in columns}


# Train labels: semicolon-separated CSV files read with an explicit all-string schema.
train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    delimiter=";",
    features=Features(
        {
            "ebird_code": Value("string"),
            "call_type": Value("string"),
            "actual_filename": Value("string"),
            "common_name": Value("string"),
        }
    )
)

# Test labels: read with the CSV loader's defaults (no explicit delimiter or schema).
test_dataset = load_dataset("csv", data_files="/workspace/oekofor/testset/labels/*.csv")

# Replace missing labels with the literal "NA" in a single pass per dataset.
# The call-type column is named "call_type" in the train labels and
# "vocalization_type" in the test labels.
# NOTE(review): presumably the CSV loader parses a literal "NA" cell as missing, which is
# why None is mapped back to "NA" here - confirm against the label files.
train_dataset = train_dataset.map(lambda x: _fill_missing(x, ("ebird_code", "call_type")))
test_dataset = test_dataset.map(lambda x: _fill_missing(x, ("ebird_code", "vocalization_type")))

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

# Setup output files

In [33]:
# Root directory for all generated statistics, with one sub-directory per split.
output_root = "/workspace/projects/callbird/datastats"
output_root_train = output_root + "/train"
output_root_test = output_root + "/test"

# Blacklist of eBird codes that occur in only one of the two splits.
output_blacklist_ebird_train = output_root_train + "/blacklist_ebird.txt"
output_blacklist_ebird_test = output_root_test + "/blacklist_ebird.txt"

# Blacklist of naive classes (eBird code + call type) that occur in only one split.
output_blacklist_naive_train = output_root_train + "/blacklist_naive.txt"
output_blacklist_naive_test = output_root_test + "/blacklist_naive.txt"

# Blacklist of train label entries; only written for the train split.
output_blacklist_files_train = output_root_train + "/blacklist_files.txt"

# Location handed to readLabeledMapping to load the call type mapping.
file_calltype_mapping = output_root + "/call_types_list"

# Determine ebird code blacklist (unused)

In [34]:
# Distinct eBird codes per split. Both DatasetDicts expose their rows under the "train" key.
train_ebirdcodes = train_dataset["train"].unique("ebird_code")
test_ebirdcodes = test_dataset["train"].unique("ebird_code")

print(f"Train eBird codes ('{len(train_ebirdcodes)}'):", train_ebirdcodes)
print(f"Test eBird codes ('{len(test_ebirdcodes)}'):", test_ebirdcodes)

# A code is blacklisted for a split when the other split has no example of it.
train_blacklist = set(train_ebirdcodes) - set(test_ebirdcodes)
test_blacklist = set(test_ebirdcodes) - set(train_ebirdcodes)

print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist)
print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist)

# Write the codes sorted so the files are identical across runs (set order is arbitrary).
with open(output_blacklist_ebird_train, "w") as f:
    f.write("# eBird codes not present in the test set\n")
    f.writelines(f"{code}\n" for code in sorted(train_blacklist))

with open(output_blacklist_ebird_test, "w") as f:
    f.write("# eBird codes not present in the train set\n")
    f.writelines(f"{code}\n" for code in sorted(test_blacklist))

Train call types ('56'): ['blawoo1', 'carcro1', 'coatit2', 'comcha', 'comchi1', 'comcuc', 'firecr1', 'comnig1', 'comrav', 'cowpig1', 'cretit2', 'dunnoc1', 'eurbla', 'blackc1', 'blutit', 'eurbul', 'eugwoo2', 'eurjac', 'eurjay1', 'eurnut2', 'eursis', 'skylar', 'eurtre1', 'eurwoo', 'winwre4', 'eurgol', 'eurgre1', 'eupfly1', 'eurrob1', 'eursta', 'eutdov', 'fieldf', 'garwar1', 'goldcr1', 'gyfwoo1', 'grswoo', 'gretit1', 'grewhi1', 'hawfin', 'martit2', 'miswoo1', 'misthr1', 'norlap', 'redcro', 'shttre1', 'sonthr1', 'spofly1', 'stodov1', 'tawowl1', 'trepip', 'whtdip1', 'wiltit1', 'wlwwar', 'woowar', 'yellow2', 'NA']
Test call types ('60'): ['winwre4', 'comcha', 'gretit1', 'misthr1', 'hawfin', 'coatit2', 'eurrob1', 'blutit', 'grswoo', 'goldcr1', 'yellow2', 'shttre1', 'brambl', 'eurgol', 'eurnut2', 'NA', 'dunnoc1', 'sonthr1', 'norlap', 'eurbla', 'martit2', 'wlwwar', 'cowpig1', 'eurjay1', 'comchi1', 'cretit2', 'eugwoo2', 'firecr1', 'stodov1', 'eursta', 'tawowl1', 'blackc1', 'eurgre1', 'spofly1', 

# Ensuring valid call type mapping

In [35]:
# Raw call type labels of each split; the column is named differently in train and test.
train_calltypes = train_dataset["train"].unique("call_type")
test_calltypes = test_dataset["train"].unique("vocalization_type")

print(f"Train call types ('{len(train_calltypes)}'):", train_calltypes)
print(f"Test call types ('{len(test_calltypes)}'):", test_calltypes)

# The loaded mapping holds one dict per split ("train" / "test"), each translating a
# raw label into its short call type code.
calltype_mapping = readLabeledMapping(file_calltype_mapping, None)
calltype_mapping_test = calltype_mapping["test"]
calltype_mapping_train = calltype_mapping["train"]

print(f"Call type mapping test ('{len(calltype_mapping_test)}'):", calltype_mapping_test)
print(f"Call type mapping train ('{len(calltype_mapping_train)}'):", calltype_mapping_train)

# Ensuring valid call type mapping
missing_train = set(train_calltypes) - set(calltype_mapping_train.keys())
missing_test = set(test_calltypes) - set(calltype_mapping_test.keys())

if missing_train:
    print(f"\nMissing call types in train mapping: {missing_train}")
if missing_test:
    print(f"\nMissing call types in test mapping: {missing_test}")
if missing_train or missing_test:
    # Stop here: an unmapped label would otherwise become a None short call type and
    # silently turn into a "<ebird_code>_None" class further down.
    raise ValueError(
        f"Call type mapping loaded from {file_calltype_mapping} is incomplete "
        f"(train: {sorted(missing_train)}, test: {sorted(missing_test)})"
    )
print("\nAll call types are mapped correctly.")

# Update datasets with call type mappings. Direct indexing raises a KeyError for an
# unmapped label instead of storing None.
train_dataset = train_dataset.map(lambda x: {"short_call_type": calltype_mapping_train[x["call_type"]]})
test_dataset = test_dataset.map(lambda x: {"short_call_type": calltype_mapping_test[x["vocalization_type"]]})

Train call types ('15'): ['f (Flugruf)', 'c (Kontaktruf)', 's (Gesang)', 't (Trommeln)', 'b (Bettelruf)', 'e (Erregungsruf)', 'ic (Interaktionsrufe)', 'ac (Alarmruf)', 's2 (Zweitgesang)', 'wb (Fluegelschlag)', 'nfc (Nachtzugruf)', 'rs (Rufe am Schlafplatz)', 'ac_b (Alarmruf_Bodenfeinde)', 'ac_f (Alarmruf Luftfeinde)', 'NA']
Test call types ('13'): ['song', 'contact call', 'flight call', 'something', 'drumming', 'alarm call', 'vocal', 'ground alarm call', 'call', 'whistle', 'air alarm call', 'excitement call', 'sub song']
Call type mapping test ('13'): {'song': 'sn', 'contact call': 'ct', 'flight call': 'fl', 'something': 'or', 'drumming': 'dr', 'alarm call': 'al', 'vocal': 'or', 'ground alarm call': 'ag', 'call': 'or', 'whistle': 'or', 'air alarm call': 'ai', 'excitement call': 'ex', 'sub song': 'st'}
Call type mapping train ('15'): {'f (Flugruf)': 'fl', 'c (Kontaktruf)': 'ct', 's (Gesang)': 'sn', 't (Trommeln)': 'dr', 'b (Bettelruf)': 'or', 'e (Erregungsruf)': 'ex', 'ic (Interaktionsr

# Determine naive classes

In [36]:
def _with_naive_class(example):
    """Build the combined "<ebird_code>_<short_call_type>" label for one example."""
    combined = "{}_{}".format(example["ebird_code"], example["short_call_type"])
    return {"ebird_code_and_call": combined}


# Attach the combined label to every example of both splits.
test_dataset = test_dataset.map(_with_naive_class)
train_dataset = train_dataset.map(_with_naive_class)

# Distinct combined labels ("naive classes") per split.
test_naive_classes = test_dataset["train"].unique("ebird_code_and_call")
train_naive_classes = train_dataset["train"].unique("ebird_code_and_call")

print(f"Test naive classes ('{len(test_naive_classes)}'):", test_naive_classes)
print(f"Train naive classes ('{len(train_naive_classes)}'):", train_naive_classes)

Test naive classes ('134'): ['winwre4_sn', 'comcha_ct', 'comcha_sn', 'gretit1_ct', 'misthr1_fl', 'comcha_fl', 'hawfin_or', 'coatit2_or', 'eurrob1_or', 'blutit_or', 'grswoo_or', 'goldcr1_or', 'yellow2_or', 'shttre1_or', 'brambl_fl', 'coatit2_sn', 'blutit_sn', 'eurgol_fl', 'eurnut2_ct', 'NA_ct', 'blutit_ct', 'dunnoc1_or', 'sonthr1_or', 'norlap_or', 'eurrob1_sn', 'sonthr1_sn', 'NA_or', 'eurbla_sn', 'martit2_sn', 'misthr1_sn', 'dunnoc1_sn', 'grswoo_dr', 'eurnut2_sn', 'wlwwar_sn', 'gretit1_sn', 'cowpig1_sn', 'NA_al', 'eurbla_al', 'eurjay1_ct', 'comchi1_sn', 'NA_sn', 'cretit2_ct', 'goldcr1_sn', 'comchi1_ct', 'eugwoo2_sn', 'winwre4_ct', 'martit2_ct', 'firecr1_sn', 'grswoo_ct', 'stodov1_sn', 'sonthr1_fl', 'stodov1_fl', 'eursta_sn', 'tawowl1_or', 'blackc1_sn', 'eurrob1_ct', 'shttre1_ct', 'eurbla_ag', 'shttre1_sn', 'eurgre1_fl', 'spofly1_ct', 'goldcr1_ct', 'gyfwoo1_sn', 'gyfwoo1_dr', 'eursta_or', 'firecr1_ct', 'sonthr1_al', 'woowar_sn', 'woowar_or', 'comcuc_sn', 'eurtre1_sn', 'blackc1_ct', 'eurb

# Determine naive blacklist (unused)

In [37]:
# Distinct naive classes (eBird code + short call type) per split.
train_ebird_call_codes = train_dataset["train"].unique("ebird_code_and_call")
test_ebird_call_codes = test_dataset["train"].unique("ebird_code_and_call")

print(f"Train naive classes ('{len(train_ebird_call_codes)}'):", train_ebird_call_codes)
print(f"Test naive classes ('{len(test_ebird_call_codes)}'):", test_ebird_call_codes)

# A naive class is blacklisted for a split when the other split has no example of it.
# These names replace the eBird-code blacklists of the same name; the filter cell
# further down relies on this naive-class version.
train_blacklist = set(train_ebird_call_codes) - set(test_ebird_call_codes)
test_blacklist = set(test_ebird_call_codes) - set(train_ebird_call_codes)

print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist)
print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist)

# Write the classes sorted so the files are identical across runs (set order is
# arbitrary). The header names what the file actually lists: naive classes.
with open(output_blacklist_naive_train, "w") as f:
    f.write("# Naive classes (eBird code + call type) not present in the test set\n")
    f.writelines(f"{code}\n" for code in sorted(train_blacklist))

with open(output_blacklist_naive_test, "w") as f:
    f.write("# Naive classes (eBird code + call type) not present in the train set\n")
    f.writelines(f"{code}\n" for code in sorted(test_blacklist))

Train naive classes ('200'): ['blawoo1_fl', 'blawoo1_ct', 'blawoo1_sn', 'blawoo1_dr', 'blawoo1_or', 'blawoo1_ex', 'carcro1_fl', 'carcro1_ct', 'coatit2_ct', 'coatit2_sn', 'coatit2_fl', 'comcha_sn', 'comcha_ct', 'comcha_fl', 'comcha_al', 'comchi1_sn', 'comchi1_ct', 'comchi1_ex', 'comchi1_fl', 'comchi1_al', 'comcuc_sn', 'comcuc_st', 'comcuc_ct', 'comcuc_fl', 'firecr1_sn', 'firecr1_ct', 'firecr1_fl', 'comnig1_sn', 'comnig1_ct', 'comrav_ct', 'comrav_fl', 'comrav_or', 'cowpig1_sn', 'cowpig1_al', 'cowpig1_or', 'cretit2_ct', 'cretit2_sn', 'cretit2_or', 'dunnoc1_sn', 'dunnoc1_or', 'dunnoc1_fl', 'dunnoc1_ct', 'eurbla_ct', 'eurbla_sn', 'eurbla_or', 'eurbla_al', 'eurbla_ag', 'eurbla_fl', 'eurbla_st', 'blackc1_sn', 'blackc1_ct', 'blackc1_al', 'blackc1_fl', 'blackc1_ex', 'blackc1_st', 'blutit_ct', 'blutit_sn', 'blutit_al', 'blutit_fl', 'eurbul_or', 'eurbul_fl', 'eurbul_sn', 'eurbul_ct', 'eugwoo2_ct', 'eugwoo2_sn', 'eugwoo2_fl', 'eurjac_fl', 'eurjac_ct', 'eurjay1_ct', 'eurjay1_sn', 'eurjay1_al', 'eur

# Determine missing audio files and drop them from the train data

In [38]:
# File names referenced by the train labels before any filtering.
# NOTE: this cell is not idempotent - it overwrites train_dataset, so re-running it
# without reloading the data yields an empty blacklist and rewrites the file as empty.
original_filepaths = set(train_dataset["train"]["actual_filename"])

# Keep only the examples whose audio exists on disk as either .flac or .wav.
train_dataset = train_dataset.filter(
    lambda x: path.exists(f"/workspace/oekofor/dataset/{x['actual_filename']}.flac")
    or path.exists(f"/workspace/oekofor/dataset/{x['actual_filename']}.wav")
)

# Everything that was dropped by the filter above.
blacklist_train_files = original_filepaths - set(train_dataset["train"]["actual_filename"])

print(f"Blacklist files ('{len(blacklist_train_files)}'):", blacklist_train_files)

# Sorted for a reproducible file; key=str keeps the sort working should a file name be None.
with open(output_blacklist_files_train, "w") as f:
    f.write("# File paths not present in the train set\n")
    f.writelines(f"{filepath}\n" for filepath in sorted(blacklist_train_files, key=str))

Blacklist naive ('0'): set()


# Apply filters (unused)

In [39]:
def _not_in_train_blacklist(example):
    """True when the example's naive class is not in train_blacklist."""
    return example["ebird_code_and_call"] not in train_blacklist


def _not_in_test_blacklist(example):
    """True when the example's naive class is not in test_blacklist."""
    return example["ebird_code_and_call"] not in test_blacklist


# Keep only the examples whose naive class is not blacklisted for their split.
filtered_train_dataset = train_dataset.filter(_not_in_train_blacklist)
filtered_test_dataset = test_dataset.filter(_not_in_test_blacklist)


# Output filtered useable naive data (unused)

In [40]:
# Rows of each split after and before the naive-class blacklist filter.
filtered_train_rows = filtered_train_dataset["train"]
filtered_test_rows = filtered_test_dataset["train"]

filtered_train_length, filtered_test_length = len(filtered_train_rows), len(filtered_test_rows)
source_train_length, source_test_length = len(train_dataset["train"]), len(test_dataset["train"])

# Number of naive classes that remain after filtering ...
class_count_train = len(filtered_train_rows.unique("ebird_code_and_call"))
class_count_test = len(filtered_test_rows.unique("ebird_code_and_call"))

# ... and the size of each blacklist.
class_train_filtered_count, class_test_filtered_count = len(train_blacklist), len(test_blacklist)

print(f"Filtered train dataset has length {filtered_train_length} (original 'valid' {source_train_length}) with {class_count_train} ({class_train_filtered_count} filtered) unique classes")
print(f"Filtered test  dataset has length {filtered_test_length} (original 'valid' {source_test_length}) with {class_count_test} ({class_test_filtered_count} filtered) unique classes")

Filtered train dataset has length 86514 (original 'valid' 91771) with 107 (93 filtered) unique classes
Filtered test  dataset has length 15555 (original 'valid' 16020) with 107 (27 filtered) unique classes


# Output used data latex stats

In [41]:
latex_output_file = "/workspace/projects/callbird/notebooks/latex_output"

train_split = train_dataset["train"]
test_split = test_dataset["train"]

train_length = len(train_split)
test_length = len(test_split)

# Note: these reuse the names of the filtered-data counts, now for the unfiltered datasets.
class_count_train = len(train_split.unique("ebird_code_and_call"))
class_count_test = len(test_split.unique("ebird_code_and_call"))

print("Train data contains {} classes".format(class_count_train))
print("Test data contains {} classes".format(class_count_test))

train_ebirds = train_split.unique("ebird_code")
test_ebirds = test_split.unique("ebird_code")

# Single pass over the train examples that collects, per eBird code,
#   * the first common name seen (warning on any conflicting later name), and
#   * the number of examples per short call type.
train_ebird_common_name_dict = {}
train_ebird_calls_count = {}
for example in train_split:
    ebird_code = example["ebird_code"]
    common_name = example["common_name"]
    call_type = example["short_call_type"]

    known_name = train_ebird_common_name_dict.setdefault(ebird_code, common_name)
    if known_name != common_name:
        print(f"Warning: eBird code '{ebird_code}' has multiple common names: '{known_name}' and '{common_name}'")

    per_call_counts = train_ebird_calls_count.setdefault(ebird_code, {})
    per_call_counts[call_type] = per_call_counts.get(call_type, 0) + 1

# map train_ebirds to names
train_ebird_common_names = [train_ebird_common_name_dict[ebird] for ebird in train_ebirds]
print(f"Train eBird common names ('{len(train_ebird_common_names)}'):", train_ebird_common_names)

# From here on the call type lists hold the short codes, not the raw labels.
train_calltypes = train_split.unique("short_call_type")
test_calltypes = test_split.unique("short_call_type")

print(f"Train eBird codes ('{len(train_ebirds)}'):", train_ebirds)
print(f"Test eBird codes ('{len(test_ebirds)}'):", test_ebirds)

print(f"Train call types ('{len(train_calltypes)}'):", train_calltypes)
print(f"Test call types ('{len(test_calltypes)}'):", test_calltypes)

# LaTeX table: one row per train eBird code (shown by common name), one column per
# short call type; each cell is the example count, or "-" when there are none.
with open(latex_output_file, "w") as f:
    column_style = "|c" * (len(train_calltypes))
    f.write(f"\\begin{{tabular}}{{|l{column_style}|}}\n")
    f.write("\\hline\n")
    calltype_columns = " & ".join(train_calltypes)
    f.write(f"eBird Code & {calltype_columns} \\\\\n")
    f.write("\\hline\n")
    for ebird in train_ebirds:
        common_name = train_ebird_common_name_dict[ebird]
        calltype_bird_count = [train_ebird_calls_count[ebird].get(calltype, 0) for calltype in train_calltypes]
        row_cells = " & ".join(str(count) if count > 0 else "-" for count in calltype_bird_count)
        f.write(f"{common_name} & {row_cells} \\\\\n")
    f.write("\\hline\n")
    f.write("\\end{tabular}\n")


Train data contains 200 classes
Test data contains 134 classes
Train eBird common names ('56'): ['Black Woodpecker', 'Carrion Crow', 'Coal Tit', 'Common Chaffinch', 'Common Chiffchaff', 'Common Cuckoo', 'Common Firecrest', 'Common Nightingale', 'Common Raven', 'Common Wood-Pigeon', 'Crested Tit', 'Dunnock', 'Eurasian Blackbird', 'Eurasian Blackcap', 'Eurasian Blue Tit', 'Eurasian Bullfinch', 'Eurasian Green Woodpecker', 'Eurasian Jackdaw', 'Eurasian Jay', 'Eurasian Nuthatch', 'Eurasian Siskin', 'Eurasian Skylark', 'Eurasian Treecreeper', 'Eurasian Woodcock', 'Eurasian Wren', 'European Goldfinch', 'European Greenfinch', 'European Pied Flycatcher', 'European Robin', 'European Starling', 'European Turtle-Dove', 'Fieldfare', 'Garden Warbler', 'Goldcrest', 'Gray-headed Woodpecker', 'Great Spotted Woodpecker', 'Great Tit', 'Greater Whitethroat', 'Hawfinch', 'Marsh Tit', 'Middle Spotted Woodpecker', 'Mistle Thrush', 'Northern Lapwing', 'Red Crossbill', 'Short-toed Treecreeper', 'Song Thrush',