# Load datasets

In [1]:
import os

from datasets import load_dataset, Features, Value

# Train labels: semicolon-delimited CSV files, with both declared columns
# read as strings.
train_features = Features(
    {
        "ebird_code": Value("string"),
        "call_type": Value("string"),
    }
)
train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    delimiter=";",
    features=train_features,
)

# Test labels: CSV files loaded without any extra loader options.
test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

# Setup output files

In [8]:
# Output locations: a common root with one subdirectory per split.
output_root = "/workspace/projects/callbird/datastats"
output_root_train = output_root + "/train"
output_root_test = output_root + "/test"

# Blacklist file for each split.
output_blacklist_train = output_root_train + "/blacklist.txt"
output_blacklist_test = output_root_test + "/blacklist.txt"

# Determine eBird code blacklist

In [12]:
def _write_blacklist(path, header, codes):
    """Write a blacklist file: one comment header line, then one code per line.

    Args:
        path: Destination file path; missing parent directories are created.
        header: Text of the leading comment line (written after "# ").
        codes: Iterable of eBird code strings; written in sorted order so the
            file content is stable across runs.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"# {header}\n")
        for code in sorted(codes):
            f.write(f"{code}\n")


# Unique eBird codes per split (both datasets expose their rows under "train").
train_ebirdcodes = train_dataset["train"].unique("ebird_code")
test_ebirdcodes = test_dataset["train"].unique("ebird_code")

print(f"Train eBird codes ('{len(train_ebirdcodes)}'):", train_ebirdcodes)
print(f"Test eBird codes ('{len(test_ebirdcodes)}'):", test_ebirdcodes)

# A blacklist holds the codes that occur in one split but not in the other.
# Rows without a label surface as None in the unique values; None is not an
# eBird code, so it is never blacklisted.
missing_label = {None}
train_blacklist = set(train_ebirdcodes) - set(test_ebirdcodes) - missing_label
test_blacklist = set(test_ebirdcodes) - set(train_ebirdcodes) - missing_label

print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist)
print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist)

_write_blacklist(
    output_blacklist_train,
    "eBird codes not present in the test set",
    train_blacklist,
)
_write_blacklist(
    output_blacklist_test,
    "eBird codes not present in the train set",
    test_blacklist,
)

Train call types ('56'): ['blawoo1', 'carcro1', 'coatit2', 'comcha', 'comchi1', 'comcuc', 'firecr1', 'comnig1', 'comrav', 'cowpig1', 'cretit2', 'dunnoc1', 'eurbla', 'blackc1', 'blutit', 'eurbul', 'eugwoo2', 'eurjac', 'eurjay1', 'eurnut2', 'eursis', 'skylar', 'eurtre1', 'eurwoo', 'winwre4', 'eurgol', 'eurgre1', 'eupfly1', 'eurrob1', 'eursta', 'eutdov', 'fieldf', 'garwar1', 'goldcr1', 'gyfwoo1', 'grswoo', 'gretit1', 'grewhi1', 'hawfin', 'martit2', 'miswoo1', 'misthr1', 'norlap', 'redcro', 'shttre1', 'sonthr1', 'spofly1', 'stodov1', 'tawowl1', 'trepip', 'whtdip1', 'wiltit1', 'wlwwar', 'woowar', 'yellow2', None]
Test call types ('60'): ['winwre4', 'comcha', 'gretit1', 'misthr1', 'hawfin', 'coatit2', 'eurrob1', 'blutit', 'grswoo', 'goldcr1', 'yellow2', 'shttre1', 'brambl', 'eurgol', 'eurnut2', None, 'dunnoc1', 'sonthr1', 'norlap', 'eurbla', 'martit2', 'wlwwar', 'cowpig1', 'eurjay1', 'comchi1', 'cretit2', 'eugwoo2', 'firecr1', 'stodov1', 'eursta', 'tawowl1', 'blackc1', 'eurgre1', 'spofly1', 

# Create unified call_type values

In [None]:
# Unique raw call-type labels per split. The label column is "call_type" in
# the train set and "vocalization_type" in the test set.
train_calltypes = train_dataset["train"].unique("call_type")
test_calltypes = test_dataset["train"].unique("vocalization_type")

print(f"Train call types ('{len(train_calltypes)}'):", train_calltypes)
print(f"Test call types ('{len(test_calltypes)}'):", test_calltypes)

# Groupings used (unified call type : code):
#   Flight       : fl
#   Contact      : ct
#   Song         : sn
#   Song 2       : st
#   Drumming     : dr
#   Excitement   : ex
#   Alert        : al
#   Alert ground : ag
#   Alert air    : ai
#   Other        : or

# Maps each raw train-set call_type label to its unified two-letter code.
# Labels without a dedicated group are mapped to "or" (Other).
train_map = {
    "f (Flugruf)" : "fl",
    "c (Kontaktruf)" : "ct",
    "s (Gesang)" : "sn",
    "t (Trommeln)" : "dr",
    "b (Bettelruf)" : "or",
    "e (Erregungsruf)" : "ex",
    "ic (Interaktionsrufe)" : "or",
    "ac (Alarmruf)" : "al",
    "s2 (Zweitgesang)" : "st",
    "wb (Fluegelschlag)" : "or",
    "nfc (Nachtzugruf)" : "or",
    "rs (Rufe am Schlafplatz)" : "or",
    "ac_b (Alarmruf_Bodenfeinde)" : "ag",
    "ac_f (Alarmruf Luftfeinde)" : "ai",
    # A missing label is reported by `unique("call_type")` as the value None,
    # not the string "None", so the real None must be a key for the lookup to
    # succeed. The string key is kept for callers that stringify labels first.
    None : "or",
    "None" : "or"
}

# Maps each raw test-set vocalization_type label to its unified two-letter
# code; built from (label, code) pairs so the insertion order is explicit.
test_map = dict(
    [
        ("song", "sn"),
        ("contact call", "ct"),
        ("flight call", "fl"),
        ("something", "or"),
        ("drumming", "dr"),
        ("alarm call", "al"),
        ("vocal", "or"),
        ("ground alarm call", "ag"),
        ("call", "or"),
        ("whistle", "or"),
        ("air alarm call", "ai"),
        ("excitement call", "ex"),
        ("sub song", "st"),
    ]
)

Train call types ('15'): ['f (Flugruf)', 'c (Kontaktruf)', 's (Gesang)', 't (Trommeln)', 'b (Bettelruf)', 'e (Erregungsruf)', 'ic (Interaktionsrufe)', 'ac (Alarmruf)', 's2 (Zweitgesang)', 'wb (Fluegelschlag)', 'nfc (Nachtzugruf)', 'rs (Rufe am Schlafplatz)', 'ac_b (Alarmruf_Bodenfeinde)', 'ac_f (Alarmruf Luftfeinde)', None]
Test call types ('13'): ['song', 'contact call', 'flight call', 'something', 'drumming', 'alarm call', 'vocal', 'ground alarm call', 'call', 'whistle', 'air alarm call', 'excitement call', 'sub song']
