In [1]:
def generate_blacklist(train_dataset, test_dataset, name, blacklist_dir="/workspace/projects/callbird/blacklists"):
    """Write train/test class blacklists and print the effect of applying them.

    A class (a value of the ``ebird_code_and_call`` column) is blacklisted for
    one dataset when it does not occur in the other dataset. Two text files are
    written, ``{name}_train.txt`` and ``{name}_test.txt``, each starting with a
    ``#`` header line followed by one class per line in sorted order.

    Args:
        train_dataset: Object whose ``["train"]`` entry supports
            ``unique(column)`` and ``len()``, and which itself supports
            ``filter(predicate)`` (presumably a Hugging Face ``DatasetDict``;
            confirm against ``load_train_dataset``).
        test_dataset: Same interface as ``train_dataset``; its ``"train"``
            entry is the one that is read.
        name: Prefix used for the two blacklist file names.
        blacklist_dir: Directory the files are written to. It is created if it
            does not exist. Defaults to the project's blacklist folder.

    Returns:
        None. Results are reported via ``print`` and the two written files.
    """
    import os  # local import so this cell does not depend on a later import cell

    column = "ebird_code_and_call"
    train_ebird_call_codes = train_dataset["train"].unique(column)
    test_ebird_call_codes = test_dataset["train"].unique(column)

    print(f"Train naive classes ('{len(train_ebird_call_codes)}'):", train_ebird_call_codes)
    print(f"Test naive classes ('{len(test_ebird_call_codes)}'):", test_ebird_call_codes)

    # Classes that exist on one side only; sets keep the membership test in the filter cheap.
    train_blacklist = set(train_ebird_call_codes) - set(test_ebird_call_codes)
    test_blacklist = set(test_ebird_call_codes) - set(train_ebird_call_codes)

    # Set iteration order of strings differs between interpreter runs, so sort
    # before printing/writing to keep the output files stable. key=str avoids
    # a TypeError should the column contain a None next to strings.
    train_blacklist_sorted = sorted(train_blacklist, key=str)
    test_blacklist_sorted = sorted(test_blacklist, key=str)

    print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist_sorted)
    print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist_sorted)

    os.makedirs(blacklist_dir, exist_ok=True)

    with open(os.path.join(blacklist_dir, f"{name}_train.txt"), "w") as f:
        f.write("# eBird codes not present in the test set\n")
        f.writelines(f"{code}\n" for code in train_blacklist_sorted)

    with open(os.path.join(blacklist_dir, f"{name}_test.txt"), "w") as f:
        f.write("# eBird codes not present in the train set\n")
        f.writelines(f"{code}\n" for code in test_blacklist_sorted)

    # Apply blacklist to datasets
    filtered_train_dataset = train_dataset.filter(lambda example: example[column] not in train_blacklist)
    filtered_test_dataset = test_dataset.filter(lambda example: example[column] not in test_blacklist)

    filtered_train = filtered_train_dataset["train"]
    filtered_test = filtered_test_dataset["train"]
    train_train = train_dataset["train"]
    test_train = test_dataset["train"]
    print(f"Train classes before / after filtering: {len(train_ebird_call_codes)} / {len(filtered_train.unique(column))}")
    print(f"Test classes before / after filtering: {len(test_ebird_call_codes)} / {len(filtered_test.unique(column))}")
    print(f"Train before / after filtering: {len(train_train)} / {len(filtered_train)}")
    print(f"Test before / after filtering: {len(test_train)} / {len(filtered_test)}")

In [2]:
from callbird.src.datasets.load_test_dataset import load_test_dataset
from callbird.src.datasets.load_train_dataset import load_train_dataset

In [None]:
# Stats for same space stage
# The train and test loaders share one call-type mapping, so name the path once.
same_space_map = "/workspace/projects/callbird/call_type_mappings/same_space_map"
generate_blacklist(
    train_dataset=load_train_dataset(same_space_map, None, filter_naive=None, limit_samples=False),
    test_dataset=load_test_dataset(same_space_map, None, filter_naive=None, unknown_ebird_code="NA", filter_unspecified=False),
    name="same_space",
)

In [None]:
# Stats for limit samples stage
# Uses the same_space mapping; the train set is loaded with limit_samples=True.
limit_samples_map = "/workspace/projects/callbird/call_type_mappings/same_space_map"
generate_blacklist(
    train_dataset=load_train_dataset(limit_samples_map, None, filter_naive=None, limit_samples=True),
    test_dataset=load_test_dataset(limit_samples_map, None, filter_naive=None, unknown_ebird_code="NA", filter_unspecified=False),
    name="limit_samples",
)

In [None]:
# Stats for alarm merge stage
alarm_merge_map = "/workspace/projects/callbird/call_type_mappings/alarm_merge_map"
generate_blacklist(
    train_dataset=load_train_dataset(alarm_merge_map, None, filter_naive=None, limit_samples=True),
    test_dataset=load_test_dataset(alarm_merge_map, None, filter_naive=None, unknown_ebird_code="NA", filter_unspecified=False),
    name="alarm_merge",
)

In [3]:
# Stats for song merge stage
song_merge_map = "/workspace/projects/callbird/call_type_mappings/song_merge_map"
generate_blacklist(
    train_dataset=load_train_dataset(song_merge_map, None, filter_naive=None, limit_samples=True),
    test_dataset=load_test_dataset(song_merge_map, None, filter_naive=None, unknown_ebird_code="NA", filter_unspecified=False),
    name="song_merge",
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Train naive classes ('190'): ['blawoo1_flight', 'blawoo1_contact', 'blawoo1_drumming', 'blawoo1_other', 'blawoo1_excitement', 'carcro1_flight', 'carcro1_contact', 'coatit2_contact', 'coatit2_flight', 'comcha_contact', 'comcha_flight', 'comcha_alarm', 'comchi1_contact', 'comchi1_excitement', 'comchi1_flight', 'comchi1_alarm', 'comcuc_song', 'comcuc_contact', 'comcuc_flight', 'firecr1_contact', 'firecr1_flight', 'comnig1_contact', 'comrav_contact', 'comrav_flight', 'comrav_NA', 'cowpig1_alarm', 'cowpig1_NA', 'cretit2_contact', 'cretit2_other', 'dunnoc1_other', 'dunnoc1_flight', 'dunnoc1_contact', 'eurbla_contact', 'eurbla_other', 'eurbla_alarm', 'eurbla_alarm_ground', 'eurbla_flight', 'eurbla_song', 'blackc1_contact', 'blackc1_alarm', 'blackc1_flight', 'blackc1_excitement', 'blackc1_song', 'blutit_contact', 'blutit_alarm', 'blutit_flight', 'eurbul_other', 'eurbul_flight', 'eurbul_contact', 'eugwoo2_contact', 'eugwoo2_flight', 'eurjac_flight', 'eurjac_contact', 'eurjay1_contact', 'eurjay1

Filter:   0%|          | 0/52167 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15950 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/45474 [00:00<?, ? examples/s]

Train classes before / after filtering: 190 / 98


Flattening the indices:   0%|          | 0/15593 [00:00<?, ? examples/s]

Test classes before / after filtering: 122 / 98
Train before / after filtering: 52167 / 45474
Test before / after filtering: 15950 / 15593


In [4]:
# Stats for unspecified stage
# Here the test set is loaded with filter_unspecified=True.
unspecified_map = "/workspace/projects/callbird/call_type_mappings/unspecified_map"
generate_blacklist(
    train_dataset=load_train_dataset(unspecified_map, None, filter_naive=None, limit_samples=True),
    test_dataset=load_test_dataset(unspecified_map, None, filter_naive=None, unknown_ebird_code="NA", filter_unspecified=True),
    name="unspecified",
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15009 [00:00<?, ? examples/s]

Map:   0%|          | 0/15009 [00:00<?, ? examples/s]

Train naive classes ('190'): ['blawoo1_flight', 'blawoo1_contact', 'blawoo1_drumming', 'blawoo1_other', 'blawoo1_excitement', 'carcro1_flight', 'carcro1_contact', 'coatit2_contact', 'coatit2_flight', 'comcha_contact', 'comcha_flight', 'comcha_alarm', 'comchi1_contact', 'comchi1_excitement', 'comchi1_flight', 'comchi1_alarm', 'comcuc_song', 'comcuc_contact', 'comcuc_flight', 'firecr1_contact', 'firecr1_flight', 'comnig1_contact', 'comrav_contact', 'comrav_flight', 'comrav_NA', 'cowpig1_alarm', 'cowpig1_NA', 'cretit2_contact', 'cretit2_other', 'dunnoc1_other', 'dunnoc1_flight', 'dunnoc1_contact', 'eurbla_contact', 'eurbla_other', 'eurbla_alarm', 'eurbla_alarm_ground', 'eurbla_flight', 'eurbla_song', 'blackc1_contact', 'blackc1_alarm', 'blackc1_flight', 'blackc1_excitement', 'blackc1_song', 'blutit_contact', 'blutit_alarm', 'blutit_flight', 'eurbul_other', 'eurbul_flight', 'eurbul_contact', 'eugwoo2_contact', 'eugwoo2_flight', 'eurjac_flight', 'eurjac_contact', 'eurjay1_contact', 'eurjay1

Filter:   0%|          | 0/15009 [00:00<?, ? examples/s]

Train classes before / after filtering: 190 / 98


Flattening the indices:   0%|          | 0/14971 [00:00<?, ? examples/s]

Test classes before / after filtering: 117 / 98
Train before / after filtering: 52167 / 45474
Test before / after filtering: 15009 / 14971
