In [None]:
# Base directory for the files this notebook writes, with one sub-directory per split.
output_root = "/workspace/projects/callbird/datastats"
output_root_train = output_root + "/train"
output_root_test = output_root + "/test"

# Blacklists of eBird codes that occur in only one of the two splits.
output_blacklist_ebird_train = output_root_train + "/blacklist_ebird.txt"
output_blacklist_ebird_test = output_root_test + "/blacklist_ebird.txt"

# Blacklists of combined "<ebird_code>_<short_call_type>" classes that occur in only one split.
output_blacklist_naive_train = output_root_train + "/blacklist_naive.txt"
output_blacklist_naive_test = output_root_test + "/blacklist_naive.txt"

# NOTE(review): not referenced in the visible part of the notebook; presumably a per-file blacklist.
output_blacklist_files_train = output_root_train + "/blacklist_files.txt"

# Call type mapping file, read through `readLabeledMapping`.
file_calltype_mapping = "/workspace/projects/callbird/datastats/call_types_list"

In [None]:
from callbird.src.readUtils import readLabeledMapping, readCommentedList
from datasets import load_dataset, Features, Value, concatenate_datasets
from os import path

In [None]:
# Cache directory handed to both `load_dataset` calls via their `cache_dir=` argument.
# NOTE(review): None presumably selects the library's default cache location — confirm.
cache_dir = None

In [None]:
# Column schema of the test-set label CSVs.
_test_label_features = Features({ # TODO: Add all features available in BirdSet
    "ebird_code": Value("string"),
    "common_name": Value("string"),
    "vocalization_type": Value("string"),
    "start_time": Value("float"),
    "end_time": Value("float"),
    "audio_filename": Value("string"),
})

# Column schema of the train-set label CSVs (these files are semicolon-separated).
_train_label_features = Features({ # TODO: Add all features available in BirdSet
    "ebird_code": Value("string"),
    "call_type": Value("string"),
    "start_sample [s]": Value("float"),
    "end_sample [s]": Value("float"),
    "actual_filename": Value("string"),
})

# Load every test label CSV; the rows are accessed through the "train" key below.
test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    features=_test_label_features,
    cache_dir=cache_dir,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)
# Row count before any filtering, kept for the summary printed later on.
original_test_dataset_length = test_dataset["train"].num_rows

# Load every train label CSV with the same settings, plus the ";" delimiter.
train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    features=_train_label_features,
    delimiter=";",
    cache_dir=cache_dir,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)
# Row count before any filtering, kept for the summary printed later on.
original_train_dataset_length = train_dataset["train"].num_rows

In [None]:
def _fill_missing(column, placeholder="NA"):
    """Build a `map` callback that replaces a None in `column` with `placeholder`."""
    def _apply(row):
        value = row[column]
        return {column: placeholder if value is None else value}
    return _apply


def _fill_missing_test_ebird_code(row):
    """Fill a missing test-set eBird code.

    The test set has two different reasons for a missing code: rows whose
    common name is "Bird" become "UNKNOWN", every other missing code becomes "NA".
    """
    code = row["ebird_code"]
    if code is not None:
        return {"ebird_code": code}
    return {"ebird_code": "UNKNOWN" if row["common_name"] == "Bird" else "NA"}


# Replace missing eBird codes (the test set needs the special-case handling above).
train_dataset = train_dataset.map(_fill_missing("ebird_code"))
test_dataset = test_dataset.map(_fill_missing_test_ebird_code)

# Replace missing call / vocalization types.
train_dataset = train_dataset.map(_fill_missing("call_type"))
test_dataset = test_dataset.map(_fill_missing("vocalization_type"))

# Keep references to the datasets as they are before any blacklist filtering.
oritinal_test_data_copyset = test_dataset
oritinal_train_data_copyset = train_dataset

# Show every distinct call type per split.
print("Unique call types in train dataset:", set(train_dataset['train']['call_type']))
print("Unique vocalization types in test dataset:", set(test_dataset['train']['vocalization_type']))

In [None]:
# Distinct eBird codes present in each split.
train_ebirdcodes = train_dataset["train"].unique("ebird_code")
test_ebirdcodes = test_dataset["train"].unique("ebird_code")

# These are eBird codes, so the labels say so (they previously read "call types").
print(f"Train eBird codes ('{len(train_ebirdcodes)}'):", train_ebirdcodes)
print(f"Test eBird codes ('{len(test_ebirdcodes)}'):", test_ebirdcodes)

# A code is blacklisted for a split when the other split never contains it.
train_blacklist = set(train_ebirdcodes) - set(test_ebirdcodes)
test_blacklist = set(test_ebirdcodes) - set(train_ebirdcodes)

print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist)
print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist)

# Write the blacklists sorted: set iteration order is not stable between runs,
# which would otherwise reshuffle the files on every execution.
with open(output_blacklist_ebird_train, "w") as f:
    f.write("# eBird codes not present in the test set\n")
    f.writelines(f"{code}\n" for code in sorted(train_blacklist))

with open(output_blacklist_ebird_test, "w") as f:
    f.write("# eBird codes not present in the train set\n")
    f.writelines(f"{code}\n" for code in sorted(test_blacklist))

# Apply blacklists
# The files are read back instead of reusing the in-memory sets.
# NOTE(review): presumably `readCommentedList` skips the "#" header line — confirm.
ebird_blacklist_train = readCommentedList(output_blacklist_ebird_train)
ebird_blacklist_test = readCommentedList(output_blacklist_ebird_test)

test_dataset = test_dataset.filter(lambda x: x["ebird_code"] not in ebird_blacklist_test)
train_dataset = train_dataset.filter(lambda x: x["ebird_code"] not in ebird_blacklist_train)

In [None]:
# Distinct call types left in each split after the eBird blacklist was applied.
train_calltypes = train_dataset["train"].unique("call_type")
test_calltypes = test_dataset["train"].unique("vocalization_type")

print(f"Train call types ('{len(train_calltypes)}'):", train_calltypes)
print(f"Test call types ('{len(test_calltypes)}'):", test_calltypes)

# The mapping file provides one mapping per split, accessed through the "test" and "train" keys.
# NOTE(review): the meaning of the second argument (None) is not visible here — confirm.
calltype_mapping = readLabeledMapping(file_calltype_mapping, None)
calltype_mapping_test = calltype_mapping["test"]
calltype_mapping_train = calltype_mapping["train"]

print(f"Call type mapping test ('{len(calltype_mapping_test)}'):", calltype_mapping_test)
print(f"Call type mapping train ('{len(calltype_mapping_train)}'):", calltype_mapping_train)

# Ensuring valid call type mapping
missing_train = set(train_calltypes) - set(calltype_mapping_train.keys())
missing_test = set(test_calltypes) - set(calltype_mapping_test.keys())

if missing_train:
    print(f"\nMissing call types in train mapping: {missing_train}")
if missing_test:
    print(f"\nMissing call types in test mapping: {missing_test}")
if missing_train or missing_test:
    # Stop here: an unmapped call type would otherwise become None and later turn
    # into a bogus "<ebird_code>_None" class instead of raising an error.
    raise ValueError(
        f"Call type mapping '{file_calltype_mapping}' is incomplete "
        f"(train: {sorted(missing_train)}, test: {sorted(missing_test)})"
    )
print("\nAll call types are mapped correctly.")

# Update datasets with call type mappings
# Direct indexing (instead of `.get(..., None)`) raises a KeyError for an unmapped call type.
train_dataset = train_dataset.map(lambda x: {"short_call_type": calltype_mapping_train[x["call_type"]]})
test_dataset = test_dataset.map(lambda x: {"short_call_type": calltype_mapping_test[x["vocalization_type"]]})

# filter out sample with "null" as short call type
# train_dataset = train_dataset.filter(lambda x: x["short_call_type"] != "null")
# test_dataset = test_dataset.filter(lambda x: x["short_call_type"] != "null")

In [None]:
def _with_combined_label(row):
    """Return the combined "<ebird_code>_<short_call_type>" label for one row."""
    return {"ebird_code_and_call": f"{row['ebird_code']}_{row['short_call_type']}"}


# Add the combined label column to both splits.
test_dataset = test_dataset.map(_with_combined_label)
train_dataset = train_dataset.map(_with_combined_label)

In [None]:
# Distinct combined "<ebird_code>_<short_call_type>" classes present in each split.
train_ebird_call_codes = train_dataset["train"].unique("ebird_code_and_call")
test_ebird_call_codes = test_dataset["train"].unique("ebird_code_and_call")

print(f"Train naive classes ('{len(train_ebird_call_codes)}'):", train_ebird_call_codes)
print(f"Test naive classes ('{len(test_ebird_call_codes)}'):", test_ebird_call_codes)

# A class is blacklisted for a split when the other split never contains it.
train_blacklist = set(train_ebird_call_codes) - set(test_ebird_call_codes)
test_blacklist = set(test_ebird_call_codes) - set(train_ebird_call_codes)

print(f"Train blacklist ('{len(train_blacklist)}'):", train_blacklist)
print(f"Test blacklist ('{len(test_blacklist)}'):", test_blacklist)

# The headers now describe the actual file content (combined classes, not plain eBird codes),
# and the entries are sorted because set iteration order is not stable between runs.
with open(output_blacklist_naive_train, "w") as f:
    f.write("# Naive classes (eBird code + call type) not present in the test set\n")
    f.writelines(f"{code}\n" for code in sorted(train_blacklist))

with open(output_blacklist_naive_test, "w") as f:
    f.write("# Naive classes (eBird code + call type) not present in the train set\n")
    f.writelines(f"{code}\n" for code in sorted(test_blacklist))

# Apply blacklists
# The files are read back instead of reusing the in-memory sets.
# NOTE(review): presumably `readCommentedList` skips the "#" header line — confirm.
test_blacklist = readCommentedList(output_blacklist_naive_test)
train_blacklist = readCommentedList(output_blacklist_naive_train)

test_dataset = test_dataset.filter(lambda x: x["ebird_code_and_call"] not in test_blacklist)
train_dataset = train_dataset.filter(lambda x: x["ebird_code_and_call"] not in train_blacklist)

In [None]:
def reduce_samples(dataset, column_name, target_value, max_samples, split="train", seed=42):
    """
    Downsample the rows of one split whose `column_name` equals `target_value`.

    Rows with any other value are kept unchanged. Of the matching rows, at most
    `max_samples` are kept, chosen by a seeded shuffle. The input `dataset` is
    not modified; a shallow copy with the reduced split is returned.

    Args:
        dataset (DatasetDict): The dataset to be downsampled.
        column_name (str): The name of the column to be downsampled.
        target_value (str): The specific value in the column to be downsampled.
        max_samples (int): The maximum number of samples to retain for the target value.
        split (str): The split to downsample. Defaults to "train".
        seed (int): Seed for the shuffle that selects the retained samples. Defaults to 42.
    Returns:
        DatasetDict: A new dataset dict of the same type whose `split` holds the
        untouched rows followed by the retained target rows; other splits are
        carried over as-is.
    """
    source = dataset[split]
    # Only the requested split is filtered; the other splits are never touched.
    matching_rows = source.filter(lambda row: row[column_name] == target_value)
    remaining_rows = source.filter(lambda row: row[column_name] != target_value)

    # Never request more rows than are available.
    keep_count = min(max_samples, len(matching_rows))
    sampled_rows = matching_rows.shuffle(seed=seed).select(range(keep_count))

    # Shallow copy, so the caller's dataset dict keeps its original split.
    reduced = type(dataset)(dataset)
    reduced[split] = concatenate_datasets([remaining_rows, sampled_rows])
    return reduced

# For performance reasons we manually set the downsampling values
# The specific order of downsampling matters: this order filters out the fewest samples
### train_dataset = reduce_samples(train_dataset, "call_type", "s (Gesang)", 15_000)
### train_dataset = reduce_samples(train_dataset, "ebird_code", "NA", 4000)
### train_dataset = reduce_samples(train_dataset, "ebird_code", "grswoo", 4000)
# train_dataset = reduce_samples(train_dataset, "ebird_code", "eurbla", 4000)
# train_dataset = reduce_samples(train_dataset, "ebird_code", "sonthr1", 4000)

In [None]:
# TODO: Remove low count call types

In [None]:
# print four most common ebird_code and call_type values from both datasets
from collections import Counter
# Frequency tables of eBird codes and call / vocalization types after filtering.
train_ebird_code_counts = Counter(train_dataset['train']['ebird_code'])
test_ebird_code_counts = Counter(test_dataset['train']['ebird_code'])
train_call_type_counts = Counter(train_dataset['train']['call_type'])
test_vocalization_type_counts = Counter(test_dataset['train']['vocalization_type'])

# Report the four most frequent values of each table.
for _headline, _counts in (
    ("Most common eBird codes in train dataset:", train_ebird_code_counts),
    ("Most common eBird codes in test dataset:", test_ebird_code_counts),
    ("Most common call types in train dataset:", train_call_type_counts),
    ("Most common vocalization types in test dataset:", test_vocalization_type_counts),
):
    print(_headline, _counts.most_common(4))

# Compare the row counts recorded at load time with the counts after filtering.
filtered_test_dataset_length = test_dataset["train"].num_rows
filtered_train_dataset_length = train_dataset["train"].num_rows
print(f"Original test dataset length: {original_test_dataset_length}, filtered length: {filtered_test_dataset_length}")
print(f"Original train dataset length: {original_train_dataset_length}, filtered length: {filtered_train_dataset_length}")

In [None]:
# Create a table with ebird_code and call type in the rows and columns respectively with the number of samples in each cell determining how much they got filtered
import pandas as pd
import numpy as np
def _split_to_frame(split):
    """Return `split` as a pandas DataFrame.

    DataFrames pass through untouched, objects exposing `to_pandas()` are converted
    with that single call, and anything else goes through `pd.DataFrame(...)`.
    """
    if isinstance(split, pd.DataFrame):
        return split
    to_pandas = getattr(split, "to_pandas", None)
    if callable(to_pandas):
        return to_pandas()
    return pd.DataFrame(split)


def _count_by_code_and_call_type(frame):
    """Count rows per (ebird_code, call type) pair: eBird codes as rows, call types as columns."""
    type_column = 'call_type' if 'call_type' in frame.columns else 'vocalization_type'
    if frame.empty:
        # Nothing to count. This also covers frames without any columns, for which
        # a pivot on 'ebird_code' would raise a KeyError.
        return pd.DataFrame(
            index=pd.Index([], name='ebird_code', dtype=object),
            columns=pd.Index([], name=type_column, dtype=object),
        )
    return pd.pivot_table(frame, index='ebird_code', columns=type_column, aggfunc='size', fill_value=0)


def create_filter_diff_table(original_dataset, filtered_dataset, dataset_name, output_dir=None):
    """
    Compute how many samples filtering removed per (ebird_code, call type) pair.

    Both inputs are read through their 'train' key. The call type column is
    'call_type' when present, otherwise 'vocalization_type'. The result is
    written to "<output_dir>/<dataset_name>_filter_difference.csv".

    Args:
        original_dataset: Mapping whose 'train' entry holds the unfiltered rows.
        filtered_dataset: Mapping whose 'train' entry holds the filtered rows (may be empty).
        dataset_name (str): Prefix of the CSV file name.
        output_dir (str | None): Directory for the CSV file; None falls back to
            the notebook-level `output_root`.
    Returns:
        pandas.DataFrame: Original count minus filtered count, indexed by eBird
        code, with one column per call type.
    """
    original_pivot = _count_by_code_and_call_type(_split_to_frame(original_dataset['train']))
    filtered_pivot = _count_by_code_and_call_type(_split_to_frame(filtered_dataset['train']))

    # Align both tables on the union of codes and call types; a pair that is
    # absent from one table counts as zero there.
    all_ebird_codes = original_pivot.index.union(filtered_pivot.index)
    all_call_types = original_pivot.columns.union(filtered_pivot.columns)

    original_pivot = original_pivot.reindex(index=all_ebird_codes, columns=all_call_types, fill_value=0).astype("int64")
    filtered_pivot = filtered_pivot.reindex(index=all_ebird_codes, columns=all_call_types, fill_value=0).astype("int64")

    # Calculate the difference
    difference = original_pivot - filtered_pivot

    # Save to CSV
    target_dir = output_root if output_dir is None else output_dir
    difference.to_csv(f"{target_dir}/{dataset_name}_filter_difference.csv")

    return difference

# Write the filter-difference tables for both splits to
# "<output_root>/test_data_diff_filter_difference.csv" and
# "<output_root>/train_data_diff_filter_difference.csv".
# The `oritinal_*` (sic) names are the references taken right after the
# missing-value fill, i.e. before any blacklist filtering.
create_filter_diff_table(oritinal_test_data_copyset, test_dataset, "test_data_diff")
# As the last expression of the cell, this call's returned table is what the notebook displays.
create_filter_diff_table(oritinal_train_data_copyset, train_dataset, "train_data_diff")

In [None]:
# Count how often each call / vocalization type occurs in the filtered datasets.
train_call_type_counts = Counter()
train_call_type_counts.update(train_dataset['train']['call_type'])
test_vocalization_type_counts = Counter()
test_vocalization_type_counts.update(test_dataset['train']['vocalization_type'])

# Print the counts as plain dictionaries.
print("Call type counts in train dataset:", {name: count for name, count in train_call_type_counts.items()})
print("Vocalization type counts in test dataset:", {name: count for name, count in test_vocalization_type_counts.items()})