In [18]:
import pandas as pd
import matplotlib.pyplot as plt

def generateDistribution(train_ds, train_column, test_ds, test_column, output_filename):
    """Plot per-category row counts of two datasets side by side and save the chart as a PNG.

    Args:
        train_ds: Mapping with a 'train' entry exposing ``to_pandas()`` (e.g. a DatasetDict).
        train_column: Column of ``train_ds`` to count; also used for the title and x-axis label.
        test_ds: Same kind of object as ``train_ds``; its 'train' split is the one that is read.
        test_column: Column of ``test_ds`` to count.
        output_filename: Output path without extension; '.png' is appended.

    Side effects:
        Prints both count series and writes ``{output_filename}.png``.
    """
    # The datasets are DatasetDict, access the 'train' split before converting to pandas.
    train_counts = train_ds['train'].to_pandas()[train_column].value_counts()
    test_counts = test_ds['train'].to_pandas()[test_column].value_counts()

    print(train_counts)
    print(test_counts)

    # Aligning the two series on the union of their categories leaves NaN wherever a
    # category exists in only one dataset; treat those as a count of 0 and keep ints.
    comparison_df = (
        pd.DataFrame({'Train Dataset': train_counts, 'Test Dataset': test_counts})
        .fillna(0)
        .astype(int)
        .sort_values(by='Train Dataset', ascending=False)
    )

    # rot=90 keeps the category labels vertical so they stay readable.
    ax = comparison_df.plot(kind='bar', figsize=(15, 8), width=0.8, rot=90)
    fig = ax.figure
    try:
        ax.set_title(f'Comparative Distribution of {train_column}')
        ax.set_xlabel(train_column)
        ax.set_ylabel('Count')

        # Adjust layout to make sure everything fits, then write the file.
        fig.tight_layout()
        fig.savefig(f'{output_filename}.png')
    finally:
        # Close exactly this figure (even if saving failed) to free its memory.
        plt.close(fig)


In [19]:
from datasets import concatenate_datasets, load_dataset, Features, Value

In [20]:
# Loader options shared by the test and train CSV loads.
_common_load_kwargs = dict(
    cache_dir=None,
    num_proc=1,
    trust_remote_code=True,  # While not needed for local datasets, it is kept for consistency
)


def _fill_missing_ebird_code(row):
    """Replace a missing ebird_code: "UNKNOWN" when common_name is "Bird", otherwise "NA"."""
    code = row["ebird_code"]
    if code is None:
        code = "UNKNOWN" if row["common_name"] == "Bird" else "NA"
    return {"ebird_code": code}


def _fill_missing_with_na(column):
    """Build a map function that replaces a missing value in `column` with "NA"."""
    def fill(row):
        value = row[column]
        return {column: "NA" if value is None else value}
    return fill


# Test labels: default CSV parsing, call type column is named "vocalization_type".
unfiltered_test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    **_common_load_kwargs,
)
unfiltered_test_dataset = unfiltered_test_dataset.map(_fill_missing_ebird_code)
unfiltered_test_dataset = unfiltered_test_dataset.map(_fill_missing_with_na("vocalization_type"))

# Train labels: semicolon-separated, restricted to an explicit string schema.
_train_features = Features({  # TODO: Add all features available in BirdSet
    "ebird_code": Value("string"),
    "call_type": Value("string"),
    "common_name": Value("string"),
})
unfiltered_train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    delimiter=";",
    features=_train_features,
    **_common_load_kwargs,
)
unfiltered_train_dataset = unfiltered_train_dataset.map(_fill_missing_ebird_code)
unfiltered_train_dataset = unfiltered_train_dataset.map(_fill_missing_with_na("call_type"))

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

In [21]:
from callbird.src.readUtils import readLabeledMapping


def _downsample_by_column(split, column, max_per_value, seed=42):
    """Cap the number of rows per distinct value of `column` at `max_per_value`."""
    parts = []
    for value in split.unique(column):
        subset = split.filter(lambda row: row[column] == value)
        if len(subset) > max_per_value:
            # Seeded shuffle so the retained sample is reproducible across runs.
            subset = subset.shuffle(seed=seed).select(range(max_per_value))
        parts.append(subset)

    # An empty split has no distinct values and therefore nothing to concatenate;
    # hand it back unchanged instead of calling concatenate_datasets with an empty list.
    if not parts:
        return split
    return concatenate_datasets(parts)


def applyFilters(dataset, call_type_name="call_type", max_per_call_type=15_000,
                 max_per_ebird_code=4_000, seed=42):
    """Consolidate call type labels and downsample over-represented classes.

    Steps, applied in this order:
      1. Remap fine-grained call type labels to their consolidated label.
      2. Cap every call type at `max_per_call_type` rows of the 'train' split.
      3. Cap every ebird code at `max_per_ebird_code` rows of the already reduced split.

    Args:
        dataset: DatasetDict-like object with a 'train' split containing the columns
            `call_type_name` and "ebird_code".
        call_type_name: Name of the call type column.
        max_per_call_type: Maximum number of rows kept per call type.
        max_per_ebird_code: Maximum number of rows kept per ebird code.
        seed: Seed for the shuffle that picks which rows survive downsampling.

    Returns:
        The mapped dataset whose 'train' split has been replaced by the downsampled
        rows. After downsampling, rows are grouped by ebird code rather than kept in
        their original order.
    """
    # Define a mapping for call types to consolidate them
    call_type_mapping = {
        "ac_f (Alarmruf Luftfeinde)": "ac (Alarmruf)",
        "ac_b (Alarmruf_Bodenfeinde)": "ac (Alarmruf)",
        "air alarm call": "alarm call",
        "ground alarm call": "alarm call",
        "s2 (Zweitgesang)": "s (Gesang)",
        "sub song": "song"
    }

    # Use a single map operation to apply all call type changes;
    # labels without an entry in the mapping are kept as they are.
    def remap_call_type(example):
        current_value = example[call_type_name]
        return {call_type_name: call_type_mapping.get(current_value, current_value)}

    dataset = dataset.map(remap_call_type)

    # Downsample over-represented call types first, then over-represented ebird codes.
    train_split = _downsample_by_column(dataset['train'], call_type_name, max_per_call_type, seed)
    train_split = _downsample_by_column(train_split, "ebird_code", max_per_ebird_code, seed)
    dataset['train'] = train_split

    return dataset

# The train CSVs name the call type column "call_type", the test CSVs "vocalization_type".
filtered_train_dataset = applyFilters(dataset=unfiltered_train_dataset, call_type_name="call_type")
filtered_test_dataset = applyFilters(dataset=unfiltered_test_dataset, call_type_name="vocalization_type")

In [26]:
# Load the call type tables; the result is indexed by "test" and "train", and each
# entry is used as a lookup from the raw label to its short call type.
calltype_mapping = readLabeledMapping("/workspace/projects/callbird/datastats/call_types_list")
_test_short_call_types = calltype_mapping["test"]
_train_short_call_types = calltype_mapping["train"]

# Add a "short_call_type" column; labels missing from the lookup become None.
filtered_test_dataset = filtered_test_dataset.map(
    lambda row: {"short_call_type": _test_short_call_types.get(row["vocalization_type"])}
)
filtered_train_dataset = filtered_train_dataset.map(
    lambda row: {"short_call_type": _train_short_call_types.get(row["call_type"])}
)

In [28]:
# One entry per chart: (train dataset, train column, test dataset, test column, output name).
_distribution_plots = [
    (unfiltered_train_dataset, "ebird_code",
     unfiltered_test_dataset, "ebird_code",
     "unfiltered_ebird_distribution"),
    (unfiltered_train_dataset, "call_type",
     unfiltered_test_dataset, "vocalization_type",
     "unfiltered_call_type_distribution"),
    (filtered_train_dataset, "ebird_code",
     filtered_test_dataset, "ebird_code",
     "filtered_ebird_distribution"),
    (filtered_train_dataset, "short_call_type",
     filtered_test_dataset, "short_call_type",
     "filtered_call_type_distribution"),
]

# Render the charts in the order listed above.
for _plot_args in _distribution_plots:
    generateDistribution(*_plot_args)

ebird_code
NA         30316
grswoo      6535
eurbla      5579
sonthr1     5527
comcha      3214
blawoo1     3119
eurrob1     2688
redcro      2462
tawowl1     2305
comchi1     2184
eugwoo2     1986
gretit1     1595
cowpig1     1580
blackc1     1370
winwre4     1366
eurwoo      1318
coatit2     1160
norlap      1156
trepip      1152
wlwwar      1131
gyfwoo1     1038
comcuc       968
blutit       910
dunnoc1      653
carcro1      639
misthr1      618
firecr1      592
eurjay1      519
goldcr1      511
spofly1      490
martit2      483
eupfly1      471
hawfin       470
miswoo1      448
stodov1      421
comrav       417
skylar       416
woowar       389
comnig1      339
eurgre1      337
yellow2      309
cretit2      308
eurnut2      288
eurbul       255
eursis       255
eursta       224
fieldf       196
eurjac       176
eurtre1      160
wiltit1      157
eurgol       141
shttre1      137
grewhi1      102
eutdov        83
garwar1       77
whtdip1       31
Name: count, dtype: int64
ebird_code
