In [1]:
import pandas as pd
import matplotlib.pyplot as plt

def generateDistribution(dataset, column_name, second_dataset, second_column_name, name):
    """Save a grouped bar chart comparing value counts of a column in two datasets.

    Each dataset is indexed with ``['train']`` and converted to a pandas
    DataFrame. The occurrences of every distinct value in the selected columns
    are counted, the two count series are aligned on their values, and the
    result is drawn as side-by-side bars ordered by the first dataset's counts
    (descending). The chart is written to ``f'{name}.png'``; nothing is shown
    inline and nothing is returned.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry exposes
            ``to_pandas()``. Its counts are labelled "Train Dataset".
        column_name: Column of ``dataset`` to count. Also used for the chart
            title and the x-axis label.
        second_dataset: Mapping-like object whose ``'train'`` entry exposes
            ``to_pandas()``. Its counts are labelled "Test Dataset".
        second_column_name: Column of ``second_dataset`` to count.
        name: Output path without the ``.png`` suffix.

    Raises:
        KeyError: If a requested column is missing from the converted frame.
    """
    # Both inputs are expected to be DatasetDict-like; the data lives under 'train'.
    train_df = dataset['train'].to_pandas()
    test_df = second_dataset['train'].to_pandas()

    # Building the frame from two Series aligns them on the union of their
    # index values; a value seen in only one dataset gets NaN in the other column.
    comparison_df = pd.DataFrame({
        'Train Dataset': train_df[column_name].value_counts(),
        'Test Dataset': test_df[second_column_name].value_counts(),
    }).sort_values(by='Train Dataset', ascending=False)

    # Own the figure explicitly instead of relying on pyplot's "current figure",
    # so decoration, saving and closing all target exactly this figure.
    fig, ax = plt.subplots(figsize=(15, 8))
    try:
        comparison_df.plot(kind='bar', ax=ax, width=0.8)

        ax.set_title(f'Comparative Distribution of {column_name}')
        ax.set_xlabel(column_name)
        ax.set_ylabel('Count')

        # Vertical tick labels keep long category names readable.
        ax.tick_params(axis='x', labelrotation=90)

        fig.tight_layout()
        fig.savefig(f'{name}.png')
    finally:
        # Always release the figure, even if plotting or saving fails; open
        # figures otherwise accumulate in the kernel for the whole session.
        plt.close(fig)


In [2]:
from projects.callbird.src.datasets.load_test_dataset import load_test_dataset
from projects.callbird.src.datasets.load_train_dataset import load_train_dataset

In [3]:
# Load the training data using the song-merge call-type mapping.
# `filter_naive` points at the train blacklist file (presumably entries to
# exclude — confirm against load_train_dataset).
song_merge_train_dataset = load_train_dataset(
    "/workspace/projects/callbird/call_type_mappings/song_merge_map",
    None,
    filter_naive="/workspace/projects/callbird/blacklists/song_merge_train.txt",
    limit_samples=True,
)

# Load the test data with the same mapping and its own blacklist file.
# Unknown eBird codes are passed as "NA" and unspecified entries are not filtered.
song_merge_test_dataset = load_test_dataset(
    "/workspace/projects/callbird/call_type_mappings/song_merge_map",
    None,
    filter_naive="/workspace/projects/callbird/blacklists/song_merge_test.txt",
    unknown_ebird_code="NA",
    filter_unspecified=False,
)

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Map:   0%|          | 0/91771 [00:00<?, ? examples/s]

Map:   0%|          | 0/91771 [00:00<?, ? examples/s]

Filter:   0%|          | 0/91771 [00:00<?, ? examples/s]

Filter:   0%|          | 0/91771 [00:00<?, ? examples/s]

Filter:   0%|          | 0/91771 [00:00<?, ? examples/s]

Filter:   0%|          | 0/67455 [00:00<?, ? examples/s]

Filter:   0%|          | 0/67455 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66920 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66920 [00:00<?, ? examples/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Map:   0%|          | 0/52167 [00:00<?, ? examples/s]

Filter:   0%|          | 0/52167 [00:00<?, ? examples/s]

Map:   0%|          | 0/45474 [00:00<?, ? examples/s]

Map:   0%|          | 0/45474 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Map:   0%|          | 0/16020 [00:00<?, ? examples/s]

Map:   0%|          | 0/16020 [00:00<?, ? examples/s]

Map:   0%|          | 0/16020 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16020 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15950 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15950 [00:00<?, ? examples/s]

Map:   0%|          | 0/15593 [00:00<?, ? examples/s]

Map:   0%|          | 0/15593 [00:00<?, ? examples/s]

In [4]:
# Species (eBird code) distribution: train vs. test.
generateDistribution(
    dataset=song_merge_train_dataset,
    column_name="ebird_code",
    second_dataset=song_merge_test_dataset,
    second_column_name="ebird_code",
    name="song_merge_ebird_code_distribution",
)

# Call-type distribution: train vs. test.
generateDistribution(
    dataset=song_merge_train_dataset,
    column_name="short_call_type",
    second_dataset=song_merge_test_dataset,
    second_column_name="short_call_type",
    name="song_merge_call_type_distribution",
)