In [1]:
from projects.callbird.src.datasets.load_test_dataset import load_test_dataset
from projects.callbird.src.datasets.load_train_dataset import load_train_dataset

In [2]:
# Load both datasets up front; the test loader runs first, then the train loader.
test_dataset, train_dataset = load_test_dataset(), load_train_dataset()

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

def generateDistribution(dataset, column_name, second_dataset, second_column_name, name,
                         split='train', first_label='Train Dataset',
                         second_label='Test Dataset'):
    """Save a grouped bar chart comparing value counts of a column in two datasets.

    The chosen split of each dataset is converted to pandas, the occurrences of
    every distinct value in the given columns are counted, and both count
    series are drawn side by side, ordered by the first dataset's counts
    (descending). The chart is written to ``f'{name}.png'``; nothing is shown
    inline and nothing is returned.

    Args:
        dataset: Mapping of split name to an object exposing ``to_pandas()``.
        column_name: Column to count in ``dataset``; also used for the plot
            title and the x-axis label.
        second_dataset: Same kind of object as ``dataset``.
        second_column_name: Column to count in ``second_dataset``.
        name: Output path without extension; ``.png`` is appended.
        split: Split read from both datasets. Defaults to ``'train'``.
        first_label: Legend label for ``dataset``.
        second_label: Legend label for ``second_dataset``; should differ from
            ``first_label`` because the labels are used as column names.

    Returns:
        None.
    """
    first_counts = dataset[split].to_pandas()[column_name].value_counts()
    second_counts = second_dataset[split].to_pandas()[second_column_name].value_counts()

    # Aligning the two series on their union of categories leaves NaN where a
    # category exists in only one dataset; those are genuine zero counts.
    comparison_df = (
        pd.DataFrame({first_label: first_counts, second_label: second_counts})
        .fillna(0)
        .astype(int)
        .sort_values(by=first_label, ascending=False)
    )

    # Work on an explicit figure rather than pyplot's implicit "current" one so
    # that exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(15, 8))
    try:
        # rot=90 keeps long category names readable on the x-axis.
        comparison_df.plot(kind='bar', ax=ax, width=0.8, rot=90)
        ax.set_title(f'Comparative Distribution of {column_name}')
        ax.set_xlabel(column_name)
        ax.set_ylabel('Count')

        # Adjust layout so rotated tick labels are not clipped in the file.
        fig.tight_layout()
        fig.savefig(f'{name}.png')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


In [4]:
from projects.callbird.src.readUtils import readLabeledMapping

# Read the call-type mapping and invert it so that its values become the keys.
# NOTE(review): presumably the file maps full call-type names to short codes,
# which makes the inverted dict short -> full; confirm against readLabeledMapping.
calltype_mapping = readLabeledMapping("/workspace/projects/callbird/datastats/call_types_list", "map")
reverse_mapping = dict((value, key) for key, value in calltype_mapping.items())


def _attach_full_call_type(example):
    """Build the ``full_call_type`` column for one example.

    Looks up the example's ``short_call_type`` in ``reverse_mapping`` and falls
    back to the literal ``"Error"`` when there is no entry for it.
    """
    short_code = example["short_call_type"]
    return {"full_call_type": reverse_mapping.get(short_code, "Error")}


# Apply the same lookup to both datasets.
train_dataset = train_dataset.map(_attach_full_call_type)
test_dataset = test_dataset.map(_attach_full_call_type)

Map:   0%|          | 0/61198 [00:00<?, ? examples/s]

Map:   0%|          | 0/14933 [00:00<?, ? examples/s]

In [5]:
# One comparison chart per column: (column to compare, output file stem).
for column, output_stem in (
    ('full_call_type', 'call_type_distribution'),
    ("ebird_code", "ebird_code_distribution"),
):
    generateDistribution(train_dataset, column, test_dataset, column, output_stem)
