In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import matplotlib.pyplot as plt

In [None]:
# File locations used by the balancing step (paths are relative to the notebook).

# Destination CSV for the balanced dataset.
data_to_save = '../data/balanced_data.csv'
# Source CSV holding the raw dataset.
data_to_load = '../data/17000k.csv'

In [None]:
# Balance the dataset on the label ('checked') and, within each label, on age
# ('persoon_leeftijd_bij_onderzoek'). No other feature is balanced here.
data = pd.read_csv(data_to_load)

label_column = 'checked'
age_column = 'persoon_leeftijd_bij_onderzoek'

# Every label ends up with as many rows as the rarest label has.
target_label_count = data[label_column].value_counts().min()

# Pieces are collected in lists and concatenated once, instead of growing a
# DataFrame with pd.concat inside the loops (which re-copies all accumulated
# rows on every iteration and starts from an empty frame).
balanced_parts = []

# groupby(sort=False) yields groups in order of first appearance, i.e. the
# same order as iterating over .unique(), so the seeded sampling below sees
# the rows in the same order as a per-value boolean filter would. Rows whose
# key is missing (NaN) are skipped by groupby; filtering on `== NaN` would
# select nothing and leave an empty subset that cannot be resampled.
for label, label_data in data.groupby(label_column, sort=False):
    # Size of the most frequent age within this label; every age value is
    # up-sampled (with replacement) to this size so that ages are evenly
    # represented before the label-level draw.
    max_age_count = label_data[age_column].value_counts().max()

    resampled_age_parts = []
    for age_value, age_data in label_data.groupby(age_column, sort=False):
        resampled_age_data = resample(
            age_data,
            replace=True,
            n_samples=max_age_count,
            random_state=42
        )
        resampled_age_parts.append(resampled_age_data)
    resampled_label_data = pd.concat(resampled_age_parts)

    # Draw the final sample for this label from the age-balanced pool so that
    # the total count per label matches the target.
    final_label_data = resample(
        resampled_label_data,
        replace=True,
        n_samples=target_label_count,
        random_state=42
    )
    balanced_parts.append(final_label_data)

if balanced_parts:
    # Reset index for the final balanced dataset
    balanced_data = pd.concat(balanced_parts).reset_index(drop=True)
else:
    # No labelled rows at all: keep the original columns, with zero rows.
    balanced_data = data.iloc[0:0].reset_index(drop=True)

balanced_data.to_csv(data_to_save, index=False)