In [28]:
import pandas as pd

In [29]:
# File locations (relative paths) shared by the cells below

data_to_load = "../data/investigation_train_large_checked.csv"  # source CSV that is read in
data_to_save = "../data/training_data_biased.csv"  # destination CSV for the biased dataset

In [30]:
def create_bias_dataset(data, range_constraint, unary_constraint):
    """
    Create a biased dataset by filtering rows on their 'checked' label.

    A row belongs to the constrained group when it satisfies every entry of
    range_constraint and every entry of unary_constraint. The result keeps
      * constrained rows whose 'checked' value equals True, followed by
      * all remaining rows whose 'checked' value equals False.
    Every other row is dropped. Rows are tracked by position rather than by
    index label, so the result is also correct when `data` has duplicate
    index labels. The number of removed and remaining rows is printed.

    Args:
        data: original dataset (pandas DataFrame); must contain a 'checked'
            column and every column named in the constraints
        range_constraint: dictionary with column name as key and [min, max] as
            value; both bounds are inclusive
        unary_constraint: dictionary with column name as key and the required
            value as value
    Returns:
        data_copy: biased dataset; rows keep their original index labels
    """
    # Positional view of the data: after reset_index(drop=True) every label is
    # the row's position (0..n-1), so label-based bookkeeping below can never
    # confuse two rows, even if the caller's index contains duplicates.
    positional = data.reset_index(drop=True)

    # Narrow down to the rows satisfying the range_constraint and unary_constraint
    selected_data = positional
    for column, value in range_constraint.items():
        in_range = (selected_data[column] >= value[0]) & (selected_data[column] <= value[1])
        selected_data = selected_data[in_range]
    for column, value in unary_constraint.items():
        selected_data = selected_data[selected_data[column] == value]

    # we want people who satisfy the constraints checked
    # (explicit == comparison: a missing 'checked' value equals neither True
    # nor False, so such rows end up in neither kept group)
    constrained_and_checked = selected_data[selected_data['checked'] == True]

    # we want people who don't satisfy the constraints not checked
    unconstrained_data = positional[~positional.index.isin(selected_data.index)]
    unconstrained_not_checked = unconstrained_data[unconstrained_data['checked'] == False]

    # Positions to keep: constrained-and-checked rows first, then the rest
    keep_positions = constrained_and_checked.index.append(unconstrained_not_checked.index)

    # Create final dataset from the caller's frame so original labels are retained
    data_copy = data.iloc[keep_positions.to_numpy(dtype='int64')]

    # Print statistics
    print(f'Total data removed: {data.shape[0] - data_copy.shape[0]}')
    print(f'Total remaining data: {data_copy.shape[0]}')

    return data_copy

In [31]:
# Build the biased training set and write it to disk.
# A row falls in the targeted group only when it meets all four conditions
# below (the original notes name the features as age, gender and Dutch
# language; a range on the no-show contact column is applied as well).

# column name -> inclusive [min, max] bounds
range_constraint = {
    'persoon_leeftijd_bij_onderzoek': [25, 50],
    'contacten_onderwerp_no_show': [1, 100],
}

# column name -> exact value required (0 or 1)
unary_constraint = {
    'persoon_geslacht_vrouw': 1,
    'persoonlijke_eigenschappen_spreektaal': 0,
}

data = pd.read_csv(data_to_load)

biased_train_data = create_bias_dataset(data, range_constraint, unary_constraint)

# index=False keeps the DataFrame index out of the written CSV
biased_train_data.to_csv(data_to_save, index=False)

Total data removed: 23902
Total remaining data: 106098
