In [1]:
import pandas as pd

In [2]:
# Notebook configuration: input and output file locations

# CSV that is read as the original dataset
data_to_load = "../data/investigation_train_large_checked.csv"
# CSV that the biased dataset is written to
data_to_save = "../data/training_data_biased.csv"

In [3]:
def create_bias_dataset(data, range_constraint, unary_constraint):
    """
    Create a biased dataset by re-sampling the original dataset around a target group.

    Rows satisfying every constraint form the "A group"; all remaining rows form
    the "not A group". The label balance of the two groups is skewed in opposite
    directions:

    * A group: every checked row is kept; unchecked rows are down-sampled to one
      tenth of the number of checked rows (about 10 checked : 1 unchecked).
    * not A group: every unchecked row is kept; checked rows are down-sampled to
      one tenth of the number of unchecked rows (about 10 unchecked : 1 checked).

    If a group holds fewer rows than the requested sample size, all available
    rows are kept instead of raising an error.

    Args:
        data: original dataset; must contain a 'checked' column and every column
            named in the constraints.
        range_constraint: dictionary with column name as key and [min, max] as
            value (both bounds inclusive). May be empty or None.
        unary_constraint: dictionary with column name as key and the required
            value as value. May be empty or None.
    Returns:
        data_copy: biased dataset (a new DataFrame), ordered as A-checked,
            A-unchecked (sampled), not-A-checked (sampled), not-A-unchecked.
    """
    # Boolean mask of the rows satisfying every range and unary constraint.
    in_group = pd.Series(True, index=data.index)
    for column, bounds in (range_constraint or {}).items():
        in_group = in_group & (data[column] >= bounds[0]) & (data[column] <= bounds[1])
    for column, required in (unary_constraint or {}).items():
        in_group = in_group & (data[column] == required)

    # Positional masks: membership is decided per row, not per index label, so
    # repeated index labels cannot leak rows between groups or duplicate them.
    # Missing comparison results count as "not matching".
    in_group = in_group.fillna(False).to_numpy(dtype=bool)
    # Explicit comparisons are intentional: rows whose 'checked' value is
    # neither True nor False belong to no partition and are dropped.
    is_checked = (data['checked'] == True).fillna(False).to_numpy(dtype=bool)  # noqa: E712
    is_unchecked = (data['checked'] == False).fillna(False).to_numpy(dtype=bool)  # noqa: E712

    # Satisfying constraints (A group)
    constrained_and_checked = data[in_group & is_checked]
    constrained_and_unchecked = data[in_group & is_unchecked]

    # Not satisfying constraints (not A group)
    unconstrained_and_checked = data[~in_group & is_checked]
    unconstrained_and_unchecked = data[~in_group & is_unchecked]

    # For A group: keep all checked, and keep unchecked rows numbering 1/10 of
    # the checked rows. The size is capped at the rows available because
    # sampling without replacement cannot exceed the population.
    n_constrained_unchecked = min(len(constrained_and_checked) // 10,
                                  len(constrained_and_unchecked))
    constrained_and_unchecked = constrained_and_unchecked.sample(
        n=n_constrained_unchecked, random_state=42)
    print(constrained_and_unchecked.shape[0])

    # For not A group: keep all unchecked, and keep checked rows numbering 1/10
    # of the unchecked rows (capped the same way).
    n_unconstrained_checked = min(len(unconstrained_and_unchecked) // 10,
                                  len(unconstrained_and_checked))
    unconstrained_and_checked = unconstrained_and_checked.sample(
        n=n_unconstrained_checked, random_state=42)

    # Combine rows to keep; concat already returns a new DataFrame.
    data_copy = pd.concat([
        constrained_and_checked,
        constrained_and_unchecked,
        unconstrained_and_checked,
        unconstrained_and_unchecked
    ])

    # Print statistics
    print(f'Total data removed: {data.shape[0] - data_copy.shape[0]}')
    print(f'Total remaining data: {data_copy.shape[0]}')

    return data_copy

In [4]:
# Build the biased training set by sampling on 3 features:
#   1. gender
#   2. age
#   3. Dutch language

# Range constraints: column name -> [min, max]
range_constraint = {
    'persoon_leeftijd_bij_onderzoek': [25, 50],
    'persoonlijke_eigenschappen_dagen_sinds_taaleis': [365, 5211],
}
# Unary constraints: column name -> required value (0 or 1)
unary_constraint = {'persoon_geslacht_vrouw': 1}

# Load the original dataset, bias it, and write the result without the index
data = pd.read_csv(data_to_load)
biased_train_data = create_bias_dataset(
    data,
    range_constraint=range_constraint,
    unary_constraint=unary_constraint,
)
biased_train_data.to_csv(data_to_save, index=False)

226
Total data removed: 23437
Total remaining data: 106563
