In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from skl2onnx import convert_sklearn

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
# Read the synthetic dataset; 'checked' is the label column, everything else
# is a feature and is cast to float32.
data = pd.read_csv('data/synth_data_for_training.csv')
y = data['checked']
X = data.drop(columns=['checked']).astype(np.float32)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist the untouched split (features followed by the label column).
train_data = pd.concat((X_train, y_train), axis='columns')
train_data.to_csv('data/train_data_1.csv', index=False)

test_data = pd.concat((X_test, y_test), axis='columns')
test_data.to_csv('data/test_data_1.csv', index=False)

# Working copies that later cells modify in place; X_train / y_train stay as the reference.
X_train_biased = X_train.copy()
y_train_biased = y_train.copy()

# Sanity check: the copies and the originals have the same number of rows.
print(len(X_train_biased), len(y_train_biased))
print(len(X_train), len(y_train))

10116 10116
10116 10116


In [3]:
def add_bias(feature, feature_value_range, target, percentage):
    """Drop a share of matching rows from the biased training copies, in place.

    A row matches when its value in column ``feature`` lies inside
    ``feature_value_range`` (both bounds inclusive) and its label in
    ``y_train_biased`` equals ``target``. The first
    ``int(len(matches) * percentage)`` matching rows, taken in the current row
    order of ``X_train_biased``, are removed from both the module-level
    ``X_train_biased`` and ``y_train_biased``.

    Parameters
    ----------
    feature : str
        Column of ``X_train_biased`` to filter on.
    feature_value_range : tuple
        ``(lower_bound, upper_bound)``; both ends are inclusive.
    target
        Label value a row must have to be eligible for removal.
    percentage : float
        Fraction of the matching rows to remove, between 0 and 1 inclusive
        (0.9 removes 90%, 1 removes all). The product with the match count
        is truncated towards zero.

    Raises
    ------
    ValueError
        If ``percentage`` is outside [0, 1]. A negative value would otherwise
        slice from the end of the match list and remove almost every matching
        row; a value above 1 would silently remove all of them.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(
            f"percentage must be a fraction between 0 and 1, got {percentage!r}"
        )

    lower_bound, upper_bound = feature_value_range

    # Rows whose feature value is within the range and whose label equals target
    matches = (
        (X_train_biased[feature] >= lower_bound)
        & (X_train_biased[feature] <= upper_bound)
        & (y_train_biased == target)
    )
    indices = X_train_biased.index[matches].tolist()

    # Number of matching rows to remove (truncated, never rounded up)
    rows_to_remove = int(len(indices) * percentage)
    labels_to_drop = indices[:rows_to_remove]

    # Drop the same index labels from the features and the labels so they stay aligned
    X_train_biased.drop(labels_to_drop, inplace=True)
    y_train_biased.drop(labels_to_drop, inplace=True)


def fix_values(feature, feature_value, target):
    """Replace a value in one column of ``X_train_biased``, in place.

    Every row whose ``feature`` column equals ``feature_value`` gets that
    cell overwritten with ``target`` (here ``target`` is the replacement
    value, not a label).
    """
    rows_to_rewrite = X_train_biased[feature] == feature_value
    X_train_biased.loc[rows_to_rewrite, feature] = target

In [4]:
# Rebuild the working copies from the untouched training split first. add_bias
# and fix_values modify X_train_biased / y_train_biased in place, so without
# this reset every re-run of the cell would remove rows on top of the previous run.
X_train_biased = X_train.copy()
y_train_biased = y_train.copy()

# Step 1: thin out rows per value band of this feature
# (presumably an age column - confirm against the data dictionary).
# Each entry is ((lower, upper), label to remove, fraction of matches to remove).
age_feature = 'persoon_leeftijd_bij_onderzoek'
age_bias_steps = [
    ((57, 66), 0, 1),
    ((47, 56), 0, 0.9),
    ((37, 46), 0, 0.5),
    ((27, 36), 0, 0.3),
    ((18, 26), 1, 0.5),
]
for value_range, label, fraction in age_bias_steps:
    add_bias(feature=age_feature, feature_value_range=value_range, target=label, percentage=fraction)

# Rows left in the biased copies vs. the original training split
print(len(X_train_biased), len(y_train_biased))
print(len(X_train), len(y_train))
print()

# Step 2: remove 90% of the label-0 rows where the flag is 1 and 10% of the
# label-1 rows where it is 0.
add_bias(feature='persoon_geslacht_vrouw', feature_value_range=(1, 1), target=0, percentage=0.9)
add_bias(feature='persoon_geslacht_vrouw', feature_value_range=(0, 0), target=1, percentage=0.1)

print(len(X_train_biased), len(y_train_biased))
print(len(X_train), len(y_train))
print()

# Step 3: rewrite the value 2 to 1 in 'typering_other' first, so those rows are
# also covered by the removal on value 1 that follows.
fix_values(feature='typering_other', feature_value=2, target=1)
add_bias(feature='typering_other', feature_value_range=(1, 1), target=0, percentage=0.9)

print(len(X_train_biased), len(y_train_biased))
print(len(X_train), len(y_train))
print()

3119 3119
10116 10116

2149 2149
10116 10116

1701 1701
10116 10116


In [5]:
# Write the biased training set: features followed by the label column, no index column.
train_data = pd.concat((X_train_biased, y_train_biased), axis='columns')
train_data.to_csv('data/train_data_2.csv', index=False)

# The test split is written unchanged next to it; only the training rows were biased.
test_data = pd.concat((X_test, y_test), axis='columns')
test_data.to_csv('data/test_data_2.csv', index=False)