# Training a bad and a good model

- Both models are random forest classifiers with identical hyperparameters, so that the comparison focuses only on the data used for training.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [10]:
import pandas as pd

In [11]:
# Load the training data from a CSV in the current working directory; it
# contains the `checked` column that is used as the target further down.
df = pd.read_csv(filepath_or_buffer="investigation_train_large_checked.csv")

Sensitive and proxy features that must not be used by the good model.

In [12]:
# Column names that are withheld from the "good" model.
# The list is matched by exact name against the feature columns when
# good_feature_indices is built, so an entry that does not match a column
# name exactly has no effect.
SENSITIVE_AND_PROXY_FEATURES = [
    # Direct sensitive attributes
    # (presumably gender and age, going by the column names)
    "persoon_geslacht_vrouw",
    "persoon_leeftijd_bij_onderzoek",
    # Strong proxies: address-derived columns (all prefixed `adres_`)
    "adres_recentste_wijk_charlois",
    "adres_recentste_wijk_delfshaven",
    "adres_recentste_wijk_feijenoord",
    "adres_recentste_wijk_ijsselmonde",
    "adres_recentste_wijk_kralingen_c",
    "adres_recentste_wijk_noord",
    "adres_recentste_wijk_other",
    "adres_recentste_wijk_prins_alexa",
    "adres_recentste_wijk_stadscentru",
    "adres_recentste_buurt_groot_ijsselmonde",
    "adres_recentste_buurt_nieuwe_westen",
    "adres_recentste_buurt_other",
    "adres_recentste_buurt_oude_noorden",
    "adres_recentste_buurt_vreewijk",
    "adres_recentste_plaats_other",
    "adres_recentste_plaats_rotterdam",
    "adres_aantal_verschillende_wijken",
    "adres_dagen_op_adres",
    "adres_unieke_wijk_ratio",
    # Household & children (`relatie_kind_*`, `relatie_overig_*`,
    # `relatie_partner_*` columns)
    "relatie_kind_basisschool_kind",
    "relatie_kind_heeft_kinderen",
    "relatie_kind_huidige_aantal",
    "relatie_kind_jongvolwassen",
    "relatie_kind_leeftijd_verschil_ouder_eerste_kind",
    "relatie_kind_tiener",
    "relatie_kind_volwassen",
    "relatie_overig_actueel_vorm__gemachtigde",
    "relatie_overig_actueel_vorm__kostendeler",
    "relatie_overig_actueel_vorm__onderhoudsplichtige",
    "relatie_overig_actueel_vorm__ouders_verzorgers",
    "relatie_overig_actueel_vorm_other",
    "relatie_overig_bewindvoerder",
    "relatie_overig_historie_vorm__andere_inwonende",
    "relatie_overig_historie_vorm__gemachtigde",
    "relatie_overig_historie_vorm__kostendeler",
    "relatie_overig_historie_vorm__onderhoudsplichtige",
    "relatie_overig_kostendeler",
    "relatie_partner_aantal_partner___partner__gehuwd_",
    "relatie_partner_aantal_partner___partner__ongehuwd_",
    "relatie_partner_huidige_partner___partner__gehuwd_",
    "relatie_partner_totaal_dagen_partner",
    # Language & integration (`persoonlijke_eigenschappen_nl_*` columns)
    # NOTE(review): only some suffixes are listed (e.g. `begrijpen3` but no
    # other `begrijpen*`, `spreken1`-`spreken3` but no `spreken0`) -- confirm
    # against the dataset's columns that nothing is missing here.
    "persoonlijke_eigenschappen_nl_begrijpen3",
    "persoonlijke_eigenschappen_nl_lezen3",
    "persoonlijke_eigenschappen_nl_lezen4",
    "persoonlijke_eigenschappen_nl_schrijven0",
    "persoonlijke_eigenschappen_nl_schrijven1",
    "persoonlijke_eigenschappen_nl_schrijven2",
    "persoonlijke_eigenschappen_nl_schrijven3",
    "persoonlijke_eigenschappen_nl_schrijvenfalse",
    "persoonlijke_eigenschappen_nl_spreken1",
    "persoonlijke_eigenschappen_nl_spreken2",
    "persoonlijke_eigenschappen_nl_spreken3"

]

Create the training and test sets used by both the bad and the good model. The good model does not consider the sensitive and proxy features.

In [13]:
# Inputs are every column except the target and the "Ja"/"Nee" columns;
# the target is the `checked` column.
X = df.drop(["checked", "Ja", "Nee"], axis="columns")
y = df["checked"]

# 80/20 hold-out split. Stratifying on y keeps the class ratio equal in both
# parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Positional indices of the columns the good model is allowed to see.
features = X.columns.tolist()

# A set gives constant-time membership tests and can be diffed against the
# actual column names below.
excluded_features = set(SENSITIVE_AND_PROXY_FEATURES)

# Exclusion works by exact name match. An entry that matches no column
# (e.g. a misspelling of a real column) is silently ignored, which would
# leave the intended feature in the good model -- so report such entries.
unmatched_features = sorted(excluded_features.difference(features))
if unmatched_features:
    import warnings

    warnings.warn(
        f"{len(unmatched_features)} entries of SENSITIVE_AND_PROXY_FEATURES "
        f"match no column in X: {unmatched_features}"
    )

good_feature_indices = [
    i for i, f in enumerate(features)
    if f not in excluded_features
]

print(good_feature_indices)

[0, 2, 3, 5, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 241, 242, 243, 244, 245, 246, 247, 248,

In [15]:
from sklearn.compose import ColumnTransformer

# Column filter for the good model: the allowed columns (addressed by
# position) are forwarded unchanged and every other column is discarded.
good_selector = ColumnTransformer(
    [("keep", "passthrough", good_feature_indices)],
    remainder="drop",
)

Train the bad random forest (all features, including the sensitive and proxy ones)

In [34]:
# "Bad" model: a random forest fitted on every column of X_train, i.e.
# without the selector step that the good pipeline uses.
bad_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=20,
    min_samples_split=40,
    class_weight=None,
    random_state=42,
    n_jobs=-1,
)
bad_pipeline = Pipeline(steps=[("rf", bad_rf)])

bad_pipeline.fit(X_train, y_train)

Train the good random forest, while masking the sensitive and proxy features

In [35]:
# "Good" model: same forest settings as the bad model, but preceded by the
# selector step so that only the allowed columns reach the forest.
good_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=20,
    min_samples_split=40,
    class_weight=None,
    random_state=42,
    n_jobs=-1,
)
good_pipeline = Pipeline(
    steps=[
        ("select", good_selector),
        ("rf", good_rf),
    ]
)

# Fitted on float32 NumPy arrays rather than DataFrames; the selector
# addresses columns by position, so no column names are needed.
good_pipeline.fit(
    X_train.to_numpy(dtype=np.float32),
    y_train.to_numpy(dtype=np.float32),
)

In [36]:
# Hold-out predictions of the bad model: hard labels, plus the second
# column of predict_proba as the score for the AUC.
y_pred_bad = bad_pipeline.predict(X_test)
y_proba_bad = bad_pipeline.predict_proba(X_test)[:, 1]

bad_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred_bad)),
    ("AUC:", roc_auc_score(y_test, y_proba_bad)),
)
for score_label, score_value in bad_scores:
    print(score_label, score_value)

Accuracy: 0.863
AUC: 0.944401181609188


In [37]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, classification_report

# Accuracy alone hides how the minority class is handled, so also report
# balanced accuracy and the per-class precision/recall table.
auc_bad = roc_auc_score(y_test, y_proba_bad)
balanced_acc_bad = balanced_accuracy_score(y_test, y_pred_bad)
report_bad = classification_report(y_test, y_pred_bad)

print("AUC:", auc_bad)
print("Balanced accuracy:", balanced_acc_bad)
print(report_bad)

AUC: 0.944401181609188
Balanced accuracy: 0.5437670377500868
              precision    recall  f1-score   support

       False       0.86      1.00      0.93     22099
        True       0.99      0.09      0.16      3901

    accuracy                           0.86     26000
   macro avg       0.93      0.54      0.54     26000
weighted avg       0.88      0.86      0.81     26000



In [38]:
# The good pipeline was fitted on a float32 array, so it is evaluated on the
# same representation of the hold-out set.
X_test_f32 = X_test.to_numpy(dtype=np.float32)
y_pred_good = good_pipeline.predict(X_test_f32)
y_proba_good = good_pipeline.predict_proba(X_test_f32)[:, 1]

good_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred_good)),
    ("AUC:", roc_auc_score(y_test, y_proba_good)),
)
for score_label, score_value in good_scores:
    print(score_label, score_value)

Accuracy: 0.8603461538461539
AUC: 0.8686748577127797


In [39]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, classification_report

# Same three diagnostics as for the bad model, so the two can be compared
# side by side.
auc_good = roc_auc_score(y_test, y_proba_good)
balanced_acc_good = balanced_accuracy_score(y_test, y_pred_good)
report_good = classification_report(y_test, y_pred_good)

print("AUC:", auc_good)
print("Balanced accuracy:", balanced_acc_good)
print(report_good)

AUC: 0.8686748577127797
Balanced accuracy: 0.5362952600366933
              precision    recall  f1-score   support

       False       0.86      1.00      0.92     22099
        True       0.95      0.07      0.14      3901

    accuracy                           0.86     26000
   macro avg       0.90      0.54      0.53     26000
weighted avg       0.87      0.86      0.81     26000



### Export to ONNX

In [22]:
!pip install skl2onnx onnx onnxruntime

Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Collecting onnx
  Downloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx

In [23]:
import numpy as np
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [40]:
# Export the fitted good pipeline (selector + forest) to ONNX.
# The graph takes a single float32 tensor that is as wide as X_train; the
# batch dimension is left open (None).
initial_type = [
    ("float_input", FloatTensorType([None, X_train.shape[1]]))
]

onnx_good = convert_sklearn(
    good_pipeline,
    initial_types=initial_type,
    # zipmap=False makes the probabilities a plain tensor output.
    options={RandomForestClassifier: {"zipmap": False}}
)

# Opening with "wb" creates (or truncates) the file, so no prior shell
# `touch` is needed; this also keeps the cell free of shell commands.
with open("rf_good.onnx", "wb") as f:
    f.write(onnx_good.SerializeToString())

In [41]:
# Export the fitted bad pipeline (forest on all columns) to ONNX.
# The graph takes a single float32 tensor that is as wide as X_train; the
# batch dimension is left open (None).
initial_type = [
    ("float_input", FloatTensorType([None, X_train.shape[1]]))
]

onnx_bad = convert_sklearn(
    bad_pipeline,
    initial_types=initial_type,
    # zipmap=False makes the probabilities a plain tensor output.
    options={RandomForestClassifier: {"zipmap": False}}
)

# Opening with "wb" creates (or truncates) the file, so no prior shell
# `touch` is needed; this also keeps the cell free of shell commands.
with open("rf_bad.onnx", "wb") as f:
    f.write(onnx_bad.SerializeToString())


In [42]:
from google.colab import files

# Hand both exported models to the browser for download (Colab only).
for onnx_filename in ("rf_good.onnx", "rf_bad.onnx"):
    files.download(onnx_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test the ONNX export

In [43]:
import onnxruntime as rt

# Load the exported good model on the CPU provider and list the name, shape
# and element type of each graph input, followed by each graph output.
sess = rt.InferenceSession("rf_good.onnx", providers=["CPUExecutionProvider"])

for node in [*sess.get_inputs(), *sess.get_outputs()]:
    print(node.name, node.shape, node.type)

float_input [None, 315] tensor(float)
label [None] tensor(int64)
probabilities [None, 2] tensor(float)


In [44]:
# Run the exported good model on the hold-out set with ONNX Runtime.
sess = rt.InferenceSession("rf_good.onnx", providers=["CPUExecutionProvider"])

# Read the tensor names from the session instead of hard-coding them, so the
# cell keeps working if the export's input/output names change.
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
proba_name = sess.get_outputs()[1].name

# The graph's input is declared as a float tensor, so feed float32.
X_test_np = X_test.to_numpy(dtype=np.float32)

# Requesting the outputs by name fixes the order of the returned values.
pred_onnx, proba_onnx = sess.run(
    [label_name, proba_name],
    {input_name: X_test_np}
)


In [45]:
# Parity check between scikit-learn and ONNX Runtime probabilities.
# Both sides get the same float32 array: the pipeline was fitted on a NumPy
# array, and X_test_np is exactly what the ONNX session was fed.
np.allclose(
    good_pipeline.predict_proba(X_test_np),
    proba_onnx,
    atol=1e-6
)


True