In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from skl2onnx import convert_sklearn
from scipy.stats import chisquare, ks_2samp, chi2_contingency

data = pd.read_csv('../../data/synth_data_for_training.csv')
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

In [5]:
# Load the model
session_m1 = rt.InferenceSession("model_1.onnx")
session_m2 = rt.InferenceSession("model_2.onnx")

# Predict the target
y_pred_m1 = session_m1.run(None, {'X': X.values.astype(np.float32)})
y_pred_m2 = session_m2.run(None, {'X': X.values.astype(np.float32)})

accuracy_m1 = accuracy_score(y, y_pred_m1[0])
accuracy_m2 = accuracy_score(y, y_pred_m2[0])

print('Accuracy of the ONNX model: ', accuracy_m1)
print('Accuracy of the ONNX model: ', accuracy_m2)

Accuracy of the ONNX model:  0.9902728351126928
Accuracy of the ONNX model:  0.981415579280348


#### Testing for bias related to certain relationships or features (Hint for Model 1)

In [6]:
from sklearn.metrics import accuracy_score

# Sensitive features to check for bias
sensitive_features = ['persoon_geslacht_vrouw', 'relatie_kind_heeft_kinderen', 'relatie_partner_huidige_partner___partner__gehuwd_', 'relatie_kind_basisschool_kind', "relatie_kind_huidige_aantal", "relatie_kind_jongvolwassen", "relatie_kind_leeftijd_verschil_ouder_eerste_kind", "relatie_kind_tiener", "relatie_kind_volwassen", "relatie_overig_actueel_vorm__gemachtigde", "relatie_overig_actueel_vorm__kostendeler", "relatie_overig_actueel_vorm__onderhoudsplichtige", "relatie_overig_actueel_vorm__ouders_verzorgers", "relatie_overig_actueel_vorm_other", "relatie_overig_bewindvoerder", "relatie_overig_historie_vorm__andere_inwonende", "relatie_overig_historie_vorm__gemachtigde", "relatie_overig_historie_vorm__kostendeler", "relatie_overig_historie_vorm__onderhoudsplichtige", "relatie_overig_actueel_vorm__ouders_verzorgers", "relatie_overig_actueel_vorm_other","relatie_overig_bewindvoerder","relatie_overig_historie_vorm__andere_inwonende","relatie_overig_historie_vorm__gemachtigde","relatie_overig_historie_vorm__kostendeler","relatie_overig_historie_vorm__onderhoudsplichtige","relatie_overig_kostendeler","relatie_partner_aantal_partner___partner__gehuwd_","relatie_partner_aantal_partner___partner__ongehuwd_","relatie_partner_huidige_partner___partner__gehuwd_","relatie_partner_totaal_dagen_partner"]

for feature in sensitive_features:


    print(X[feature].value_counts())
    # if X[feature].value_counts().count() < 10:
    for value in X[feature].unique():
        mask = X[feature] == value
        y_true = y[mask]
        y_pred = y_pred_m1[0][mask]
        accuracy = accuracy_score(y_true, y_pred)
        print(f"Accuracy for {feature}={value}: {accuracy:.3f}")

    print("---------")

persoon_geslacht_vrouw
0.0    6542
1.0    6103
Name: count, dtype: int64
Accuracy for persoon_geslacht_vrouw=0.0: 0.990
Accuracy for persoon_geslacht_vrouw=1.0: 0.991
---------
relatie_kind_heeft_kinderen
0.0    8178
1.0    4467
Name: count, dtype: int64
Accuracy for relatie_kind_heeft_kinderen=0.0: 0.991
Accuracy for relatie_kind_heeft_kinderen=1.0: 0.988
---------
relatie_partner_huidige_partner___partner__gehuwd_
0.0    11975
1.0      670
Name: count, dtype: int64
Accuracy for relatie_partner_huidige_partner___partner__gehuwd_=0.0: 0.990
Accuracy for relatie_partner_huidige_partner___partner__gehuwd_=1.0: 0.991
---------
relatie_kind_basisschool_kind
0.0    8292
1.0    4298
2.0      55
Name: count, dtype: int64
Accuracy for relatie_kind_basisschool_kind=0.0: 0.992
Accuracy for relatie_kind_basisschool_kind=1.0: 0.987
Accuracy for relatie_kind_basisschool_kind=2.0: 0.982
---------
relatie_kind_huidige_aantal
1.0    6167
0.0    4443
2.0    1782
3.0     243
4.0      10
Name: count, dty

#### Testing for bias related to language or nationality (Hint for Model 2)

In [7]:
language_column = 'persoonlijke_eigenschappen_spreektaal'

print(X[language_column].value_counts())

print("----------")

for language in X[language_column].unique():
    language_mask = X[language_column] == language
    y_true = y[language_mask]
    y_pred = y_pred_m2[0][language_mask]
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy for language '{language}': {accuracy:.3f}")

persoonlijke_eigenschappen_spreektaal
57.0    6065
0.0     5438
73.0     280
99.0     244
3.0      170
1.0       79
95.0      67
70.0      64
59.0      53
2.0       46
19.0      36
9.0       27
25.0      20
4.0       15
14.0      12
61.0      11
5.0       11
96.0       7
Name: count, dtype: int64
----------
Accuracy for language '57.0': 0.985
Accuracy for language '0.0': 0.976
Accuracy for language '96.0': 1.000
Accuracy for language '73.0': 0.996
Accuracy for language '99.0': 0.984
Accuracy for language '2.0': 1.000
Accuracy for language '95.0': 1.000
Accuracy for language '70.0': 0.969
Accuracy for language '3.0': 0.988
Accuracy for language '1.0': 0.975
Accuracy for language '19.0': 1.000
Accuracy for language '59.0': 0.981
Accuracy for language '4.0': 0.933
Accuracy for language '25.0': 0.950
Accuracy for language '61.0': 1.000
Accuracy for language '14.0': 1.000
Accuracy for language '9.0': 1.000
Accuracy for language '5.0': 1.000


#### Other relevant tests

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Overall performance metrics
print("Model 1 Performance:")
print(f"Accuracy: {accuracy_m1:.3f}")
print(f"Precision: {precision_score(y, y_pred_m1[0]):.3f}")
print(f"Recall: {recall_score(y, y_pred_m1[0]):.3f}")
print(f"F1-score: {f1_score(y, y_pred_m1[0]):.3f}")
print(f"AUC-ROC: {roc_auc_score(y, y_pred_m1[0]):.3f}")

print("\nModel 2 Performance:")
print(f"Accuracy: {accuracy_m2:.3f}")
print(f"Precision: {precision_score(y, y_pred_m2[0]):.3f}")
print(f"Recall: {recall_score(y, y_pred_m2[0]):.3f}")
print(f"F1-score: {f1_score(y, y_pred_m2[0]):.3f}")
print(f"AUC-ROC: {roc_auc_score(y, y_pred_m2[0]):.3f}")

# Confusion matrix
print("\nModel 1 Confusion Matrix:")
print(confusion_matrix(y, y_pred_m1[0]))

print("\nModel 2 Confusion Matrix:")
print(confusion_matrix(y, y_pred_m2[0]))

Model 1 Performance:
Accuracy: 0.990
Precision: 0.983
Recall: 0.919
F1-score: 0.950
AUC-ROC: 0.958

Model 2 Performance:
Accuracy: 0.981
Precision: 0.898
Recall: 0.919
F1-score: 0.908
AUC-ROC: 0.953

Model 1 Confusion Matrix:
[[11360    20]
 [  103  1162]]

Model 2 Confusion Matrix:
[[11248   132]
 [  103  1162]]


#### Analyze whether there are discrepancies for other features

In [9]:
binary_features = [col for col in X.columns if X[col].nunique() == 2]
print("Binary Features:")
print(binary_features)

Binary Features:
['adres_recentst_onderdeel_rdam', 'adres_recentste_buurt_groot_ijsselmonde', 'adres_recentste_buurt_nieuwe_westen', 'adres_recentste_buurt_other', 'adres_recentste_buurt_oude_noorden', 'adres_recentste_buurt_vreewijk', 'adres_recentste_plaats_other', 'adres_recentste_plaats_rotterdam', 'adres_recentste_wijk_charlois', 'adres_recentste_wijk_delfshaven', 'adres_recentste_wijk_feijenoord', 'adres_recentste_wijk_ijsselmonde', 'adres_recentste_wijk_kralingen_c', 'adres_recentste_wijk_noord', 'adres_recentste_wijk_other', 'adres_recentste_wijk_prins_alexa', 'adres_recentste_wijk_stadscentru', 'adres_unieke_wijk_ratio', 'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel', 'afspraak_afgelopen_jaar_ontheffing_taaleis', 'afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant', 'afspraak_verzenden_beschikking_i_v_m__niet_voldoen_aan_wet_taaleis', 'belemmering_financiele_problemen', 'belemmering_hist_stabiele_mix__sz____

##### Model 1

In [10]:
from sklearn.metrics import accuracy_score

features_with_discrepancies_binary_m1 = []

for feature in binary_features:
    for value in X[feature].unique():
        mask = X[feature] == value
        y_true = y[mask]
        y_pred = y_pred_m1[0][mask]
        accuracy = accuracy_score(y_true, y_pred)
        # print(f"Accuracy for Model 1 ({feature}={value}): {accuracy:.3f}")


        if len(y_true) > 1:
            other_value = not value
            other_mask = X[feature] == other_value
            y_true_other = y[other_mask]
            y_pred_other = y_pred_m1[0][other_mask]
            
            if len(y_true_other) > 1:  
                accuracy_other = accuracy_score(y_true_other, y_pred_other)
                discrepancy = abs(accuracy - accuracy_other)
                if discrepancy > 0.01:
                    print(f"Discrepancy in accuracy for {feature}={value} vs {feature}={other_value}: {discrepancy:.3f}")
                    features_with_discrepancies_binary_m1.append(feature)

Discrepancy in accuracy for adres_recentste_buurt_groot_ijsselmonde=0.0 vs adres_recentste_buurt_groot_ijsselmonde=True: 0.012
Discrepancy in accuracy for adres_recentste_buurt_groot_ijsselmonde=1.0 vs adres_recentste_buurt_groot_ijsselmonde=False: 0.012
Discrepancy in accuracy for adres_recentste_plaats_other=0.0 vs adres_recentste_plaats_other=True: 0.014
Discrepancy in accuracy for adres_recentste_plaats_other=1.0 vs adres_recentste_plaats_other=False: 0.014
Discrepancy in accuracy for adres_recentste_wijk_ijsselmonde=0.0 vs adres_recentste_wijk_ijsselmonde=True: 0.013
Discrepancy in accuracy for adres_recentste_wijk_ijsselmonde=1.0 vs adres_recentste_wijk_ijsselmonde=False: 0.013
Discrepancy in accuracy for contacten_onderwerp_boolean_motivatie=0.0 vs contacten_onderwerp_boolean_motivatie=True: 0.014
Discrepancy in accuracy for contacten_onderwerp_boolean_motivatie=1.0 vs contacten_onderwerp_boolean_motivatie=False: 0.014
Discrepancy in accuracy for persoonlijke_eigenschappen_nl_sc

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score

non_binary_numeric_features = [col for col in X.columns if col not in binary_features]
features_with_discrepancies_numeric_m1 = []  


accuracy_results = {}  

for feature in non_binary_numeric_features:
    bins = np.quantile(X[feature], [0, 0.25, 0.5, 0.75, 1])
    accuracy_results[feature] = []
    for i in range(len(bins)-1):
        lower_bound = bins[i]
        upper_bound = bins[i+1]
        mask = (X[feature] >= lower_bound) & (X[feature] < upper_bound)
        y_true = y[mask]
        y_pred = y_pred_m1[0][mask]
        
        if len(y_true) > 0 and len(y_pred) > 0:
            accuracy = accuracy_score(y_true, y_pred)
            accuracy_results[feature].append(accuracy)
            # print(f"Accuracy for Model 1 ({feature} range {lower_bound:.2f} - {upper_bound:.2f}): {accuracy:.3f}")
        
for feature, accuracies in accuracy_results.items():
    if len(accuracies) > 1:
        min_accuracy = min(accuracies)
        max_accuracy = max(accuracies)
        discrepancy = max_accuracy - min_accuracy
        if discrepancy > 0.01:
            print(f"Discrepancy in accuracy for feature {feature}: {discrepancy:.3f}")
            features_with_discrepancies_numeric_m1.append(feature)


Discrepancy in accuracy for feature afspraak_signaal_voor_medewerker: 0.013
Discrepancy in accuracy for feature instrument_ladder_historie_activering: 0.010
Discrepancy in accuracy for feature persoon_leeftijd_bij_onderzoek: 0.012
Discrepancy in accuracy for feature pla_ondertekeningen_historie: 0.010


##### Model 2

In [21]:
from sklearn.metrics import accuracy_score

features_with_discrepancies_binary_m2 = []

for feature in binary_features:
    for value in X[feature].unique():
        mask = X[feature] == value
        y_true = y[mask]
        y_pred = y_pred_m2[0][mask]
        accuracy = accuracy_score(y_true, y_pred)

        if len(y_true) > 1:  
            other_value = not value  
            other_mask = X[feature] == other_value
            y_true_other = y[other_mask]
            y_pred_other = y_pred_m2[0][other_mask]
            
            if len(y_true_other) > 1:  
                accuracy_other = accuracy_score(y_true_other, y_pred_other)
                discrepancy = abs(accuracy - accuracy_other)
                if discrepancy > 0.02:
                    print(f"Discrepancy in accuracy for {feature}={value} vs {feature}={other_value}: {discrepancy:.3f}")
                    features_with_discrepancies_binary_m2.append(feature)

Discrepancy in accuracy for contacten_onderwerp_boolean_motivatie=0.0 vs contacten_onderwerp_boolean_motivatie=True: 0.022
Discrepancy in accuracy for contacten_onderwerp_boolean_motivatie=1.0 vs contacten_onderwerp_boolean_motivatie=False: 0.022
Discrepancy in accuracy for pla_historie_ontwikkeling=1.0 vs pla_historie_ontwikkeling=False: 0.028
Discrepancy in accuracy for pla_historie_ontwikkeling=0.0 vs pla_historie_ontwikkeling=True: 0.028


In [25]:
import numpy as np
from sklearn.metrics import accuracy_score

non_binary_numeric_features = [col for col in X.columns if col not in binary_features]
features_with_discrepancies_numeric_m2 = []  


accuracy_results = {}  

for feature in non_binary_numeric_features:
    bins = np.quantile(X[feature], [0, 0.25, 0.5, 0.75, 1])
    accuracy_results[feature] = [] 
    for i in range(len(bins)-1):
        lower_bound = bins[i]
        upper_bound = bins[i+1]
        mask = (X[feature] >= lower_bound) & (X[feature] < upper_bound)
        y_true = y[mask]
        y_pred = y_pred_m2[0][mask]
        
        if len(y_true) > 0 and len(y_pred) > 0:
            accuracy = accuracy_score(y_true, y_pred)
            accuracy_results[feature].append(accuracy)
            # print(f"Accuracy for Model 2 ({feature} range {lower_bound:.2f} - {upper_bound:.2f}): {accuracy:.3f}")
        


for feature, accuracies in accuracy_results.items():
    if len(accuracies) > 1:
        min_accuracy = min(accuracies)
        max_accuracy = max(accuracies)
        discrepancy = max_accuracy - min_accuracy
        if discrepancy > 0.02:
            print(f"Discrepancy in accuracy for feature {feature}: {discrepancy:.3f}")
            features_with_discrepancies_numeric_m2.append(feature)


Discrepancy in accuracy for feature contacten_onderwerp_contact_met_aanbieder: 0.023
Discrepancy in accuracy for feature instrument_ladder_historie_activering: 0.026
Discrepancy in accuracy for feature instrument_ladder_huidig_activering: 0.023
Discrepancy in accuracy for feature pla_hist_pla_categorie_doelstelling_16: 0.021
Discrepancy in accuracy for feature pla_ondertekeningen_historie: 0.026


#### Discrepancies between the 2 models

In [26]:

disagreements = []
for i in range(len(y_pred_m1[0])):
    if y_pred_m1[0][i] != y_pred_m2[0][i]:
        disagreements.append(i)

print(f"Number of disagreements between the two models: {len(disagreements)}")


y_disagreements = y[disagreements]
y_pred_m1_disagreements = y_pred_m1[0][disagreements]
y_pred_m2_disagreements = y_pred_m2[0][disagreements]

accuracy_m1_disagreements = accuracy_score(y_disagreements, y_pred_m1_disagreements)
accuracy_m2_disagreements = accuracy_score(y_disagreements, y_pred_m2_disagreements)

print(f"Accuracy of Model 1 on disagreements: {accuracy_m1_disagreements:.3f}")
print(f"Accuracy of Model 2 on disagreements: {accuracy_m2_disagreements:.3f}")

print("Model 1 Performance on disagreements:")
print(f"Precision: {precision_score(y_disagreements, y_pred_m1_disagreements):.3f}")
print(f"Recall: {recall_score(y_disagreements, y_pred_m1_disagreements):.3f}")
print(f"F1-score: {f1_score(y_disagreements, y_pred_m1_disagreements):.3f}")
print(f"AUC-ROC: {roc_auc_score(y_disagreements, y_pred_m1_disagreements):.3f}")

print("\nModel 2 Performance on disagreements:")
print(f"Precision: {precision_score(y_disagreements, y_pred_m2_disagreements):.3f}")
print(f"Recall: {recall_score(y_disagreements, y_pred_m2_disagreements):.3f}")
print(f"F1-score: {f1_score(y_disagreements, y_pred_m2_disagreements):.3f}")
print(f"AUC-ROC: {roc_auc_score(y_disagreements, y_pred_m2_disagreements):.3f}")


Number of disagreements between the two models: 222
Accuracy of Model 1 on disagreements: 0.752
Accuracy of Model 2 on disagreements: 0.248
Model 1 Performance on disagreements:
Precision: 0.855
Recall: 0.500
F1-score: 0.631
AUC-ROC: 0.719

Model 2 Performance on disagreements:
Precision: 0.281
Recall: 0.500
F1-score: 0.360
AUC-ROC: 0.281


In [27]:

X_disagreements = X.iloc[disagreements]

language_column = 'persoonlijke_eigenschappen_spreektaal'

print(X_disagreements[language_column].value_counts())

print("----------")

for language in X_disagreements[language_column].unique():
    language_mask = X_disagreements[language_column] == language
    y_true = y_disagreements[language_mask]
    y_pred_m1_disagreements = y_pred_m1[0][disagreements][language_mask]
    y_pred_m2_disagreements = y_pred_m2[0][disagreements][language_mask]
    accuracy = accuracy_score(y_true, y_pred_m1_disagreements)
    print(f"Accuracy for language '{language}' with Model 1: {accuracy:.3f}")
    accuracy = accuracy_score(y_true, y_pred_m2_disagreements)
    print(f"Accuracy for language '{language}' with Model 2: {accuracy:.3f}")

0.0     120
57.0     88
3.0       4
99.0      3
1.0       2
70.0      2
25.0      1
73.0      1
59.0      1
Name: persoonlijke_eigenschappen_spreektaal, dtype: int64
----------
Accuracy for language '57.0' with Model 1: 0.705
Accuracy for language '57.0' with Model 2: 0.295
Accuracy for language '0.0' with Model 1: 0.783
Accuracy for language '0.0' with Model 2: 0.217
Accuracy for language '99.0' with Model 1: 1.000
Accuracy for language '99.0' with Model 2: 0.000
Accuracy for language '3.0' with Model 1: 0.250
Accuracy for language '3.0' with Model 2: 0.750
Accuracy for language '1.0' with Model 1: 1.000
Accuracy for language '1.0' with Model 2: 0.000
Accuracy for language '25.0' with Model 1: 1.000
Accuracy for language '25.0' with Model 2: 0.000
Accuracy for language '70.0' with Model 1: 1.000
Accuracy for language '70.0' with Model 2: 0.000
Accuracy for language '73.0' with Model 1: 1.000
Accuracy for language '73.0' with Model 2: 0.000
Accuracy for language '59.0' with Model 1: 1.

##### We will check other features to see whether there any discrepancies


In [28]:
for feature in X_disagreements.columns:
    for value in X_disagreements[feature].unique():
        mask = X_disagreements[feature] == value
        y_true = y_disagreements[mask]
        count = len(y_true)
        y_pred_m1_disagreements = y_pred_m1[0][disagreements][mask]
        y_pred_m2_disagreements = y_pred_m2[0][disagreements][mask]
        accuracy_m1 = accuracy_score(y_true, y_pred_m1_disagreements)
        accuracy_m2 = accuracy_score(y_true, y_pred_m2_disagreements)
        print("Feature {}: Value {} (Count: {}): M1: {:.3f} xxx M2: {:.3f} () ".format(feature, value, count, accuracy_m1, accuracy_m2))
    print("----------")

Feature adres_aantal_brp_adres: Value 4.0 (Count: 37): M1: 0.730 xxx M2: 0.270 () 
Feature adres_aantal_brp_adres: Value 1.0 (Count: 29): M1: 0.828 xxx M2: 0.172 () 
Feature adres_aantal_brp_adres: Value 2.0 (Count: 65): M1: 0.815 xxx M2: 0.185 () 
Feature adres_aantal_brp_adres: Value 3.0 (Count: 41): M1: 0.732 xxx M2: 0.268 () 
Feature adres_aantal_brp_adres: Value 5.0 (Count: 26): M1: 0.654 xxx M2: 0.346 () 
Feature adres_aantal_brp_adres: Value 6.0 (Count: 20): M1: 0.700 xxx M2: 0.300 () 
Feature adres_aantal_brp_adres: Value 7.0 (Count: 3): M1: 0.667 xxx M2: 0.333 () 
Feature adres_aantal_brp_adres: Value 8.0 (Count: 1): M1: 0.000 xxx M2: 1.000 () 
----------
Feature adres_aantal_verschillende_wijken: Value 3.0 (Count: 57): M1: 0.737 xxx M2: 0.263 () 
Feature adres_aantal_verschillende_wijken: Value 1.0 (Count: 51): M1: 0.765 xxx M2: 0.235 () 
Feature adres_aantal_verschillende_wijken: Value 2.0 (Count: 88): M1: 0.784 xxx M2: 0.216 () 
Feature adres_aantal_verschillende_wijken: Va

#### Accuracy discrepancies on the entire dataset again:

In [14]:
y_pred_m1 = session_m1.run(None, {'X': X.values.astype(np.float32)})
y_pred_m2 = session_m2.run(None, {'X': X.values.astype(np.float32)})

discrepancies = []

for feature in X.columns:
    for value in X[feature].unique():
        mask = X[feature] == value
        y_true = y[mask]
        count = len(y_true)
        if count < 0.05 * len(y):
            continue
        accuracy_m1 = accuracy_score(y_true, y_pred_m1[0][mask])
        accuracy_m2 = accuracy_score(y_true, y_pred_m2[0][mask])
        print("Feature {}: Value {} (Count: {}): M1: {:.3f} xxx M2: {:.3f}".format(feature, value, count, accuracy_m1, accuracy_m2))
        if abs(accuracy_m1 - accuracy_m2) > 0.02:
            discrepancies.append((feature, value, count, accuracy_m1, accuracy_m2))
    print("----------")

Feature adres_aantal_brp_adres: Value 4.0 (Count: 1982): M1: 0.990 xxx M2: 0.982
Feature adres_aantal_brp_adres: Value 3.0 (Count: 2961): M1: 0.990 xxx M2: 0.983
Feature adres_aantal_brp_adres: Value 2.0 (Count: 3856): M1: 0.992 xxx M2: 0.982
Feature adres_aantal_brp_adres: Value 1.0 (Count: 2087): M1: 0.992 xxx M2: 0.983
Feature adres_aantal_brp_adres: Value 5.0 (Count: 1073): M1: 0.985 xxx M2: 0.978
----------
Feature adres_aantal_verschillende_wijken: Value 3.0 (Count: 2636): M1: 0.989 xxx M2: 0.978
Feature adres_aantal_verschillende_wijken: Value 2.0 (Count: 5485): M1: 0.991 xxx M2: 0.982
Feature adres_aantal_verschillende_wijken: Value 1.0 (Count: 3533): M1: 0.992 xxx M2: 0.984
Feature adres_aantal_verschillende_wijken: Value 4.0 (Count: 814): M1: 0.982 xxx M2: 0.974
----------
Feature adres_aantal_verzendadres: Value 1.0 (Count: 5126): M1: 0.992 xxx M2: 0.984
Feature adres_aantal_verzendadres: Value 0.0 (Count: 7343): M1: 0.989 xxx M2: 0.980
----------
Feature adres_aantal_woonad

In [15]:
discrepancies.sort(key=lambda x: abs(x[3] - x[4]), reverse=True)
for feature, value, count, accuracy_m1, accuracy_m2 in discrepancies:
    print("Discrepancy {} --- Feature {}: Value {} (Count: {}): M1: {:.3f} xxx M2: {:.3f}".format(abs(accuracy_m1 - accuracy_m2), feature, value, count, accuracy_m1, accuracy_m2))

Discrepancy 0.02689486552567233 --- Feature deelname_act_reintegratieladder_ondersteunende_instrumenten: Value 0.0 (Count: 818): M1: 0.990 xxx M2: 0.963
Discrepancy 0.023579849946409492 --- Feature contacten_onderwerp_traject: Value 1.0 (Count: 933): M1: 0.989 xxx M2: 0.966
Discrepancy 0.022457067371202122 --- Feature instrument_ladder_historie_activering: Value 0.0 (Count: 1514): M1: 0.987 xxx M2: 0.964
Discrepancy 0.022096608427543685 --- Feature persoonlijke_eigenschappen_ind_buiten_kantoortijden: Value 1.0 (Count: 1946): M1: 0.990 xxx M2: 0.968
Discrepancy 0.022045855379188684 --- Feature contacten_onderwerp_contact_met_aanbieder: Value 0.0 (Count: 1134): M1: 0.984 xxx M2: 0.962
Discrepancy 0.021592442645074206 --- Feature contacten_onderwerp_no_show: Value 3.0 (Count: 741): M1: 0.978 xxx M2: 0.957
Discrepancy 0.021158129175946505 --- Feature contacten_soort_afgelopenjaar_e_mail__uitgaand_: Value 4.0 (Count: 898): M1: 0.990 xxx M2: 0.969
Discrepancy 0.02092050209205021 --- Feature 

## Tests from part 1

### Demographic parity tests

#### Gender Parity Test

In [29]:
gender_groups = X.groupby('persoon_geslacht_vrouw')

gender_labels = {0: 'Men', 1: 'Women'}

gender_risk_scores_1 = {}
gender_risk_scores_2 = {}

for gender, group in gender_groups:
    preds_1 = y_pred_m1[0][group.index]
    preds_2 = y_pred_m2[0][group.index]
    gender_label = gender_labels[gender]  
    gender_risk_scores_1[gender_label] = preds_1.mean()
    gender_risk_scores_2[gender_label] = preds_2.mean()


print("\nGender Parity Test <<MODEL 1>>:")
print(gender_risk_scores_1)


print("\nGender Parity Test <<MODEL 2>>:")
print(gender_risk_scores_2)


Gender Parity Test <<MODEL 1>>:
{'Men': 0.09568939162335677, 'Women': 0.09110273635916762}

Gender Parity Test <<MODEL 2>>:
{'Men': 0.10241516355854478, 'Women': 0.10224479764050468}


#### Age Parity Test

In [30]:
age_groups = [18, 26, 36, 46, 56, 66]
age_data = []
for i in range(len(age_groups)-1):
    lower_bound = age_groups[i]
    upper_bound = age_groups[i+1]
    group = X[(X['persoon_leeftijd_bij_onderzoek'] >= lower_bound) & (X['persoon_leeftijd_bij_onderzoek'] < upper_bound)]
    age_data.append(group)

age_risk_scores_1 = []
for group in age_data:
    preds = y_pred_m1[0][group.index]
    age_risk_scores_1.append(preds.mean())

print("\nAge Parity Test <<MODEL 1>>:")
for i, group in enumerate(age_groups[:-1]):
    print(f"Age group {group}-{age_groups[i+1]}: Mean risk score = {age_risk_scores_1[i]}")


age_risk_scores_2 = []
for group in age_data:
    preds = y_pred_m2[0][group.index]
    age_risk_scores_2.append(preds.mean())

print("\nAge Parity Test <<MODEL 2>>:")
for i, group in enumerate(age_groups[:-1]):
    print(f"Age group {group}-{age_groups[i+1]}: Mean risk score = {age_risk_scores_2[i]}")


Age Parity Test <<MODEL 1>>:
Age group 18-26: Mean risk score = 0.3551912568306011
Age group 26-36: Mean risk score = 0.23436041083099907
Age group 36-46: Mean risk score = 0.13937397034596374
Age group 46-56: Mean risk score = 0.05741311042674879
Age group 56-66: Mean risk score = 0.02882984737139627

Age Parity Test <<MODEL 2>>:
Age group 18-26: Mean risk score = 0.3442622950819672
Age group 26-36: Mean risk score = 0.23622782446311857
Age group 36-46: Mean risk score = 0.14892915980230642
Age group 46-56: Mean risk score = 0.0719313682358117
Age group 56-66: Mean risk score = 0.03759185980780102


#### "Other comments" Parity Test

In [18]:
adres_groups = X.groupby('typering_other')

adres_risk_scores_1 = {}
for adres, group in adres_groups:
    preds = y_pred_m1[0][group.index]
    adres_risk_scores_1[adres] = preds.mean()

print("\nDemographic Parity Test for 'typering_other' <<MODEL 1>>:")
print(adres_risk_scores_1)

adres_risk_scores_2 = {}
for adres, group in adres_groups:
    preds = y_pred_m2[0][group.index]
    adres_risk_scores_2[adres] = preds.mean()

print("\nDemographic Parity Test for 'typering_other' <<MODEL 2>>:")
print(adres_risk_scores_2)


Demographic Parity Test for 'typering_other' <<MODEL 1>>:
{0.0: 0.08230842005676443, 1.0: 0.10791925465838509, 2.0: 0.18085106382978725}

Demographic Parity Test for 'typering_other' <<MODEL 2>>:
{0.0: 0.09176915799432356, 1.0: 0.11548913043478261, 2.0: 0.2127659574468085}


### Wilcoxon rank-sum test

In [19]:
from scipy.stats import ranksums

predicted_probabilities_1 = session_m1.run(None, {'X': X.values.astype(np.float32)})[0]
predicted_probabilities_2 = session_m2.run(None, {'X': X.values.astype(np.float32)})[0]

female_predicted_1 = predicted_probabilities_1[X['persoon_geslacht_vrouw'] == 1]
male_predicted_1 = predicted_probabilities_1[X['persoon_geslacht_vrouw'] == 0]

statistic, p_value = ranksums(female_predicted_1, male_predicted_1)

print("Wilcoxon Rank-Sum Test <<MODEL 1>>:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")
print("----------")

female_predicted_2 = predicted_probabilities_2[X['persoon_geslacht_vrouw'] == 1]
male_predicted_2 = predicted_probabilities_2[X['persoon_geslacht_vrouw'] == 0]

statistic, p_value = ranksums(female_predicted_2, male_predicted_2)

print("Wilcoxon Rank-Sum Test <<MODEL 2>>:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")

Wilcoxon Rank-Sum Test <<MODEL 1>>:
Statistic: -0.4463824309826691
P-value: 0.6553210215365163
----------
Wilcoxon Rank-Sum Test <<MODEL 2>>:
Statistic: -0.016580350663172434
P-value: 0.9867714003039769


### Kolmorogov-Smirnov test


In [20]:
model_preds_1 = y_pred_m1[0]
model_preds_2 = y_pred_m2[0]

print("\nKolmogorov-Smirnov test:")
for i in range(len(age_data)):
    for j in range(i+1, len(age_data)):
        group_i_preds_1 = model_preds_1[age_data[i].index]
        group_j_preds_1 = model_preds_1[age_data[j].index]
        group_i_preds_2 = model_preds_2[age_data[i].index]
        group_j_preds_2 = model_preds_2[age_data[j].index]
        ks_stat_1, ks_pval_1 = ks_2samp(group_i_preds_1, group_j_preds_1)
        ks_pval_2, ks_pval_1 = ks_2samp(group_i_preds_2, group_j_preds_2)
        print(f"  Age groups {age_groups[i]}-{age_groups[i+1]} and {age_groups[j]}-{age_groups[j+1]}:")
        print(f"    M1: Statistic: {ks_stat_1:.4f}, p-value: {ks_pval_1:.4f}")
        print(f"    M2: Statistic: {ks_stat_1:.4f}, p-value: {ks_pval_2:.4f}")


Kolmogorov-Smirnov test:
  Age groups 18-26 and 26-36:
    M1: Statistic: 0.1208, p-value: 0.0479
    M2: Statistic: 0.1208, p-value: 0.1080
  Age groups 18-26 and 36-46:
    M1: Statistic: 0.2158, p-value: 0.0000
    M2: Statistic: 0.2158, p-value: 0.1953
  Age groups 18-26 and 46-56:
    M1: Statistic: 0.2978, p-value: 0.0000
    M2: Statistic: 0.2978, p-value: 0.2723
  Age groups 18-26 and 56-66:
    M1: Statistic: 0.3264, p-value: 0.0000
    M2: Statistic: 0.3264, p-value: 0.3067
  Age groups 26-36 and 36-46:
    M1: Statistic: 0.0950, p-value: 0.0000
    M2: Statistic: 0.0950, p-value: 0.0873


  Age groups 26-36 and 46-56:
    M1: Statistic: 0.1769, p-value: 0.0000
    M2: Statistic: 0.1769, p-value: 0.1643
  Age groups 26-36 and 56-66:
    M1: Statistic: 0.2055, p-value: 0.0000
    M2: Statistic: 0.2055, p-value: 0.1986
  Age groups 36-46 and 46-56:
    M1: Statistic: 0.0820, p-value: 0.0000
    M2: Statistic: 0.0820, p-value: 0.0770
  Age groups 36-46 and 56-66:
    M1: Statistic: 0.1105, p-value: 0.0000
    M2: Statistic: 0.1105, p-value: 0.1113
  Age groups 46-56 and 56-66:
    M1: Statistic: 0.0286, p-value: 0.0178
    M2: Statistic: 0.0286, p-value: 0.0343


## Fairness analysis

In [21]:
from sklearn.metrics import confusion_matrix
from scipy.spatial.distance import pdist, squareform

def fairness_analysis(X, y_pred, sensitive_feature):
    X = X.reset_index(drop=True)

    groups = X.groupby(sensitive_feature)

    for name, group in groups:
        y_group = y_pred[group.index]
        positive_prediction_rate = y_group.mean()
        print(f"For group {name}, positive prediction rate: {positive_prediction_rate}")

    for name, group in groups:
        y_group = y_pred[group.index]
        tn, fp, fn, tp = confusion_matrix(group[sensitive_feature], y_group).ravel()
        tpr = tp / (tp + fn + 1e-7)  
        fpr = fp / (fp + tn + 1e-7) 
        print(f"For group {name}, TPR: {tpr}, FPR: {fpr}")

    # print("HERE3")

    # distances = pdist(X, 'euclidean')
    # prediction_differences = pdist(y_pred.reshape(-1, 1), 'cityblock')
    # correlation = np.corrcoef(distances, prediction_differences)[0, 1]
    # print(f"Correlation between distances and prediction differences: {correlation}")

predictions1 = y_pred_m1[0].copy()
predictions2 = y_pred_m2[0].copy()

In [22]:
print("Model 1:")
fairness_analysis(X.copy(), predictions1, 'persoon_geslacht_vrouw')

Model 1:
For group 0.0, positive prediction rate: 0.09568939162335677
For group 1.0, positive prediction rate: 0.09110273635916762
For group 0.0, TPR: 0.0, FPR: 0.09568939162189408
For group 1.0, TPR: 0.09110273635767487, FPR: 0.0


In [23]:
print("Model 2:")
fairness_analysis(X.copy(), predictions2, 'persoon_geslacht_vrouw')

Model 2:
For group 0.0, positive prediction rate: 0.10241516355854478
For group 1.0, positive prediction rate: 0.10224479764050468
For group 0.0, TPR: 0.0, FPR: 0.10241516355697929
For group 1.0, TPR: 0.10224479763882935, FPR: 0.0


In [24]:
from sklearn.metrics import precision_score, recall_score

def fairness_analysis_non_binary(X_test, y_test, y_pred, sensitive_feature):
    groups = X_test.groupby(sensitive_feature)

    precision_dict = {}
    recall_dict = {}

    for name, group in groups:
        y_group_test = y_test[group.index]
        y_group_pred = y_pred[group.index]
        precision_dict[name] = precision_score(y_group_test, y_group_pred)
        recall_dict[name] = recall_score(y_group_test, y_group_pred)

    print("Demographic Parity Test:")
    for name, precision in precision_dict.items():
        print(f"For group {name}, positive prediction rate (precision): {precision}")

    print("\nEqual Opportunity Test:")
    for name, recall in recall_dict.items():
        print(f"For group {name}, true positive rate (recall): {recall}")

    print("\nDisparate Impact Test:")
    for name1 in precision_dict:
        for name2 in precision_dict:
            if name1 != name2:
                ratio = precision_dict[name1] / precision_dict[name2]
                print(f"Ratio of positive prediction rates between group {name1} and group {name2}: {ratio}")

In [None]:
bins = [18, 26, 36, 46, 56, 66]
labels = [1, 2, 3, 4, 5]

X_test = X.copy().reset_index(drop=True)

X_test['persoon_leeftijd_bij_onderzoek'] = pd.cut(X_test['persoon_leeftijd_bij_onderzoek'], bins=bins, labels=labels, right=False)

print("Model 1:")
fairness_analysis_non_binary(X.copy(), y.copy(), predictions1.copy(), 'persoon_leeftijd_bij_onderzoek')
print("Model 2:")
fairness_analysis_non_binary(X.copy(), y.copy(), predictions2.copy(), 'persoon_leeftijd_bij_onderzoek')

: 

## Metamorphic Tests

In [31]:
X_copy = X.copy()
X['persoon_geslacht_vrouw'] = 1.0 - X['persoon_geslacht_vrouw']

y_pred_1 = session_m1.run(None, {'X': X.values.astype(np.float32)})[0]
y_pred_2 = session_m2.run(None, {'X': X.values.astype(np.float32)})[0]
accuracy_1 = accuracy_score(y, y_pred_1)
accuracy_2 = accuracy_score(y, y_pred_2)
print(">>>>>>>>>MODEL 1<<<<<<<<<<")
print(classification_report(y, y_pred_1))
print('Accuracy of model 1: ', accuracy_1)

print()

print(">>>>>>>>>MODEL 2<<<<<<<<<<")
print(classification_report(y, y_pred_2))
print('Accuracy of model 2: ', accuracy_2)

X = X_copy.copy()

>>>>>>>>>MODEL 1<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11380
           1       0.98      0.92      0.95      1265

    accuracy                           0.99     12645
   macro avg       0.99      0.96      0.97     12645
weighted avg       0.99      0.99      0.99     12645

Accuracy of model 1:  0.9905100830367735

>>>>>>>>>MODEL 2<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     11380
           1       0.81      0.92      0.86      1265

    accuracy                           0.97     12645
   macro avg       0.90      0.95      0.92     12645
weighted avg       0.97      0.97      0.97     12645

Accuracy of model 2:  0.971055753262159


In [34]:
X['persoonlijke_eigenschappen_taaleis_voldaan'] = 1.0 - X['persoonlijke_eigenschappen_taaleis_voldaan']

y_pred_1 = session_m1.run(None, {'X': X.values.astype(np.float32)})[0]
y_pred_2 = session_m2.run(None, {'X': X.values.astype(np.float32)})[0]
accuracy_1 = accuracy_score(y, y_pred_1)
accuracy_2 = accuracy_score(y, y_pred_2)
print(">>>>>>>>>MODEL 1<<<<<<<<<<")
print(classification_report(y, y_pred_1))
print('Accuracy of model 1: ', accuracy_1)
print()
print(">>>>>>>>>MODEL 2<<<<<<<<<<")
print(classification_report(y, y_pred_2))
print('Accuracy of model 2: ', accuracy_2)

X = X_copy.copy()

>>>>>>>>>MODEL 1<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11380
           1       0.98      0.92      0.95      1265

    accuracy                           0.99     12645
   macro avg       0.99      0.96      0.97     12645
weighted avg       0.99      0.99      0.99     12645

Accuracy of model 1:  0.9897192566231712

>>>>>>>>>MODEL 2<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     11380
           1       0.80      0.92      0.86      1265

    accuracy                           0.97     12645
   macro avg       0.89      0.95      0.92     12645
weighted avg       0.97      0.97      0.97     12645

Accuracy of model 2:  0.968920521945433


In [35]:
X['deelname_act_reintegratieladder_werk_re_integratie'] = 1.0 - X['deelname_act_reintegratieladder_werk_re_integratie']

y_pred_1 = session_m1.run(None, {'X': X.values.astype(np.float32)})[0]
y_pred_2 = session_m2.run(None, {'X': X.values.astype(np.float32)})[0]
accuracy_1 = accuracy_score(y, y_pred_1)
accuracy_2 = accuracy_score(y, y_pred_2)
print(">>>>>>>>>MODEL 1<<<<<<<<<<")
print(classification_report(y, y_pred_1))
print('Accuracy of model 1: ', accuracy_1)
print()
print(">>>>>>>>>MODEL 2<<<<<<<<<<")
print(classification_report(y, y_pred_2))
print('Accuracy of model 2: ', accuracy_2)

X = X_copy.copy()

>>>>>>>>>MODEL 1<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11380
           1       0.98      0.92      0.95      1265

    accuracy                           0.99     12645
   macro avg       0.99      0.96      0.97     12645
weighted avg       0.99      0.99      0.99     12645

Accuracy of model 1:  0.9904310003954132

>>>>>>>>>MODEL 2<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     11380
           1       0.80      0.92      0.86      1265

    accuracy                           0.97     12645
   macro avg       0.90      0.95      0.92     12645
weighted avg       0.97      0.97      0.97     12645

Accuracy of model 2:  0.9699485962831158


#### Flip the most 2 common languages

In [36]:
language_counts = X['persoonlijke_eigenschappen_spreektaal'].value_counts()
most_common_languages = language_counts.index[:2]

indices = X[X['persoonlijke_eigenschappen_spreektaal'].isin(most_common_languages)].index

X.loc[indices, 'persoonlijke_eigenschappen_spreektaal'] = X.loc[indices, 'persoonlijke_eigenschappen_spreektaal'].replace({most_common_languages[0]: most_common_languages[1], most_common_languages[1]: most_common_languages[0]})

y_pred_1 = session_m1.run(None, {'X': X.values.astype(np.float32)})[0]
y_pred_2 = session_m2.run(None, {'X': X.values.astype(np.float32)})[0]
accuracy_1 = accuracy_score(y, y_pred_1)
accuracy_2 = accuracy_score(y, y_pred_2)
print(">>>>>>>>>MODEL 1<<<<<<<<<<")
print(classification_report(y, y_pred_1))
print('Accuracy of model 1: ', accuracy_1)
print()
print(">>>>>>>>>MODEL 2<<<<<<<<<<")
print(classification_report(y, y_pred_2))
print('Accuracy of model 2: ', accuracy_2)

X = X_copy.copy()

>>>>>>>>>MODEL 1<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11380
           1       0.98      0.92      0.95      1265

    accuracy                           0.99     12645
   macro avg       0.98      0.96      0.97     12645
weighted avg       0.99      0.99      0.99     12645

Accuracy of model 1:  0.9894029260577303

>>>>>>>>>MODEL 2<<<<<<<<<<
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     11380
           1       0.80      0.92      0.85      1265

    accuracy                           0.97     12645
   macro avg       0.89      0.95      0.92     12645
weighted avg       0.97      0.97      0.97     12645

Accuracy of model 2:  0.9685251087386318
