In [1]:
import os, sys
# Put the parent of the current working directory on sys.path
# (presumably so the project-local `brio` package can be imported — confirm).
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

In [45]:
from brio.utils.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split
from pickle import dump, load
import pandas as pd
import numpy as np

from brio.bias.FreqVsFreqBiasDetector import FreqVsFreqBiasDetector
from brio.bias.FreqVsRefBiasDetector import FreqVsRefBiasDetector

## Importing Data and Trained Classifier

In [3]:
# Relative locations of the raw CSV dataset and of the directory holding the
# pickled preprocessing artifacts (the `_ohe.pkl` / `_scaler.pkl` files loaded below).
input_data_path = "../data/raw_data/uci-default-of-credit-card/data/data.csv"
local_path_save = '../data/mlflow_artifacts/'

In [4]:
# Load the previously fitted one-hot encoder and scaler.
# Context managers close the file handles deterministically (the bare
# `load(open(...))` form left them open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code — only load trusted artifacts.
with open(local_path_save + '_ohe.pkl', 'rb') as ohe_file:
    fitted_ohe = load(ohe_file)
with open(local_path_save + '_scaler.pkl', 'rb') as scaler_file:
    fitted_scaler = load(scaler_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [5]:
# Read the dataset into features X and target Y; "default" is the second
# Preprocessing argument (presumably the target column name — confirm).
pp = Preprocessing(input_data_path, "default")
X, Y = pp.read_dataframe()

# Hold out 30% of the rows as the test set, using a fixed random seed.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=420)

# Preprocess only the test features, passing the already fitted encoder and scaler;
# the second and third return values are discarded.
# NOTE(review): fit_ohe=True is passed together with a fitted encoder — confirm
# this does not refit the encoder on the test data.
X_test_ohe, _, _ = pp.preprocess_for_classification(df=X_test, 
                                                fit_ohe=True, 
                                                fitted_ohe=fitted_ohe,
                                                perform_scaling=True,
                                                fitted_scaler=fitted_scaler)

In [6]:
# Load the trained classifier pickled under the local `mlruns` directory.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
with open("./mlruns/1/1e4a0667c7a64cbe8c7b023410e5781c/artifacts/model/model.pkl", "rb") as file:
    classifier = load(file)

In [7]:
# Score the preprocessed test set: class probabilities and predicted labels.
predicted_prob = classifier.predict_proba(X_test_ohe)
predicted_values = classifier.predict(X_test_ohe)

#### Definition of conditioning variables

In [8]:
def age_buckets(x):
    """Map an age to a bucket id.

    Returns 1 when ``x < 30``, 2 when ``30 <= x < 40`` and 3 in every other case.
    """
    if x < 30:
        return 1
    return 2 if x < 40 else 3

# Bucket the ages of the test rows only. Computing from X_test (instead of the
# full X) avoids relying on index alignment, and `assign` returns a new frame
# rather than inserting a column in place into a frame derived from a split.
X_test = X_test.assign(age_buckets=X_test['x5_age'].apply(age_buckets))

In [9]:
# Columns used to split the data into subgroups for the conditioned comparisons.
conditioning_variables = ['x3_education', 'x4_marriage', 'age_buckets']

In [10]:
# Test features (index reset to 0..n-1) side by side with the predicted labels;
# the unnamed Series arrives as column 0 and is renamed to "predictions".
df_with_predictions = (
    pd.concat(
        [X_test.reset_index(drop=True), pd.Series(predicted_values)],
        axis=1,
    )
    .rename(columns={0:"predictions"})
)

## Hazard and risk functions

In [11]:
def hazard_function(overall_result, conditioned_results, tot_observations):
    """Reduce the results of one bias test to a hazard score.

    Args:
        overall_result: indexable with the test result at [0], the boolean
            outcome at [1] and the threshold at [2].
        conditioned_results: mapping whose values are indexable as
            (num_samples, test result, boolean outcome, threshold, ...).
            Entries whose test result is None are ignored.
        tot_observations: total number of observations; each entry's
            num_samples is divided by it.

    Returns:
        (hazard, average_threshold): the sum, over entries whose outcome
        equals False, of ``(num_samples / tot_observations) * (result - threshold)``,
        and the mean threshold over all retained entries.
    """
    # Each record: (test result, threshold, num_samples, boolean outcome)
    records = [
        (overall_result[0], overall_result[2], tot_observations, overall_result[1])
    ]
    records.extend(
        (group[1], group[3], group[0], group[2])
        for group in conditioned_results.values()
        if group[1] is not None
    )

    weight = 1  # to be implemented
    hazard = 0
    for result, threshold, num_samples, outcome in records:
        # Equality (not identity) so that numpy booleans are matched as well.
        delta = 1 if outcome == False else 0
        share = num_samples / tot_observations
        excess = result - threshold
        hazard += delta * weight * share * excess

    average_threshold = np.mean([record[1] for record in records])

    return hazard, average_threshold

In [12]:
def hazard_function_2(overall_result, conditioned_results, tot_observations):
    """Variant of the hazard score in which each term is scaled by its threshold.

    Args:
        overall_result: indexable with the test result at [0], the boolean
            outcome at [1] and the threshold at [2].
        conditioned_results: mapping whose values are indexable as
            (num_samples, test result, boolean outcome, threshold, ...).
            Entries whose test result is None are ignored.
        tot_observations: total number of observations; each entry's
            num_samples is divided by it.

    Returns:
        (hazard, average_threshold): the sum, over entries whose outcome
        equals False, of
        ``(num_samples / tot_observations) * (result - threshold) * threshold``,
        and the mean threshold over all retained entries.
    """
    # Each record: (test result, threshold, num_samples, boolean outcome)
    records = [
        (overall_result[0], overall_result[2], tot_observations, overall_result[1])
    ]
    records.extend(
        (group[1], group[3], group[0], group[2])
        for group in conditioned_results.values()
        if group[1] is not None
    )

    weight = 1  # to be implemented
    hazard = 0
    for result, threshold, num_samples, outcome in records:
        # Equality (not identity) so that numpy booleans are matched as well.
        delta = 1 if outcome == False else 0
        share = num_samples / tot_observations
        excess = result - threshold
        # Additionally scaled by the threshold of the individual group.
        hazard += delta * weight * share * excess * threshold

    average_threshold = np.mean([record[1] for record in records])

    return hazard, average_threshold

In [13]:
def risk_function(test_hazards, average_thresholds):
    """Combine per-test hazards into a single risk value.

    Args:
        test_hazards: list of hazards, one per test.
        average_thresholds: [mean(thresholds_of_test1), mean(thresholds_of_test2), ...],
            needed if the automatic threshold is used.

    Returns:
        The sum of ``hazard * threshold`` over the paired inputs, divided by
        the square of ``len(test_hazards)``.
    """
    weighted_total = 0
    for hazard, threshold in zip(test_hazards, average_thresholds):
        weighted_total = weighted_total + hazard * threshold

    n_tests = len(test_hazards)
    return weighted_total / n_tests ** 2

In [14]:
def risk_function_2(test_hazards, average_thresholds):
    """Variant of the risk value that does not weight hazards by threshold.

    Args:
        test_hazards: list of hazards, one per test.
        average_thresholds: [mean(thresholds_of_test1), mean(thresholds_of_test2), ...];
            only used to pair with the hazards, the values themselves are ignored.

    Returns:
        The sum of the paired hazards divided by the square of
        ``len(test_hazards)``.
    """
    hazard_total = 0
    for hazard, _ in zip(test_hazards, average_thresholds):
        # Threshold factor removed in this variant.
        hazard_total = hazard_total + hazard

    n_tests = len(test_hazards)
    return hazard_total / n_tests ** 2

### Test 1: TVD, A1=high

In [15]:
# Detector for Test 1: distance="TVD", A1="high".
bd_1 = FreqVsFreqBiasDetector(distance="TVD", A1="high")

In [16]:
# Overall (unconditioned) comparison of `predictions` between the `x2_sex` groups.
overall_1 = bd_1.compare_root_variable_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex')

In [17]:
# The same comparison, run within the subgroups defined by the conditioning variables.
conditioned_1 = bd_1.compare_root_variable_conditioned_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    conditioning_variables=conditioning_variables)

In [18]:
# Reduce the overall and conditioned results of Test 1 to a hazard and a mean threshold;
# the number of rows in the frame is the total observation count.
hazard_test_1, average_threshold_1 = hazard_function(
    overall_1, 
    conditioned_1, 
    df_with_predictions.shape[0])

In [19]:
hazard_test_1

0.09976465885178575

In [20]:
# Score the same Test 1 results with the hazard_function_2 variant for comparison.
hazard_test_1_2, average_threshold_1_2 = hazard_function_2(
    overall_1, 
    conditioned_1, 
    df_with_predictions.shape[0])

In [21]:
hazard_test_1_2

0.0016526941966088404

### Test 2: TVD, A1=low

In [22]:
# Detector for Test 2: distance="TVD", A1="low".
bd_2 = FreqVsFreqBiasDetector(distance="TVD", A1="low")

In [23]:
# Overall (unconditioned) comparison of `predictions` between the `x2_sex` groups.
overall_2 = bd_2.compare_root_variable_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex')

In [24]:
# The same comparison, run within the subgroups defined by the conditioning variables.
conditioned_2 = bd_2.compare_root_variable_conditioned_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    conditioning_variables=conditioning_variables)

In [25]:
# Reduce the overall and conditioned results of Test 2 to a hazard and a mean threshold.
hazard_test_2, average_threshold_2 = hazard_function(
    overall_2, 
    conditioned_2, 
    df_with_predictions.shape[0])

### Test 3: JS, A1=high

In [26]:
# Detector for Test 3: distance="JS", A1="high".
bd_3 = FreqVsFreqBiasDetector(distance="JS", A1="high")

In [27]:
# Overall (unconditioned) comparison of `predictions` between the `x2_sex` groups.
overall_3 = bd_3.compare_root_variable_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex')

In [28]:
# The same comparison, run within the subgroups defined by the conditioning variables.
conditioned_3 = bd_3.compare_root_variable_conditioned_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    conditioning_variables=conditioning_variables)

In [29]:
# Reduce the overall and conditioned results of Test 3 to a hazard and a mean threshold.
hazard_test_3, average_threshold_3 = hazard_function(
    overall_3, 
    conditioned_3, 
    df_with_predictions.shape[0])

### Test 4: JS, A1=low

In [30]:
# Detector for Test 4: distance="JS", A1="low".
bd_4 = FreqVsFreqBiasDetector(distance="JS", A1="low")

In [31]:
# Overall (unconditioned) comparison of `predictions` between the `x2_sex` groups.
overall_4 = bd_4.compare_root_variable_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex')

In [32]:
# The same comparison, run within the subgroups defined by the conditioning variables.
conditioned_4 = bd_4.compare_root_variable_conditioned_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    conditioning_variables=conditioning_variables)

In [33]:
# Reduce the overall and conditioned results of Test 4 to a hazard and a mean threshold.
hazard_test_4, average_threshold_4 = hazard_function(
    overall_4, 
    conditioned_4, 
    df_with_predictions.shape[0])

In [34]:
overall_4

(0.0011441803173238346, True, 0.038868585412256317, None)

In [35]:
conditioned_4

{'x3_education==1': (3119,
  0.000889500656873715,
  True,
  0.03895143959289674,
  None),
 'x3_education==3': (1499,
  0.0015440397130407938,
  True,
  0.03904057062398841,
  None),
 'x3_education==2': (4250,
  0.0014069427700605507,
  True,
  0.038922567122490655,
  None),
 'x3_education==4': (40, 0.0, True, 0.04052878118384472, None),
 'x3_education==5': (75, 0.01773188348797698, True, 0.04004903810567666, None),
 'x3_education==6': (14, None, 'Not enough observations'),
 'x3_education==0': (3, None, 'Not enough observations'),
 'x4_marriage==1': (4065,
  0.0006622420845844328,
  True,
  0.03892645023759367,
  None),
 'x4_marriage==2': (4822,
  0.0017816477735895428,
  True,
  0.03891200891721621,
  None),
 'x4_marriage==3': (95, 0.005139203671928956, True, 0.0399042256460958, None),
 'x4_marriage==0': (18, None, 'Not enough observations'),
 'age_buckets==3': (2727,
  0.0005520467561810636,
  True,
  0.038965431871108035,
  None),
 'age_buckets==1': (2895,
  0.00014080595312827762,


## Risk results

In [36]:
# Collect the four hazards and their mean thresholds in matching order (Test 1..4).
hazards = [hazard_test_1, hazard_test_2, hazard_test_3, hazard_test_4]
average_thresholds = [average_threshold_1, average_threshold_2, average_threshold_3, average_threshold_4]

In [37]:
# Overall risk from the four tests (threshold-weighted variant).
risk_function(hazards, average_thresholds)

0.00015458111237839916

# Experiments with 3 models

In [38]:
# Load the three pickled models that are compared below.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
with open("./trained_model_for_testing/RF_12_200.pkl", "rb") as file:
    classifier_1 = load(file)
    
with open("./trained_model_for_testing/RF_37_10.pkl", "rb") as file:
    classifier_2 = load(file)
    
with open("./trained_model_for_testing/Tree_depth2.pkl", "rb") as file:
    classifier_3 = load(file)

In [39]:
def _predict_and_join(model):
    """Score the preprocessed test set with ``model``.

    Returns a triple: the ``predict_proba`` output, the ``predict`` output, and
    the test features (index reset) with the predicted labels appended as a
    "predictions" column.
    """
    probabilities = model.predict_proba(X_test_ohe)
    values = model.predict(X_test_ohe)
    # The unnamed Series arrives as column 0 and is renamed to "predictions".
    frame = pd.concat(
        [X_test.reset_index(drop=True),
         pd.Series(values)], axis=1).rename(columns={0:"predictions"})
    return probabilities, values, frame


# One call per model replaces three copy-pasted stanzas; the notebook-level
# names used by later cells are unchanged.
predicted_prob_1, predicted_values_1, df_with_predictions_1 = _predict_and_join(classifier_1)
predicted_prob_2, predicted_values_2, df_with_predictions_2 = _predict_and_join(classifier_2)
predicted_prob_3, predicted_values_3, df_with_predictions_3 = _predict_and_join(classifier_3)

In [40]:
def test_model(data_frame):
    """Run the four FreqVsFreq bias tests on ``data_frame`` and return the overall risk.

    For each (distance, A1) configuration, in the order (TVD, high), (TVD, low),
    (JS, high), (JS, low), a ``FreqVsFreqBiasDetector`` is built, the overall and
    the conditioned comparisons are run with ``x2_sex`` as root variable and
    ``predictions`` as target variable, and the results are reduced with
    ``hazard_function``. Each hazard is printed as soon as it is computed.

    Args:
        data_frame: frame holding the ``predictions`` and ``x2_sex`` columns and
            the columns listed in the notebook-level ``conditioning_variables``.

    Returns:
        ``risk_function`` applied to the four hazards and their average thresholds.
    """
    # (distance, A1) pairs; list position + 1 is the test number shown in the output.
    test_configurations = [("TVD", "high"), ("TVD", "low"), ("JS", "high"), ("JS", "low")]

    hazards = []
    average_thresholds = []
    for test_number, (distance, a1) in enumerate(test_configurations, start=1):
        detector = FreqVsFreqBiasDetector(distance=distance, A1=a1)

        overall = detector.compare_root_variable_groups(
            dataframe=data_frame,
            target_variable='predictions',
            root_variable='x2_sex')

        conditioned = detector.compare_root_variable_conditioned_groups(
            dataframe=data_frame,
            target_variable='predictions',
            root_variable='x2_sex',
            conditioning_variables=conditioning_variables)

        hazard, average_threshold = hazard_function(
            overall,
            conditioned,
            data_frame.shape[0])

        print(f"Test {test_number} ({distance}, A1={a1}) hazard: ", hazard)

        hazards.append(hazard)
        average_thresholds.append(average_threshold)

    return risk_function(hazards, average_thresholds)

In [41]:
# Run the four-test suite on each model's predictions and print its overall risk
# (hazard_function / risk_function variant).
for model, df in zip(["RF_12_200", "RF_37_10", "Tree_depth2"],
              [df_with_predictions_1, df_with_predictions_2, df_with_predictions_3]):
    print(f"Overall risk measure for model {model}: ", test_model(df))
    print("\n")

Test 1 (TVD, A1=high) hazard:  0.09976465885178575
Test 2 (TVD, A1=low) hazard:  0.02015304845287825
Test 3 (JS, A1=high) hazard:  0.00011458216656976323
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_12_200:  0.00015458111237839916


Test 1 (TVD, A1=high) hazard:  0.09085147869902101
Test 2 (TVD, A1=low) hazard:  0.014801760181546463
Test 3 (JS, A1=high) hazard:  1.52371151916934e-06
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_37_10:  0.00013193466737245095


Test 1 (TVD, A1=high) hazard:  0.03355105791241006
Test 2 (TVD, A1=low) hazard:  0.006384298991373367
Test 3 (JS, A1=high) hazard:  0.00014383137973982362
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model Tree_depth2:  5.113018082906323e-05




In [42]:
def test_model_2(data_frame):
    """Run the four FreqVsFreq bias tests using the ``_2`` hazard/risk variants.

    For each (distance, A1) configuration, in the order (TVD, high), (TVD, low),
    (JS, high), (JS, low), a ``FreqVsFreqBiasDetector`` is built, the overall and
    the conditioned comparisons are run with ``x2_sex`` as root variable and
    ``predictions`` as target variable, and the results are reduced with
    ``hazard_function_2``. Each hazard is printed as soon as it is computed.

    Args:
        data_frame: frame holding the ``predictions`` and ``x2_sex`` columns and
            the columns listed in the notebook-level ``conditioning_variables``.

    Returns:
        ``risk_function_2`` applied to the four hazards and their average thresholds.
    """
    # (distance, A1) pairs; list position + 1 is the test number shown in the output.
    test_configurations = [("TVD", "high"), ("TVD", "low"), ("JS", "high"), ("JS", "low")]

    hazards = []
    average_thresholds = []
    for test_number, (distance, a1) in enumerate(test_configurations, start=1):
        detector = FreqVsFreqBiasDetector(distance=distance, A1=a1)

        overall = detector.compare_root_variable_groups(
            dataframe=data_frame,
            target_variable='predictions',
            root_variable='x2_sex')

        conditioned = detector.compare_root_variable_conditioned_groups(
            dataframe=data_frame,
            target_variable='predictions',
            root_variable='x2_sex',
            conditioning_variables=conditioning_variables)

        hazard, average_threshold = hazard_function_2(
            overall,
            conditioned,
            data_frame.shape[0])

        print(f"Test {test_number} ({distance}, A1={a1}) hazard: ", hazard)

        hazards.append(hazard)
        average_thresholds.append(average_threshold)

    return risk_function_2(hazards, average_thresholds)

In [43]:
# Run the four-test suite on each model's predictions and print its overall risk
# (hazard_function_2 / risk_function_2 variant).
for model, df in zip(["RF_12_200", "RF_37_10", "Tree_depth2"],
              [df_with_predictions_1, df_with_predictions_2, df_with_predictions_3]):
    print(f"Overall risk measure for model {model}: ", test_model_2(df))
    print("\n")

Test 1 (TVD, A1=high) hazard:  0.0016526941966088404
Test 2 (TVD, A1=low) hazard:  0.0007892524565281126
Test 3 (JS, A1=high) hazard:  1.956457231838417e-06
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_12_200:  0.00015274394439804945


Test 1 (TVD, A1=high) hazard:  0.0015049775936835396
Test 2 (TVD, A1=low) hazard:  0.0005801440913963854
Test 3 (JS, A1=high) hazard:  2.6739671511961216e-08
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_37_10:  0.0001303217765469648


Test 1 (TVD, A1=high) hazard:  0.0005595555272103964
Test 2 (TVD, A1=low) hazard:  0.00025096748221272835
Test 3 (JS, A1=high) hazard:  2.4731412590394615e-06
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model Tree_depth2:  5.081225941763526e-05




In [41]:
# NOTE(review): this cell repeats the earlier `test_model` comparison loop verbatim
# and carries an out-of-order execution count; consider removing it.
for model, df in zip(["RF_12_200", "RF_37_10", "Tree_depth2"],
              [df_with_predictions_1, df_with_predictions_2, df_with_predictions_3]):
    print(f"Overall risk measure for model {model}: ", test_model(df))
    print("\n")

Test 1 (TVD, A1=high) hazard:  0.09976465885178575
Test 2 (TVD, A1=low) hazard:  0.02015304845287825
Test 3 (JS, A1=high) hazard:  0.00011458216656976323
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_12_200:  0.00015458111237839916


Test 1 (TVD, A1=high) hazard:  0.09085147869902101
Test 2 (TVD, A1=low) hazard:  0.014801760181546463
Test 3 (JS, A1=high) hazard:  1.52371151916934e-06
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model RF_37_10:  0.00013193466737245095


Test 1 (TVD, A1=high) hazard:  0.03355105791241006
Test 2 (TVD, A1=low) hazard:  0.006384298991373367
Test 3 (JS, A1=high) hazard:  0.00014383137973982362
Test 4 (JS, A1=low) hazard:  0.0
Overall risk measure for model Tree_depth2:  5.113018082906323e-05




In [46]:
# Detector that compares observed frequencies against a reference distribution,
# built with its default settings.
bd_ref = FreqVsRefBiasDetector()

In [54]:
# Reference shares of prediction 0 / prediction 1 for each sex.
male_0_ref = 55/100
male_1_ref = 45/100

female_0_ref = 50/100
female_1_ref = 50/100

# NOTE(review): the list is ordered [female, male] — confirm this matches the
# order in which FreqVsRefBiasDetector enumerates the `x2_sex` groups.
ref_distribution = [np.array([female_0_ref, female_1_ref]), np.array([male_0_ref, male_1_ref])]

In [55]:
# Overall comparison of each `x2_sex` group's predictions against the reference
# distribution; no explicit threshold is passed (the override is commented out).
overall_ref = bd_ref.compare_root_variable_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    #threshold=0.1,
    reference_distribution=ref_distribution
)

In [56]:
overall_ref

([0.3708341585642676, 0.3668290639264449],
 [False, False],
 0.016368585412256314)

In [57]:
# The same reference comparison within the conditioning subgroups, with
# min_obs_per_group=30; no explicit threshold is passed (the override is commented out).
conditioned_ref = bd_ref.compare_root_variable_conditioned_groups(
    dataframe=df_with_predictions,
    target_variable='predictions',
    root_variable='x2_sex',
    conditioning_variables=conditioning_variables,
    #threshold=0.1,
    min_obs_per_group=30,
    reference_distribution=ref_distribution)

In [58]:
conditioned_ref

{'x3_education==1': (3119,
  [0.4620223006815293, 0.456287468022177],
  [False, False],
  0.016451439592896744),
 'x3_education==3': (1499,
  [0.30004054728107643, 0.301920000109016],
  [False, False],
  0.016540570623988414),
 'x3_education==2': (4250,
  [0.3297513070249223, 0.32991040581610154],
  [False, False],
  0.016422567122490656),
 'x3_education==4': (40, [1.0, 1.0], [False, False], 0.01802878118384471),
 'x3_education==5': (75,
  [0.45277540525442184, 0.6135760511412283],
  [False, False],
  0.017549038105676658),
 'x3_education==6': (14, None, 'Not enough observations'),
 'x3_education==0': (3, None, 'Not enough observations'),
 'x4_marriage==1': (4065,
  [0.376452060901446, 0.35992113661821357],
  [False, False],
  0.016426450237593673),
 'x4_marriage==2': (4822,
  [0.36513728387900557, 0.3741329246698629],
  [False, False],
  0.01641200891721621),
 'x4_marriage==3': (95,
  [0.44397106958324817, 0.2777694169405729],
  [False, False],
  0.017404225646095797),
 'x4_marriage==

In [None]:
# TODO: treat the results in the subgroups as independent tests (use a for loop)