In [1]:
import os, sys
# Put the parent of the current working directory on the import path so the
# project-local `src` package imported in the next cell can be resolved
# (presumably the notebook lives one level below the repository root).
dir2 = os.path.abspath('')      # '' resolves to the current working directory
dir1 = os.path.dirname(dir2)
# Guard keeps re-running this cell from appending duplicates.
if dir1 not in sys.path:
    sys.path.append(dir1)

In [2]:
from src.data_processing.Preprocessing import Preprocessing
from src.bias.BiasDetector import BiasDetector
from sklearn.model_selection import train_test_split
import numpy as np
from src.bias.Distance import Distance
from pickle import dump, load
import pandas as pd

In [3]:
# Location of the raw CSV; a relative path, so it is resolved against the
# notebook's working directory.
input_data_path = "../data/raw_data/uci-default-of-credit-card/data/data.csv"


In [4]:
# Restore a previously trained classifier from a pickled artifact
# (the `mlruns/...` layout suggests an MLflow run -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load artifacts
# produced by a trusted run. The run id is hard-coded in the path.
model_path = "mlruns/1/1e4a0667c7a64cbe8c7b023410e5781c/artifacts/model/model.pkl"
with open(model_path, "rb") as file:
    classifier = load(file)

In [5]:
# Read the dataset through the project's Preprocessing helper.
# NOTE(review): the meaning of the second argument ("default") is not visible
# in this notebook -- confirm against Preprocessing.
pp = Preprocessing(input_data_path, "default")
X, Y = pp.read_dataframe()

# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=420
)

# Row counts of the two splits before any encoding/scaling.
shapes_pre = (len(X_train), len(X_test))

# Training split: the call returns the transformed frame together with the
# encoder and scaler objects, which are reused for the test split below.
X_train_ohe, ohe, scaler = pp.preprocess_for_classification(
    df=X_train,
    fit_ohe=True,
    perform_scaling=True,
)

# Test split: the encoder and scaler obtained from the training call are
# passed in.
# NOTE(review): fit_ohe=True is passed here as well, alongside fitted_ohe;
# confirm that this reuses the fitted encoder rather than refitting on test data.
X_test_ohe, _, _ = pp.preprocess_for_classification(
    df=X_test,
    fit_ohe=True,
    fitted_ohe=ohe,
    perform_scaling=True,
    fitted_scaler=scaler,
)

In [6]:
# Score the preprocessed test split with the loaded classifier
# (scikit-learn style API: predict_proba / predict).
predicted_prob = classifier.predict_proba(X_test_ohe)
predicted_values = classifier.predict(X_test_ohe)

In [7]:
# Attach the predicted label to the un-encoded test features as a
# "predictions" column. The feature index is reset so that rows line up
# positionally with the freshly created prediction Series.
df_with_predictions = pd.concat(
    [
        X_test.reset_index(drop=True),
        pd.Series(predicted_values, name="predictions"),
    ],
    axis=1,
)

In [8]:
# Test rows for which the classifier predicted class 1.
predicted_ones = df_with_predictions[df_with_predictions["predictions"] == 1]

In [9]:
# Test rows for which the classifier predicted class 0.
predicted_zeroes = df_with_predictions[df_with_predictions["predictions"] == 0]

In [10]:
# Project helper (src.bias.Distance) used further down to compare
# frequency vectors.
dis = Distance()

In [11]:
# Codebook for the two categorical columns used below:
# X2 (x2_sex): Gender (1 = male; 2 = female).
# X3 (x3_education): Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).


In [12]:
# Share of each gender among the rows predicted as class 0, in the fixed
# order [female (code 2), male (code 1)].
# value_counts() orders its result by descending frequency, so without the
# explicit reindex the position of each gender would depend on which one is
# more frequent in this subset. A gender missing from the subset gets 0.0.
gender_freq_zeros = np.array(
    predicted_zeroes.x2_sex
    .value_counts(normalize=True)
    .reindex([2, 1], fill_value=0.0)
)

In [13]:
# Absolute gender counts among the class-0 predictions, as a sanity check
# for the normalised frequencies.
predicted_zeroes.x2_sex.value_counts()

2    4897
1    3103
Name: x2_sex, dtype: int64

In [14]:
# Share of each gender among the rows predicted as class 1, in the fixed
# order [female (code 2), male (code 1)].
# value_counts() orders its result by descending frequency, so without the
# explicit reindex the position of each gender would depend on which one is
# more frequent in this subset. A gender missing from the subset gets 0.0.
gender_freq_ones = np.array(
    predicted_ones.x2_sex
    .value_counts(normalize=True)
    .reindex([2, 1], fill_value=0.0)
)

In [15]:
# One two-element frequency vector per predicted class:
#   freq_list[0] -> class-0 predictions, freq_list[1] -> class-1 predictions.
# Intended element order is [female, male]; that order is inherited from how
# gender_freq_zeros / gender_freq_ones were built, so it holds only if both
# source arrays already follow it.
freq_list = [
    np.array([freqs[0], freqs[1]])
    for freqs in (gender_freq_zeros, gender_freq_ones)
]

In [16]:
# Inspect the two gender-frequency vectors.
freq_list

[array([0.612125, 0.387875]), array([0.551, 0.449])]

In [17]:
# Distance between the gender distributions of the two predicted classes.
# For the recorded vectors the result equals |0.612125 - 0.551|.
dis.compute_distance_between_frequencies(freq_list)

0.061124999999999985

In [18]:
# Test of the function from BiasDetector
bd = BiasDetector()

In [19]:
# Run the conditioned-group comparison of x2_sex vs. predictions, conditioning
# on x3_education. The result is a dict keyed by education value, each entry a
# (number, bool) pair.
# NOTE(review): in the recorded output the bool is True exactly when the number
# is <= 0.1, so the last positional argument is presumably a distance
# threshold -- confirm against BiasDetector.
bd.compare_binary_variable_conditioned_groups(df_with_predictions,
                                              'predictions',
                                              'x2_sex',
                                              'x3_education',
                                              0.1)

{1: (0.06643323350994396, True),
 3: (0.06406963470319638, True),
 2: (0.062267282715836814, True),
 4: (0.6, False),
 5: (0.37500000000000006, False),
 6: (0.08333333333333337, True),
 0: (1.0, False)}

In [20]:
# Same comparison of x2_sex vs. predictions, this time conditioning on
# x4_marriage. The result is a dict keyed by marriage value, each entry a
# (number, bool) pair.
# NOTE(review): 0.1 is presumably the distance threshold behind the bool
# flag -- confirm against BiasDetector.
bd.compare_binary_variable_conditioned_groups(df_with_predictions,
                                              'predictions',
                                              'x2_sex',
                                              'x4_marriage',
                                              0.1)

{1: (0.04541704252931733, True),
 2: (0.07753412294068796, True),
 3: (0.11144578313253017, False),
 0: (0.8333333333333334, False)}

In [21]:
# List the columns of the feature frame.
# NOTE(review): the recorded output includes 'id' and
# 'y_default_payment_next_month' among X's columns -- confirm they are removed
# before the data reaches the classifier.
X.columns

Index(['id', 'x1_limit_bal', 'x2_sex', 'x3_education', 'x4_marriage', 'x5_age',
       'x6_pay_0', 'x7_pay_2', 'x8_pay_3', 'x9_pay_4', 'x10_pay_5',
       'x11_pay_6', 'x12_bill_amt1', 'x13_bill_amt2', 'x14_bill_amt3',
       'x15_bill_amt4', 'x16_bill_amt5', 'x17_bill_amt6', 'x18_pay_amt1',
       'x19_pay_amt2', 'x20_pay_amt3', 'x21_pay_amt4', 'x22_pay_amt5',
       'x23_pay_amt6', 'y_default_payment_next_month'],
      dtype='object')

In [22]:
# NOTE(review): identical to the earlier call conditioned on x4_marriage (same
# arguments, same recorded output); this cell looks like a leftover re-run.
bd.compare_binary_variable_conditioned_groups(df_with_predictions,
                                              'predictions',
                                              'x2_sex',
                                              'x4_marriage',
                                              0.1)

{1: (0.04541704252931733, True),
 2: (0.07753412294068796, True),
 3: (0.11144578313253017, False),
 0: (0.8333333333333334, False)}

In [25]:
# Scratch setup for building per-group query strings: two categorical
# columns and the distinct values each one takes in the prediction frame.
v1, v2 = 'x2_sex', 'x4_marriage'

v1_labels, v2_labels = (
    df_with_predictions[column].unique() for column in (v1, v2)
)

In [31]:
# Prototype of a query-condition string for the first label of each column.
f'{v1}=={v1_labels[0]} & {v2}=={v2_labels[0]}'

'x2_sex==2 & x4_marriage==1'

In [32]:
from sklearn.utils.extmath import cartesian

In [37]:
# Every (v1 label, v2 label) pair, one pair per row.
cc = cartesian([v1_labels, v2_labels])

In [41]:
# Second name for the same object (no copy is made); the cells below use it.
dataframe = df_with_predictions

In [40]:
# Print the condition string for every label pair.
# The loop variable `c` stays bound in the notebook namespace afterwards and
# is inspected in a later cell.
for c in cc:
    print(f'{v1}=={c[0]} & {v2}=={c[1]}')

x2_sex==2 & x4_marriage==1
x2_sex==2 & x4_marriage==2
x2_sex==2 & x4_marriage==3
x2_sex==2 & x4_marriage==0
x2_sex==1 & x4_marriage==1
x2_sex==1 & x4_marriage==2
x2_sex==1 & x4_marriage==3
x2_sex==1 & x4_marriage==0


In [43]:
# Inspect the leftover loop variable.
# NOTE(review): the recorded value ([2, 1]) is not the last row of `cc`, which
# suggests this output comes from a different execution order -- re-run from a
# fresh kernel to confirm.
c

array([2, 1])

In [67]:
# Generalise the two-column prototype to an arbitrary list of conditioning
# columns: build one query string per combination of observed values and
# select the matching rows.
conditioning_variables = ['x2_sex', 'x4_marriage']
combinations = cartesian([dataframe[v].unique() for v in conditioning_variables])
for comb in combinations:
    # Pair each column name with its value in this combination.
    condition = " & ".join(
        f'{name}=={value}' for name, value in zip(conditioning_variables, comb)
    )
    # Rebound on every pass: after the loop only the subset for the last
    # combination is left.
    dataframe_subset = dataframe.query(condition)

In [69]:
# Condition string left bound by the final loop iteration.
condition

'x2_sex==1 & x4_marriage==0'

In [68]:
# Inspect the generated value combinations (one row per combination).
combinations

array([[2, 1],
       [2, 2],
       [2, 3],
       [2, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 0]])

In [61]:
# Rows selected by the most recently evaluated condition.
dataframe_subset

Unnamed: 0,id,x1_limit_bal,x2_sex,x3_education,x4_marriage,x5_age,x6_pay_0,x7_pay_2,x8_pay_3,x9_pay_4,...,x16_bill_amt5,x17_bill_amt6,x18_pay_amt1,x19_pay_amt2,x20_pay_amt3,x21_pay_amt4,x22_pay_amt5,x23_pay_amt6,y_default_payment_next_month,predictions
7113,3057,200000,1,3,0,36,-2,-2,-2,-2,...,9140,0,89187,14178,41359,9140,0,0,False,0
7513,11926,140000,1,3,0,47,0,0,0,0,...,49047,38796,2536,5017,10012,10013,3010,6015,False,0
8606,9089,50000,1,3,0,50,0,0,0,-1,...,2769,0,2266,0,3400,2769,1700,0,False,0


In [54]:
# Scratch check: enumerate over one combination yields (position, value)
# tuples. The loop variable `i` stays bound in the notebook namespace.
for i in enumerate(combinations[0]):
    print(i)

(0, 2)
(1, 1)


In [52]:
# Scratch check: enumerate over the whole array yields
# (row position, combination) pairs.
list(enumerate(combinations))

[(0, array([2, 1])),
 (1, array([2, 2])),
 (2, array([2, 3])),
 (3, array([2, 0])),
 (4, array([1, 1])),
 (5, array([1, 2])),
 (6, array([1, 3])),
 (7, array([1, 0]))]

In [71]:
# Subset of the prediction frame: x2_sex == 2 (female) and
# x3_education == 2 (university).
aaa = df_with_predictions.query("x2_sex==2 & x3_education==2")

In [73]:
# Call get_frequencies_list on the subset with 'predictions', 'x2_sex' and the
# x2_sex labels observed in the full prediction frame.
# NOTE(review): `aaa` only contains x2_sex == 2 rows, which is consistent with
# the degenerate [1., 0.] vectors in the recorded output.
bd.get_frequencies_list(aaa, 'predictions', 'x2_sex', df_with_predictions.x2_sex.unique())

[array([1., 0.]), array([1., 0.])]

In [74]:
# Number of rows per predicted class within the subset.
aaa.predictions.value_counts()

0    2345
1     315
Name: predictions, dtype: int64

In [81]:
# Scratch check: slicing with a stop equal to the list length returns every element.
[1, 2, 3][:3]

[1, 2, 3]