In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler

from aif360.sklearn.datasets import standardize_dataset
from aif360.sklearn.metrics import statistical_parity_difference, disparate_impact_ratio,\
                                   equal_opportunity_difference, average_odds_difference, \
                                   generalized_entropy_index, theil_index
from aif360.sklearn.preprocessing import Reweighing
from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta, RejectOptionClassifier



pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'
pip install 'aif360[FairAdapt]'


In [2]:
loandata = pd.read_csv('../LoanData_prepared.csv')

In [3]:
df = pd.DataFrame(loandata)
df.shape

(241909, 21)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241909 entries, 0 to 241908
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   AppliedAmount                      241909 non-null  float64
 1   DebtToIncome                       241909 non-null  float64
 2   AmountOfPreviousLoansBeforeLoan    241909 non-null  float64
 3   Country                            241909 non-null  object 
 4   Education                          241909 non-null  object 
 5   EmploymentDurationCurrentEmployer  241909 non-null  object 
 6   ExistingLiabilities                241909 non-null  int64  
 7   FreeCash                           241909 non-null  float64
 8   Gender                             241909 non-null  object 
 9   HomeOwnershipType                  241909 non-null  object 
 10  IncomeTotal                        241909 non-null  float64
 11  Interest                           2419

In [5]:
df.isnull().sum()

AppliedAmount                        0
DebtToIncome                         0
AmountOfPreviousLoansBeforeLoan      0
Country                              0
Education                            0
EmploymentDurationCurrentEmployer    0
ExistingLiabilities                  0
FreeCash                             0
Gender                               0
HomeOwnershipType                    0
IncomeTotal                          0
Interest                             0
LiabilitiesTotal                     0
LoanDuration                         0
MonthlyPayment                       0
NewCreditCustomer                    0
NoOfPreviousLoansBeforeLoan          0
VerificationType                     0
LanguageCode                         0
Default                              0
Age_Group                            0
dtype: int64

In [6]:
protected = ['Education_Basic education']

In [7]:
priv_group_dict = {'Education_Basic education':1}

# Fair Model Training

In [8]:
df = pd.get_dummies(df)

X, y = standardize_dataset(df, prot_attr=protected, target='Default')

#Initialize Reweighing
RW = Reweighing(prot_attr=protected)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Do the reweighing on the data
# Sample weights are saved in new variable sample_weights!
X_train, sample_weights = RW.fit_transform(X_train, y_train)

scaler=StandardScaler().set_output(transform='pandas')

# Select numerical features from training data
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

# Fit scaler to training data
scaler.fit(X_train[numerical_features])

# Transform training and testing data using scaler
X_train = scaler.transform(X_train[numerical_features])
X_test = scaler.transform(X_test[numerical_features])

In [9]:
X

Unnamed: 0_level_0,AppliedAmount,DebtToIncome,AmountOfPreviousLoansBeforeLoan,ExistingLiabilities,FreeCash,IncomeTotal,Interest,LiabilitiesTotal,LoanDuration,MonthlyPayment,...,LanguageCode_Finnish,LanguageCode_German,LanguageCode_Russian,LanguageCode_Slovakian,LanguageCode_Spanish,Age_Group_18-30,Age_Group_31-40,Age_Group_41-50,Age_Group_51-60,Age_Group_61-70
Education_Basic education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,3189.0,0.00,3402.0,0,0.00,900.0,35.48,0.00,60,115.34,...,False,False,False,False,False,True,False,False,False,False
False,4146.0,0.00,518.0,0,0.00,3100.0,19.86,0.00,60,108.17,...,True,False,False,False,False,False,True,False,False,False
False,531.0,0.00,1593.0,0,0.00,1700.0,19.52,0.00,60,15.25,...,False,False,False,False,False,True,False,False,False,False
False,2125.0,26.29,500.0,8,10.92,354.0,20.97,485.09,60,62.05,...,False,False,False,False,False,False,False,False,True,False
False,518.0,0.00,19283.0,0,0.00,4800.0,19.88,0.00,60,13.51,...,True,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
False,3000.0,25.35,0.0,3,11.29,350.0,21.62,359.00,60,88.71,...,False,False,True,False,False,False,False,False,True,False
False,1300.0,56.59,8000.0,8,116.38,867.0,16.38,804.20,48,41.42,...,False,False,False,False,False,False,False,False,False,True
False,4146.0,0.00,4146.0,0,0.00,2600.0,19.86,0.00,60,108.17,...,True,False,False,False,False,False,True,False,False,False
False,531.0,0.00,6373.0,0,0.00,900.0,16.22,0.00,60,14.44,...,False,False,True,False,False,False,True,False,False,False


## Reweighing pre- & in-processing

In [10]:
for p in protected:
      print('Attribute', p)
      print('Statistical Parity Difference (SPD):\t\t\t%.4f' %\
            statistical_parity_difference(y_train, prot_attr=p,  priv_group=priv_group_dict[p], sample_weight=sample_weights))
      print('Disparate Impact (DI):\t\t\t\t\t%.4f' %\
            disparate_impact_ratio(y_train, prot_attr=p, priv_group=priv_group_dict[p], sample_weight=sample_weights))


Attribute Education_Basic education
Statistical Parity Difference (SPD):			0.0000
Disparate Impact (DI):					1.0000


In [11]:
BAG = BaggingClassifier(n_estimators=50, estimator=RandomForestClassifier(n_estimators=25, max_depth=35,random_state=42), random_state=42, warm_start=True, verbose=2)

BAG.fit(X_train, y_train, sample_weight=sample_weights)
print("Out: ",accuracy_score(y_test, BAG.predict(X_test)))
print("In: ",accuracy_score(y_train, BAG.predict(X_train)))


Building estimator 1 of 50 for this parallel run (total 50)...


Building estimator 2 of 50 for this parallel run (total 50)...
Building estimator 3 of 50 for this parallel run (total 50)...
Building estimator 4 of 50 for this parallel run (total 50)...
Building estimator 5 of 50 for this parallel run (total 50)...
Building estimator 6 of 50 for this parallel run (total 50)...
Building estimator 7 of 50 for this parallel run (total 50)...
Building estimator 8 of 50 for this parallel run (total 50)...
Building estimator 9 of 50 for this parallel run (total 50)...
Building estimator 10 of 50 for this parallel run (total 50)...
Building estimator 11 of 50 for this parallel run (total 50)...
Building estimator 12 of 50 for this parallel run (total 50)...
Building estimator 13 of 50 for this parallel run (total 50)...
Building estimator 14 of 50 for this parallel run (total 50)...
Building estimator 15 of 50 for this parallel run (total 50)...
Building estimator 16 of 50 for this parallel run (total 50)...
Building estimator 17 of 50 for this parallel ru

In [12]:
y_pred = BAG.predict(X_test)

In [13]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.75      0.72     14033
           1       0.61      0.54      0.57     10158

    accuracy                           0.66     24191
   macro avg       0.65      0.64      0.64     24191
weighted avg       0.66      0.66      0.66     24191



In [14]:
y_test_np = y_test.to_numpy()

b = np.array([y_pred[i] - y_test_np[i] + 1 for i in range(len(y_pred) - 1)])

print(b)

[1 1 2 ... 1 0 1]


In [15]:
SPD = []
DI = []
EqualOpp = []
AverageOdds = []
GEI = []
Theil = []

for attr in protected:
    print(attr)
    spd_score = statistical_parity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    SPD.append(spd_score)
    di_score = disparate_impact_ratio(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    DI.append(di_score)
    equalopp = equal_opportunity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    EqualOpp.append(equalopp)
    averageodd = average_odds_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    AverageOdds.append(averageodd)
    gei = generalized_entropy_index(b=b, alpha=2)
    GEI.append(gei)
    theil = generalized_entropy_index(b=b, alpha=1)
    Theil.append(theil)


df_bias = pd.DataFrame({'Protected_feature':protected,'Statistical_Parity':SPD,'Disparate_Impact':DI, 'Equal Opportunity difference':EqualOpp, 'Equalized Odds difference': AverageOdds, 'GEI':GEI, 'Theil':Theil})
df_bias['DI_normal']=df_bias["Disparate_Impact"].apply(lambda x: 1/x if x < 1 else x)
df_bias['SPD_normal']=df_bias["Statistical_Parity"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoppD_normal']=df_bias["Equal Opportunity difference"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoddsD_normal']=df_bias["Equalized Odds difference"].apply(lambda x: abs(x) if x < 0 else x)

Education_Basic education


In [16]:
df_bias.loc[:,['Protected_feature', 'DI_normal', 'SPD_normal', 'EoppD_normal', 'EoddsD_normal', 'GEI', 'Theil']].head(10)

Unnamed: 0,Protected_feature,DI_normal,SPD_normal,EoppD_normal,EoddsD_normal,GEI,Theil
0,Education_Basic education,1.91723,0.179918,0.080412,0.12073,0.186555,0.262097


## Initialize model

In [17]:
ROC = RejectOptionClassifier(prot_attr=protected, threshold=0.1, margin=0.1)

postproc = PostProcessingMeta(estimator=BaggingClassifier(n_estimators=50, estimator=RandomForestClassifier(n_estimators=25, max_depth=35,random_state=42), random_state=42, warm_start=True, verbose=2), 
                              postprocessor=ROC, random_state=42, val_size=0.1)

postproc.fit(X_train, y_train)
print("Out: ",accuracy_score(y_test, postproc.predict(X_test)))
print("In: ",accuracy_score(y_train, postproc.predict(X_train)))


Building estimator 1 of 50 for this parallel run (total 50)...
Building estimator 2 of 50 for this parallel run (total 50)...
Building estimator 3 of 50 for this parallel run (total 50)...
Building estimator 4 of 50 for this parallel run (total 50)...
Building estimator 5 of 50 for this parallel run (total 50)...
Building estimator 6 of 50 for this parallel run (total 50)...
Building estimator 7 of 50 for this parallel run (total 50)...
Building estimator 8 of 50 for this parallel run (total 50)...
Building estimator 9 of 50 for this parallel run (total 50)...
Building estimator 10 of 50 for this parallel run (total 50)...
Building estimator 11 of 50 for this parallel run (total 50)...
Building estimator 12 of 50 for this parallel run (total 50)...
Building estimator 13 of 50 for this parallel run (total 50)...
Building estimator 14 of 50 for this parallel run (total 50)...
Building estimator 15 of 50 for this parallel run (total 50)...
Building estimator 16 of 50 for this parallel run

In [18]:
y_pred = postproc.predict(X_test)

In [19]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.01      0.03     14033
           1       0.42      1.00      0.59     10158

    accuracy                           0.43     24191
   macro avg       0.67      0.51      0.31     24191
weighted avg       0.71      0.43      0.26     24191



In [20]:
y_test_np = y_test.to_numpy()

b = np.array([y_pred[i] - y_test_np[i] + 1 for i in range(len(y_pred) - 1)])

print(b)

[2 2 2 ... 1 1 2]


In [21]:
SPD = []
DI = []
EqualOpp = []
AverageOdds = []
GEI = []
Theil = []

for attr in protected:
    print(attr)
    spd_score = statistical_parity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    SPD.append(spd_score)
    di_score = disparate_impact_ratio(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    DI.append(di_score)
    equalopp = equal_opportunity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    EqualOpp.append(equalopp)
    averageodd = average_odds_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    AverageOdds.append(averageodd)
    gei = generalized_entropy_index(b=b, alpha=2)
    GEI.append(gei)
    theil = generalized_entropy_index(b=b, alpha=1)
    Theil.append(theil)


Education_Basic education


In [22]:
df_bias = pd.DataFrame({'Protected_feature':protected,'Statistical_Parity':SPD,'Disparate_Impact':DI, 'Equal Opportunity difference':EqualOpp, 'Equalized Odds difference': AverageOdds, 'GEI':GEI, 'Theil':Theil})
df_bias['DI_normal']=df_bias["Disparate_Impact"].apply(lambda x: 1/x if x < 1 else x)
df_bias['SPD_normal']=df_bias["Statistical_Parity"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoppD_normal']=df_bias["Equal Opportunity difference"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoddsD_normal']=df_bias["Equalized Odds difference"].apply(lambda x: abs(x) if x < 0 else x)


In [23]:
df_bias.loc[:,['Protected_feature', 'DI_normal', 'SPD_normal', 'EoppD_normal', 'EoddsD_normal', 'GEI', 'Theil']].head(10)

Unnamed: 0,Protected_feature,DI_normal,SPD_normal,EoppD_normal,EoddsD_normal,GEI,Theil
0,Education_Basic education,1.61477,0.380685,0.104167,0.295361,0.049803,0.05264
