In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler

from aif360.sklearn.datasets import standardize_dataset
from aif360.sklearn.metrics import statistical_parity_difference, disparate_impact_ratio,\
                                   equal_opportunity_difference, average_odds_difference, \
                                   generalized_entropy_index, theil_index


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'


In [2]:
loandata = pd.read_csv('../LoanData_prepared.csv')

In [3]:
df = pd.DataFrame(loandata)
df.shape

(241909, 21)

In [4]:
df.isnull().sum()

AppliedAmount                        0
DebtToIncome                         0
AmountOfPreviousLoansBeforeLoan      0
Country                              0
Education                            0
EmploymentDurationCurrentEmployer    0
ExistingLiabilities                  0
FreeCash                             0
Gender                               0
HomeOwnershipType                    0
IncomeTotal                          0
Interest                             0
LiabilitiesTotal                     0
LoanDuration                         0
MonthlyPayment                       0
NewCreditCustomer                    0
NoOfPreviousLoansBeforeLoan          0
VerificationType                     0
LanguageCode                         0
Default                              0
Age_Group                            0
dtype: int64

In [5]:
protected = ['Gender_Undefined', 'Country_EE', 'Country_FI', 'Country_ES', 'Age_Group_61-70', 'Education_Basic education', 'Education_Primary education', 'LanguageCode_Estonian',
          'LanguageCode_Finnish', 'LanguageCode_Spanish']

In [6]:
priv_group_dict = {'Gender_Undefined':0,
                   'Country_EE':1,
                   'Country_FI':0,
                   'Country_ES':0,
                   'Age_Group_61-70':0,
                   'Education_Basic education':1,
                   'Education_Primary education':0,
                   'LanguageCode_Estonian':1,
                   'LanguageCode_Finnish':0,
                   'LanguageCode_Spanish':0}

# Baseline Model Training

In [7]:
df = pd.get_dummies(df)

X, y = standardize_dataset(df, prot_attr=protected, target='Default')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

scaler=StandardScaler()

# Select numerical features from training data
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

# Fit scaler to training data
scaler.fit(X_train[numerical_features])

# Transform training and testing data using scaler
X_train = scaler.transform(X_train[numerical_features])
X_test = scaler.transform(X_test[numerical_features])

In [8]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,AppliedAmount,DebtToIncome,AmountOfPreviousLoansBeforeLoan,ExistingLiabilities,FreeCash,IncomeTotal,Interest,LiabilitiesTotal,LoanDuration,MonthlyPayment,...,LanguageCode_Finnish,LanguageCode_German,LanguageCode_Russian,LanguageCode_Slovakian,LanguageCode_Spanish,Age_Group_18-30,Age_Group_31-40,Age_Group_41-50,Age_Group_51-60,Age_Group_61-70
Gender_Undefined,Country_EE,Country_FI,Country_ES,Age_Group_61-70,Education_Basic education,Education_Primary education,LanguageCode_Estonian,LanguageCode_Finnish,LanguageCode_Spanish,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
False,True,False,False,False,False,False,True,False,False,3189.0,0.00,3402.0,0,0.00,900.0,35.48,0.00,60,115.34,...,False,False,False,False,False,True,False,False,False,False
False,False,True,False,False,False,False,False,True,False,4146.0,0.00,518.0,0,0.00,3100.0,19.86,0.00,60,108.17,...,True,False,False,False,False,False,True,False,False,False
False,True,False,False,False,False,False,True,False,False,531.0,0.00,1593.0,0,0.00,1700.0,19.52,0.00,60,15.25,...,False,False,False,False,False,True,False,False,False,False
False,True,False,False,False,False,False,True,False,False,2125.0,26.29,500.0,8,10.92,354.0,20.97,485.09,60,62.05,...,False,False,False,False,False,False,False,False,True,False
False,False,True,False,False,False,False,False,True,False,518.0,0.00,19283.0,0,0.00,4800.0,19.88,0.00,60,13.51,...,True,False,False,False,False,False,True,False,False,False
False,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
False,True,False,False,False,False,False,False,False,False,3000.0,25.35,0.0,3,11.29,350.0,21.62,359.00,60,88.71,...,False,False,True,False,False,False,False,False,True,False
False,True,False,False,True,False,False,True,False,False,1300.0,56.59,8000.0,8,116.38,867.0,16.38,804.20,48,41.42,...,False,False,False,False,False,False,False,False,False,True
False,False,True,False,False,False,False,False,True,False,4146.0,0.00,4146.0,0,0.00,2600.0,19.86,0.00,60,108.17,...,True,False,False,False,False,False,True,False,False,False
False,True,False,False,False,False,False,False,False,False,531.0,0.00,6373.0,0,0.00,900.0,16.22,0.00,60,14.44,...,False,False,True,False,False,False,True,False,False,False


In [9]:
BAG = BaggingClassifier(n_estimators=50, estimator=RandomForestClassifier(n_estimators=25, max_depth=35,random_state=42), random_state=42, warm_start=True, verbose=2)


BAG.fit(X_train, y_train) 
y_pred = BAG.predict(X_test)

Building estimator 1 of 50 for this parallel run (total 50)...
Building estimator 2 of 50 for this parallel run (total 50)...
Building estimator 3 of 50 for this parallel run (total 50)...
Building estimator 4 of 50 for this parallel run (total 50)...
Building estimator 5 of 50 for this parallel run (total 50)...
Building estimator 6 of 50 for this parallel run (total 50)...
Building estimator 7 of 50 for this parallel run (total 50)...
Building estimator 8 of 50 for this parallel run (total 50)...
Building estimator 9 of 50 for this parallel run (total 50)...
Building estimator 10 of 50 for this parallel run (total 50)...
Building estimator 11 of 50 for this parallel run (total 50)...
Building estimator 12 of 50 for this parallel run (total 50)...
Building estimator 13 of 50 for this parallel run (total 50)...
Building estimator 14 of 50 for this parallel run (total 50)...
Building estimator 15 of 50 for this parallel run (total 50)...
Building estimator 16 of 50 for this parallel run

In [10]:
print('In-Sample Accuracy:', BAG.score(X_train, y_train))
print('Out-of-Sample Accuracy:', BAG.score(X_test, y_test))

In-Sample Accuracy: 0.9653267070246833
Out-of-Sample Accuracy: 0.6589227398619322


In [11]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.75      0.72     14033
           1       0.61      0.54      0.57     10158

    accuracy                           0.66     24191
   macro avg       0.65      0.64      0.64     24191
weighted avg       0.65      0.66      0.66     24191



In [12]:
y_test_np = y_test.to_numpy()

b = np.array([y_pred[i] - y_test_np[i] + 1 for i in range(len(y_pred) - 1)])

print(b)

[1 1 2 ... 1 0 1]


In [13]:
SPD = []
DI = []
EqualOpp = []
AverageOdds = []
GEI = []
Theil = []

for attr in protected:
    print(attr)
    spd_score = statistical_parity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    SPD.append(spd_score)
    di_score = disparate_impact_ratio(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    DI.append(di_score)
    equalopp = equal_opportunity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    EqualOpp.append(equalopp)
    averageodd = average_odds_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    AverageOdds.append(averageodd)
    gei = generalized_entropy_index(b=b, alpha=2)
    GEI.append(gei)
    theil = generalized_entropy_index(b=b, alpha=1)
    Theil.append(theil)


Gender_Undefined
Country_EE
Country_FI
Country_ES
Age_Group_61-70
Education_Basic education
Education_Primary education
LanguageCode_Estonian
LanguageCode_Finnish
LanguageCode_Spanish


In [14]:
df_bias = pd.DataFrame({'Protected_feature':protected,'Statistical_Parity':SPD,'Disparate_Impact':DI, 'Equal Opportunity difference':EqualOpp, 'Equalized Odds difference': AverageOdds, 'GEI':GEI, 'Theil':Theil})
df_bias['DI_normal']=df_bias["Disparate_Impact"].apply(lambda x: 1/x if x < 1 else x)
df_bias['SPD_normal']=df_bias["Statistical_Parity"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoppD_normal']=df_bias["Equal Opportunity difference"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoddsD_normal']=df_bias["Equalized Odds difference"].apply(lambda x: abs(x) if x < 0 else x)


In [15]:
df_bias.loc[:,['Protected_feature', 'DI_normal', 'SPD_normal', 'EoppD_normal', 'EoddsD_normal', 'GEI', 'Theil']].head(10)

Unnamed: 0,Protected_feature,DI_normal,SPD_normal,EoppD_normal,EoddsD_normal,GEI,Theil
0,Gender_Undefined,2.134275,0.394284,0.267175,0.354783,0.186623,0.262134
1,Country_EE,2.026006,0.249462,0.200009,0.204633,0.186623,0.262134
2,Country_FI,1.341465,0.112141,0.067708,0.084627,0.186623,0.262134
3,Country_ES,2.003973,0.333096,0.20601,0.291904,0.186623,0.262134
4,Age_Group_61-70,1.209859,0.07714,0.042974,0.052802,0.186623,0.262134
5,Education_Basic education,1.978878,0.186363,0.101645,0.131785,0.186623,0.262134
6,Education_Primary education,1.239499,0.08667,0.037398,0.057269,0.186623,0.262134
7,LanguageCode_Estonian,1.988645,0.227974,0.193404,0.186194,0.186623,0.262134
8,LanguageCode_Finnish,1.326737,0.108058,0.063963,0.081521,0.186623,0.262134
9,LanguageCode_Spanish,2.002227,0.332801,0.20582,0.291735,0.186623,0.262134
