In [108]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler

from aif360.sklearn.datasets import standardize_dataset
from aif360.sklearn.metrics import statistical_parity_difference, disparate_impact_ratio,\
                                   equal_opportunity_difference, average_odds_difference, \
                                   generalized_entropy_index, theil_index
from aif360.sklearn.preprocessing import Reweighing
from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta, RejectOptionClassifier
from aif360.sklearn.utils import check_inputs, check_groups



In [72]:
loandata = pd.read_csv('../LoanData_prepared.csv')

In [73]:
df = pd.DataFrame(loandata)
df.shape

(241909, 21)

In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241909 entries, 0 to 241908
Data columns (total 61 columns):
 #   Column                                                         Non-Null Count   Dtype  
---  ------                                                         --------------   -----  
 0   AppliedAmount                                                  241909 non-null  float64
 1   DebtToIncome                                                   241909 non-null  float64
 2   AmountOfPreviousLoansBeforeLoan                                241909 non-null  float64
 3   ExistingLiabilities                                            241909 non-null  int64  
 4   FreeCash                                                       241909 non-null  float64
 5   IncomeTotal                                                    241909 non-null  float64
 6   Interest                                                       241909 non-null  float64
 7   LiabilitiesTotal                               

In [76]:
df.isnull().sum()

AppliedAmount                        0
DebtToIncome                         0
AmountOfPreviousLoansBeforeLoan      0
Country                              0
Education                            0
EmploymentDurationCurrentEmployer    0
ExistingLiabilities                  0
FreeCash                             0
Gender                               0
HomeOwnershipType                    0
IncomeTotal                          0
Interest                             0
LiabilitiesTotal                     0
LoanDuration                         0
MonthlyPayment                       0
NewCreditCustomer                    0
NoOfPreviousLoansBeforeLoan          0
VerificationType                     0
LanguageCode                         0
Default                              0
Age_Group                            0
dtype: int64

In [115]:
protected = ['Education_Basic education', 'Education_Primary education']

In [6]:
priv_group_dict = {'Education_Basic education':1,
                   'Education_Primary education':0}

# Fair Model Training

In [111]:
df = pd.get_dummies(df)



X, y = standardize_dataset(df, prot_attr=protected, target='Default')

#Initialize Reweighing
RW = Reweighing(prot_attr=protected)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Do the reweighing on the data
# Sample weights are saved in new variable sample_weights!
X_train, sample_weights = RW.fit_transform(X_train, y_train)

scaler=StandardScaler().set_output(transform='pandas')

# Select numerical features from training data
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

# Fit scaler to training data
scaler.fit(X_train[numerical_features])

# Transform training and testing data using scaler
X_train = scaler.transform(X_train[numerical_features])
X_test = scaler.transform(X_test[numerical_features])

In [109]:
check_inputs(X_train, y_train, sample_weights, ensure_2d=True)

(                                                                                                                                                                AppliedAmount  \
 Education_Basic education Education_Primary education Education_Higher education Education_Secondary education Education_Vocational education Gender_Undefined                  
 False                     False                       False                      False                         True                           False                  0.710542   
                                                       True                       False                         False                          False                  1.193435   
                                                                                                                                               True                   0.229462   
                           True                        False                      False                       

In [119]:
check_groups(X_train, protected, ensure_binary=True)

ValueError: Expected 2 protected attribute groups, got 3

## See if Reweighing cleared the bias on the dataset

In [9]:
for p in protected:
      print('Attribute', p)
      print('Statistical Parity Difference (SPD):\t\t\t%.4f' %\
            statistical_parity_difference(y_train, prot_attr=p,  priv_group=priv_group_dict[p], sample_weight=sample_weights))
      print('Disparate Impact (DI):\t\t\t\t\t%.4f' %\
            disparate_impact_ratio(y_train, prot_attr=p, priv_group=priv_group_dict[p], sample_weight=sample_weights))


Attribute Education_Basic education
Statistical Parity Difference (SPD):			-0.0000
Disparate Impact (DI):					1.0000
Attribute Education_Primary education
Statistical Parity Difference (SPD):			-0.0000
Disparate Impact (DI):					1.0000


In [10]:
BAG = BaggingClassifier(n_estimators=50, estimator=RandomForestClassifier(n_estimators=25, max_depth=35,random_state=42), random_state=42, warm_start=True, verbose=2)

BAG.fit(X_train, y_train, sample_weight=sample_weights)
print("Out: ",accuracy_score(y_test, BAG.predict(X_test)))
print("In: ",accuracy_score(y_train, BAG.predict(X_train)))


Building estimator 1 of 50 for this parallel run (total 50)...
Building estimator 2 of 50 for this parallel run (total 50)...
Building estimator 3 of 50 for this parallel run (total 50)...
Building estimator 4 of 50 for this parallel run (total 50)...
Building estimator 5 of 50 for this parallel run (total 50)...
Building estimator 6 of 50 for this parallel run (total 50)...
Building estimator 7 of 50 for this parallel run (total 50)...
Building estimator 8 of 50 for this parallel run (total 50)...
Building estimator 9 of 50 for this parallel run (total 50)...
Building estimator 10 of 50 for this parallel run (total 50)...
Building estimator 11 of 50 for this parallel run (total 50)...
Building estimator 12 of 50 for this parallel run (total 50)...
Building estimator 13 of 50 for this parallel run (total 50)...
Building estimator 14 of 50 for this parallel run (total 50)...
Building estimator 15 of 50 for this parallel run (total 50)...
Building estimator 16 of 50 for this parallel run

In [11]:
y_pred = BAG.predict(X_test)

In [12]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.75      0.72     14033
           1       0.61      0.54      0.57     10158

    accuracy                           0.66     24191
   macro avg       0.65      0.64      0.64     24191
weighted avg       0.65      0.66      0.66     24191



In [13]:
y_test_np = y_test.to_numpy()

b = np.array([y_pred[i] - y_test_np[i] + 1 for i in range(len(y_pred) - 1)])

print(b)

[1 1 2 ... 1 0 1]


In [14]:
SPD = []
DI = []
EqualOpp = []
AverageOdds = []
GEI = []
Theil = []

for attr in protected:
    print(attr)
    spd_score = statistical_parity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    SPD.append(spd_score)
    di_score = disparate_impact_ratio(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    DI.append(di_score)
    equalopp = equal_opportunity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    EqualOpp.append(equalopp)
    averageodd = average_odds_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    AverageOdds.append(averageodd)
    gei = generalized_entropy_index(b=b, alpha=2)
    GEI.append(gei)
    theil = generalized_entropy_index(b=b, alpha=1)
    Theil.append(theil)


df_bias = pd.DataFrame({'Protected_feature':protected,'Statistical_Parity':SPD,'Disparate_Impact':DI, 'Equal Opportunity difference':EqualOpp, 'Equalized Odds difference': AverageOdds, 'GEI':GEI, 'Theil':Theil})
df_bias['DI_normal']=df_bias["Disparate_Impact"].apply(lambda x: 1/x if x < 1 else x)
df_bias['SPD_normal']=df_bias["Statistical_Parity"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoppD_normal']=df_bias["Equal Opportunity difference"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoddsD_normal']=df_bias["Equalized Odds difference"].apply(lambda x: abs(x) if x < 0 else x)

Education_Basic education
Education_Primary education


In [15]:
df_bias.loc[:,['Protected_feature', 'DI_normal', 'SPD_normal', 'EoppD_normal', 'EoddsD_normal', 'GEI', 'Theil']].head(10)

Unnamed: 0,Protected_feature,DI_normal,SPD_normal,EoppD_normal,EoddsD_normal,GEI,Theil
0,Education_Basic education,1.892004,0.176685,0.071071,0.115803,0.187342,0.263256
1,Education_Primary education,1.235711,0.084931,0.034854,0.055623,0.187342,0.263256


## Initialize model

In [16]:
type(X_train)

pandas.core.frame.DataFrame

In [57]:
ROC = RejectOptionClassifier(prot_attr=protected, threshold=0.1, margin=0.1)

postproc = PostProcessingMeta(estimator=BaggingClassifier(n_estimators=5, estimator=RandomForestClassifier(n_estimators=25, max_depth=35,random_state=42), random_state=42, warm_start=True, verbose=2), 
                              postprocessor=ROC, random_state=42, val_size=0.1)

postproc.fit(X_train, y_train)
print("Out: ",accuracy_score(y_test, postproc.predict(X_test)))
print("In: ",accuracy_score(y_train, postproc.predict(X_train)))


Building estimator 1 of 5 for this parallel run (total 5)...
Building estimator 2 of 5 for this parallel run (total 5)...
Building estimator 3 of 5 for this parallel run (total 5)...
Building estimator 4 of 5 for this parallel run (total 5)...
Building estimator 5 of 5 for this parallel run (total 5)...


ValueError: Expected 2 protected attribute groups, got 3

In [82]:
ROC = RejectOptionClassifier(prot_attr=['Education_Basic education'], threshold=0.1, margin=0.1)

ROC.fit(X_train, y_train)
print("Out: ",accuracy_score(y_test, ROC.predict(X_test)))
print("In: ",accuracy_score(y_train, ROC.predict(X_train)))

ValueError: X should contain one column per class. Got: 12 columns.

In [106]:
def check_groups(arr, prot_attr, ensure_binary=False):
    """Get groups from the index of arr.

    If there are multiple protected attributes provided, the index is flattened
    to be a 1-D Index of tuples. If ensure_binary is ``True``, raises a
    ValueError if there are not exactly two unique groups. Also checks that all
    provided protected attributes are in the index.

    Args:
        arr (array-like): Either a Pandas object containing protected attribute
            information in the index or array-like with explicit protected
            attribute array(s) for `prot_attr`.
        prot_attr (label or array-like or list of labels/arrays): Protected
            attribute(s). If contains labels, arr must include these in its
            index. If ``None``, all protected attributes in ``arr.index`` are
            used. Can also be 1D array-like of the same length as arr or a
            list of a combination of such arrays and labels in which case, arr
            may not necessarily be a Pandas type.
        ensure_binary (bool): Raise an error if the resultant groups are not
            binary.

    Returns:
        tuple:

            * **groups** (:class:`pandas.Index`) -- Label (or tuple of labels)
              of protected attribute for each sample in arr.
            * **prot_attr** (`FrozenList`) -- Modified input. If input is a
              single label, returns single-item list. If input is ``None``
              returns list of all protected attributes.
    """
    arr_is_pandas = isinstance(arr, (pd.DataFrame, pd.Series))
    if prot_attr is None:  # use all protected attributes provided in arr
        if not arr_is_pandas:
            raise TypeError("Expected `Series` or `DataFrame` for arr, got "
                           f"{type(arr).__name__} instead. Otherwise, pass "
                            "explicit prot_attr array(s).")
        groups = arr.index
    elif arr_is_pandas:
        df = arr.index.to_frame()
        groups = df.set_index(prot_attr).index  # let pandas handle errors
    else:  # arr isn't pandas. might be okay if prot_attr is array-like
        df = pd.DataFrame(index=[None]*len(arr))  # dummy to check lengths match
        try:
            groups = df.set_index(prot_attr).index
        except KeyError as e:
            raise TypeError("arr does not include protected attributes in the "
                            "index. Check if this got dropped or prot_attr is "
                            "formatted incorrectly.") from e
    prot_attr = groups.names
    groups = groups.to_flat_index()
    print(groups)

    n_unique = groups.nunique()
    if ensure_binary and n_unique != 2:
        raise ValueError("Expected 2 protected attribute groups, got "
                        f"{groups.unique() if n_unique > 5 else n_unique}")
                        

    return groups, prot_attr

In [107]:
check_groups(X_train, protected, ensure_binary=True)

Index([(False, False), (False, False), (False, False),  (False, True),
       (False, False), (False, False),  (False, True), (False, False),
       (False, False), (False, False),
       ...
       (False, False), (False, False),  (True, False), (False, False),
       (False, False), (False, False), (False, False), (False, False),
       (False, False),  (False, True)],
      dtype='object', length=217718)


ValueError: Expected 2 protected attribute groups, got 3

In [None]:
y_pred = postproc.predict(X_test)

In [None]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.00      0.00     14033
           1       0.42      1.00      0.59     10158

    accuracy                           0.42     24191
   macro avg       0.47      0.50      0.30     24191
weighted avg       0.48      0.42      0.25     24191



In [None]:
y_test_np = y_test.to_numpy()

b = np.array([y_pred[i] - y_test_np[i] + 1 for i in range(len(y_pred) - 1)])

print(b)

[2 2 2 ... 1 1 2]


In [None]:
SPD = []
DI = []
EqualOpp = []
AverageOdds = []
GEI = []
Theil = []

for attr in protected:
    print(attr)
    spd_score = statistical_parity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    SPD.append(spd_score)
    di_score = disparate_impact_ratio(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    DI.append(di_score)
    equalopp = equal_opportunity_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    EqualOpp.append(equalopp)
    averageodd = average_odds_difference(y_test, y_pred, prot_attr=attr, priv_group=priv_group_dict[attr])
    AverageOdds.append(averageodd)
    gei = generalized_entropy_index(b=b, alpha=2)
    GEI.append(gei)
    theil = generalized_entropy_index(b=b, alpha=1)
    Theil.append(theil)


Gender_Undefined


In [None]:
df_bias = pd.DataFrame({'Protected_feature':protected,'Statistical_Parity':SPD,'Disparate_Impact':DI, 'Equal Opportunity difference':EqualOpp, 'Equalized Odds difference': AverageOdds, 'GEI':GEI, 'Theil':Theil})
df_bias['DI_normal']=df_bias["Disparate_Impact"].apply(lambda x: 1/x if x < 1 else x)
df_bias['SPD_normal']=df_bias["Statistical_Parity"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoppD_normal']=df_bias["Equal Opportunity difference"].apply(lambda x: abs(x) if x < 0 else x)
df_bias['EoddsD_normal']=df_bias["Equalized Odds difference"].apply(lambda x: abs(x) if x < 0 else x)


In [None]:
df_bias.loc[:,['Protected_feature', 'DI_normal', 'SPD_normal', 'EoppD_normal', 'EoddsD_normal', 'GEI', 'Theil']].head(10)

Unnamed: 0,Protected_feature,DI_normal,SPD_normal,EoppD_normal,EoddsD_normal,GEI,Theil
0,Gender_Undefined,1.017507,0.017201,0.016427,0.017637,0.04918,0.052152
