In [10]:
%load_ext autoreload
%autoreload 2
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing, LFR, DisparateImpactRemover
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from aif360.algorithms.inprocessing import PrejudiceRemover
from aif360.algorithms.inprocessing.adversarial_debiasing import AdversarialDebiasing
from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing, EqOddsPostprocessing, RejectOptionClassification


from ml.DataProcessing import *
from ml.modelExplainer import *
from ml.nearestNeighbor import * 
from fairness.fairnessEvaluation import *
from fairness.biasMitigation import *
from utils.fairnessVisualization import *
from utils.dataProcessing import *
from utils.rainFall import *

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE= NotebookType.JUPYTER_NOTEBOOK

import shap
from lime import lime_tabular

import matplotlib.pyplot as plt
import os
import pickle
import numpy as np

import tensorflow.compat.v1 as tf 
tf.disable_eager_execution()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
lr_model = pickle.load(open('data/lrModel/lr_predictor.pkl','rb'))
preprocessor = pickle.load(open('data/preprocessData/preprocessing.pkl','rb'))

X_train = pd.read_csv('data/preprocessData/X_train.csv')
y_train = pd.read_csv('data/preprocessData/y_train.csv')

X_test = pd.read_csv('data/preprocessData/X_test.csv')
y_test = pd.read_csv('data/preprocessData/y_test.csv')

train_df = pd.concat([X_train,y_train],axis=1)
test_df = pd.concat([X_test,y_test],axis=1)

In [33]:
dataset_orig_train = StandardDataset(
    df = train_df,
    label_name= 'labels',
    favorable_classes= [1],
    protected_attribute_names = ['gender'],
    # protected_attribute_names = ['race'],
    # protected_attribute_names = ['hispanic_ethnicity'],
    privileged_classes = [np.array([1])]
)

dataset_orig_test = StandardDataset(
    df = test_df,
    label_name= 'labels',
    favorable_classes= [1],
    protected_attribute_names= ['gender'],
    # protected_attribute_names = ['race'],
    # protected_attribute_names = ['hispanic_ethnicity'],
    privileged_classes = [np.array([1])]
)

unprivileged_groups=[{'gender':0.0}]
privileged_groups=[{'gender':1.0}]

# unprivileged_groups=[{'race':0.0}]
# privileged_groups=[{'race':1.0}]

# unprivileged_groups=[{'hispanic_ethnicity':0.0}]
# privileged_groups=[{'hispanic_ethnicity':1.0}]

In [3]:
def setDatasets(attr):
    dataset_orig_train = StandardDataset(
    df = train_df,
    label_name= 'labels',
    favorable_classes= [1],
    protected_attribute_names = [attr],
    privileged_classes = [np.array([1])]
    )

    dataset_orig_test = StandardDataset(
        df = test_df,
        label_name= 'labels',
        favorable_classes= [1],
        protected_attribute_names= [attr],
        privileged_classes = [np.array([1])]
    )

    unprivileged_groups=[{attr:0.0}]
    privileged_groups=[{attr:1.0}]
    return dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups

In [5]:
dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups = setDatasets('gender')

In [29]:
modelPath = 'data/dtModel/dt_predictor.pkl'
spd,eq_opp_diff,avg_odds_diff,dispImp=getThresholdMetric(loadDataAndModels,modelPath,'gender')

In [None]:
thresholdFairDf_gender_dt=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_gender_dt.to_csv('fairness/thresholdFairDf_gender_dt.csv',index=False)

In [30]:
thresholdFairDf_race_dt=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_race_dt.to_csv('fairness/thresholdFairDf_race_dt.csv',index=False)

In [28]:
thresholdFairDf_hisp_dt=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_hisp_dt.to_csv('fairness/thresholdFairDf_hisp_dt.csv',index=False)

In [26]:
thresholdFairDf_hisp_svc=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_hisp_svc.to_csv('fairness/thresholdFairDf_hisp_svc.csv',index=False)

In [21]:
thresholdFairDf_gender_svc=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_gender_svc.to_csv('fairness/thresholdFairDf_gender_svc.csv',index=False)

In [19]:
thresholdFairDf_hisp_lr=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_hisp_lr.to_csv('fairness/thresholdFairDf_hisp_lr.csv',index=False)

In [15]:
thresholdFairDf_gender_lr=pd.DataFrame({'spd':spd,'eq_opp_diff':eq_opp_diff,'avg_odds_diff':avg_odds_diff,'dispImp':dispImp,'thresh':list(np.linspace(0.01,0.99,100))})
thresholdFairDf_gender_lr.to_csv('fairness/thresholdFairDf_gender_lr.csv',index=False)

In [12]:
spd

[-0.06476287539978953,
 -0.06386529078746517,
 -0.06445891248326518,
 -0.0670984728098466,
 -0.06761789179367184,
 -0.0705189496293217,
 -0.072935901819978,
 -0.07449773480576549,
 -0.07605956779155298,
 -0.07769560348931548,
 -0.07862492540615929,
 -0.08074506674891546,
 -0.07955424732300309,
 -0.07773091682814703,
 -0.07843763060906572,
 -0.0799994635948531,
 -0.0814870938686656,
 -0.08085458279972191,
 -0.08170970200459071,
 -0.08249061849748451,
 -0.0833457377023532,
 -0.08222912098841584,
 -0.08293583476933453,
 -0.08230332370039084,
 -0.08237752641236595,
 -0.08315844290525964,
 -0.08260013454829107,
 -0.08196762347934727,
 -0.08345525375315976,
 -0.08423617024605357,
 -0.08494288402697225,
 -0.08431037295802857,
 -0.08375206460105977,
 -0.08523969487487226,
 -0.08460718380592858,
 -0.08341636438001632,
 -0.08222554495410384,
 -0.08096052281621646,
 -0.08096052281621646,
 -0.08237395037805395,
 -0.08449409172081013,
 -0.08464249714476002,
 -0.08683684119949131,
 -0.08698524662344

In [5]:
def dumpClf(filepath,model):
    with open(filepath+'.pkl','wb') as f:
        pickle.dump(model,f)

Reweighing

In [8]:
### Reweighing Dataset
RW = Reweighing(
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
    )
dataset_transf_train = RW.fit_transform(dataset_orig_train)

new_lr_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear'),
)
# x_train_tranform = preprocessor.fit_transform(dataset_transf_train.convert_to_dataframe()[0].drop(['labels'],axis=1))
fit_params = {'logisticregression__sample_weight': dataset_transf_train.instance_weights}
new_lr_model.fit(dataset_transf_train.features,dataset_transf_train.labels.ravel(),**fit_params)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(solver='liblinear'))])

In [9]:
y_test_transf_pred = new_lr_model.predict(dataset_orig_test.features)
y_train_transf_pred = new_lr_model.predict(dataset_orig_train.features)

In [10]:
metric_train,metric_test=getClassMetricDataset(dataset_orig_train, dataset_orig_test, unprivileged_groups, privileged_groups, y_train_transf_pred, y_test_transf_pred)

LFR

In [116]:
lfr = LFR(
    unprivileged_groups = unprivileged_groups,
    privileged_groups = privileged_groups,
    k = 5,
    verbose = 1
)

In [117]:
dataset_transf_train = lfr.fit_transform(dataset_orig_train)
new_lr_model = make_pipeline(
    StandardScaler(),
    # LogisticRegression(solver='liblinear'),
    # SVC(kernel='linear', probability=True),
    DecisionTreeClassifier()
)
# fit_params = {'logisticregression__sample_weight': dataset_transf_train.instance_weights}
new_lr_model.fit(dataset_transf_train.features,dataset_transf_train.labels.ravel(),)

step: 0, loss: 1124.3161401655464, L_x: 112343.10494743494,  L_y: 0.7168507419069616,  L_z: 0.003364798985797235
step: 250, loss: 1122.5112715784117, L_x: 112123.9410303256,  L_y: 0.5084123087655004,  L_z: 0.015268979327803517
step: 500, loss: 1117.7348504229622, L_x: 111592.22862782486,  L_y: 0.8806188131229616,  L_z: 0.018638906631806985
step: 750, loss: 1091.6562707440544, L_x: 108928.07079416662,  L_y: 1.464416866326033,  L_z: 0.018222918721244635
step: 1000, loss: 1059.8414540881763, L_x: 105858.74905269602,  L_y: 0.6916082790612426,  L_z: 0.011247105643095449
step: 1250, loss: 1025.5847652637276, L_x: 102399.10589702135,  L_y: 1.0585686385191078,  L_z: 0.010702753099898318
step: 1500, loss: 999.0945572126947, L_x: 99801.46134125628,  L_y: 0.5612783737211087,  L_z: 0.010373308528216154
step: 1750, loss: 990.9995390512275, L_x: 98995.5203101655,  L_y: 0.5664524353774262,  L_z: 0.009557670283900244
step: 2000, loss: 988.3695917246757, L_x: 98712.19478078528,  L_y: 0.9300252225466549

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

In [118]:
dumpClf('fairness/race_dt_lfr',new_lr_model)

In [23]:
lr_lfr = pickle.load(open('fairness/'+'hispanic_ethnicity'+'_dt_lfr.pkl','rb'))
lr_lfr

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

Optimized Preprocessing it uses options that you have to configure yourself and I don't really find how to choose it. Also it use an Optimizer and I didn't find how to build this class.

In [76]:
# optim_options = {
#     "distortion_fun": get_distortion_adult,
#     "epsilon": 0.05,
#     "clist": [0.99, 1.99, 2.99],
#     "dlist": [.1, 0.05, 0]
# }
    
# OP = OptimPreproc(OptTools, optim_options)

Adversial Debiasing

In [61]:
tf.reset_default_graph()
sess = tf.Session()

dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups = setDatasets('hispanic_ethnicity')
debiased_model_hisp = AdversarialDebiasing(privileged_groups = privileged_groups,
                          unprivileged_groups = unprivileged_groups,
                          scope_name='debiased_classifier',
                          debias=True,
                          sess=sess)
debiased_model_hisp.fit(dataset_orig_train)

epoch 0; iter: 0; batch classifier loss: 50.908791; batch adversarial loss: 0.619251
epoch 1; iter: 0; batch classifier loss: 2.743416; batch adversarial loss: 0.637023
epoch 2; iter: 0; batch classifier loss: 1.966012; batch adversarial loss: 0.658180
epoch 3; iter: 0; batch classifier loss: 2.001759; batch adversarial loss: 0.631459
epoch 4; iter: 0; batch classifier loss: 3.028960; batch adversarial loss: 0.689343
epoch 5; iter: 0; batch classifier loss: 4.493368; batch adversarial loss: 0.610940
epoch 6; iter: 0; batch classifier loss: 1.060957; batch adversarial loss: 0.655755
epoch 7; iter: 0; batch classifier loss: 1.068834; batch adversarial loss: 0.605621
epoch 8; iter: 0; batch classifier loss: 1.057961; batch adversarial loss: 0.615859
epoch 9; iter: 0; batch classifier loss: 0.662587; batch adversarial loss: 0.639405
epoch 10; iter: 0; batch classifier loss: 1.040421; batch adversarial loss: 0.573744
epoch 11; iter: 0; batch classifier loss: 1.269842; batch adversarial loss

<aif360.algorithms.inprocessing.adversarial_debiasing.AdversarialDebiasing at 0x1d8bd960760>

In [58]:
dataset_debiased_test = debiased_model_gender.predict(dataset_orig_test)
metric_test = ClassificationMetric(
    dataset_orig_test,
    dataset_debiased_test,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)
fairRslt_gender = getFairRslt(metric_test)
fairRslt_gender

[-0.017, 0.06, 0.026, 0.877]

In [60]:
dataset_debiased_test = debiased_model_race.predict(dataset_orig_test)
metric_test = ClassificationMetric(
    dataset_orig_test,
    dataset_debiased_test,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)
fairRslt_race = getFairRslt(metric_test)
fairRslt_race

[-0.051, -0.072, -0.039, 0.721]

In [62]:
dataset_debiased_test = debiased_model_hisp.predict(dataset_orig_test)
metric_test = ClassificationMetric(
    dataset_orig_test,
    dataset_debiased_test,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)
fairRslt_hisp = getFairRslt(metric_test)
fairRslt_hisp

[0.011, 0.099, 0.049, 1.059]

In [65]:
debiased_df = pd.DataFrame({'gender':fairRslt_gender,'race':fairRslt_race,'hispanic_ethnicity':fairRslt_hisp,'metrics':['SPD','EOD','AOD','DI']})

In [68]:
debiased_df.to_csv('fairness/debiased_df.csv',index=False)

Exponentiated Gradient Reduction

In [4]:
model1 =LogisticRegression(solver='liblinear')
model2 = SVC(kernel='linear', probability=True)
model3 = DecisionTreeClassifier()

In [5]:
def get_Exp_grad_red(model,attr,setDatasets):
    dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups = setDatasets(attr)
    np.random.seed(0) #need for reproducibility
    exp_grad_red = ExponentiatedGradientReduction(estimator=model, 
                                                constraints="EqualizedOdds",
                                                drop_prot_attr=False)
    exp_grad_red.fit(dataset_orig_train)
    exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)
    metric_test = ClassificationMetric(dataset_orig_test, 
                                   exp_grad_red_pred,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
    return metric_test



In [9]:
metric_test=get_Exp_grad_red(model1,'hispanic_ethnicity',setDatasets)
fairRslt_hisp_lr = getFairRslt(metric_test)
fairRslt_hisp_lr

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a fut

[0.002, 0.075, 0.034, 1.013]

In [10]:
metric_test=get_Exp_grad_red(model1,'race',setDatasets)
fairRslt_race_lr = getFairRslt(metric_test)
fairRslt_race_lr

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a fut

[-0.085, -0.185, -0.102, 0.55]

In [11]:
metric_test=get_Exp_grad_red(model1,'gender',setDatasets)
fairRslt_gender_lr = getFairRslt(metric_test)
fairRslt_gender_lr = getFairRslt(metric_test)
fairRslt_gender_lr

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a fut

[-0.033, 0.007, -0.001, 0.813]

In [12]:
exp_grad_red_lr = pd.DataFrame({'gender':fairRslt_gender_lr,'race':fairRslt_race_lr,'hispanic_ethnicity':fairRslt_hisp_lr,'metrics':['SPD','EOD','AOD','DI']})
exp_grad_red_lr.to_csv('fairness/exp_grad_red_lr.csv',index=False)

In [13]:
metric_test=get_Exp_grad_red(model2,'gender',setDatasets)
fairRslt_gender_svc = getFairRslt(metric_test)
fairRslt_gender_svc

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().


In [9]:
metric_test=get_Exp_grad_red(model3,'race',setDatasets)
fairRslt_race_dt = getFairRslt(metric_test)
fairRslt_race_dt

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a fut

[-0.046, -0.001, -0.001, 0.804]

In [10]:
metric_test=get_Exp_grad_red(model3,'hispanic_ethnicity',setDatasets)
fairRslt_hisp_dt = getFairRslt(metric_test)
fairRslt_hisp_dt

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a fut

[-0.006, 0.038, 0.018, 0.973]

In [12]:
exp_grad_red_dt = pd.DataFrame({'gender':fairRslt_gender_svc,'race':fairRslt_race_dt,'hispanic_ethnicity':fairRslt_hisp_dt,'metrics':['SPD','EOD','AOD','DI']})
exp_grad_red_dt.to_csv('fairness/exp_grad_red_dt.csv',index=False)

Reject Option Classification

In [133]:
dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups = setDatasets('gender')
ROC = RejectOptionClassification(
    privileged_groups = privileged_groups,
    unprivileged_groups = unprivileged_groups
)

In [134]:
dt_model = pickle.load(open('data/dtModel/dt_predictor.pkl','rb'))['classification']
lr_model = pickle.load(open('data/lrModel/lr_predictor.pkl','rb'))['classification']
svc_model = pickle.load(open('data/svcModel/svc_predictor.pkl','rb'))['classification']

In [135]:
y_train_pred_lr, y_test_pred_lr = getPredictions(dataset_orig_train.features,dataset_orig_test.features,lr_model,threshold=None)
y_train_pred_dt, y_test_pred_dt = getPredictions(dataset_orig_train.features,dataset_orig_test.features,dt_model,threshold=None)
y_train_pred_svc, y_test_pred_svc = getPredictions(dataset_orig_train.features, dataset_orig_test.features,svc_model,threshold=None)

In [136]:
dataset_orig_pred_train_lr = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_lr.labels = y_train_pred_lr.reshape(-1,1)

dataset_orig_pred_test_lr = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_lr.labels = y_test_pred_lr.reshape(-1,1)

dataset_orig_pred_train_svc = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_svc.labels = y_train_pred_svc.reshape(-1,1)

dataset_orig_pred_test_svc = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_svc.labels = y_test_pred_svc.reshape(-1,1)

dataset_orig_pred_train_dt = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_dt.labels = y_train_pred_dt.reshape(-1,1)

dataset_orig_pred_test_dt = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_dt.labels = y_test_pred_dt.reshape(-1,1)

In [137]:
ROC_lr = ROC.fit(dataset_orig_test,dataset_orig_pred_test_lr)
ROC_svc = ROC.fit(dataset_orig_test,dataset_orig_pred_test_svc)
ROC_dt = ROC.fit(dataset_orig_test,dataset_orig_pred_test_dt)

In [138]:
dataset_transf_test_lr = ROC_lr.predict(dataset_orig_pred_test_lr)
dataset_transf_test_svc = ROC_svc.predict(dataset_orig_pred_test_svc)
dataset_transf_test_dt = ROC_dt.predict(dataset_orig_pred_test_dt)

In [139]:
test_metric_lr  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_lr,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)


test_metric_svc  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_svc,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)

test_metric_dt  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_dt,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)

In [124]:
hisp_roc_lr=getFairRslt(test_metric_lr)
hisp_roc_svc=getFairRslt(test_metric_svc)
hisp_roc_dt = getFairRslt(test_metric_dt)

In [132]:
race_roc_lr=getFairRslt(test_metric_lr)
race_roc_svc=getFairRslt(test_metric_svc)
race_roc_dt = getFairRslt(test_metric_dt)

In [140]:
gender_roc_lr=getFairRslt(test_metric_lr)
gender_roc_svc=getFairRslt(test_metric_svc)
gender_roc_dt = getFairRslt(test_metric_dt)

In [141]:
roc_lr_df = pd.DataFrame({'gender':gender_roc_lr,'race':race_roc_lr,'hispanic_ethnicity':hisp_roc_lr,'metrics':['SPD','EOD','AOD','DI']})
roc_dt_df = pd.DataFrame({'gender':gender_roc_dt,'race':race_roc_dt,'hispanic_ethnicity':hisp_roc_dt,'metrics':['SPD','EOD','AOD','DI']})
roc_svc_df = pd.DataFrame({'gender':gender_roc_svc,'race':race_roc_svc,'hispanic_ethnicity':hisp_roc_svc,'metrics':['SPD','EOD','AOD','DI']})

In [142]:
roc_lr_df.to_csv('fairness/roc_lr_df.csv',index=False)
roc_dt_df.to_csv('fairness/roc_dt_df.csv',index=False)
roc_svc_df.to_csv('fairness/roc_svc_df.csv',index=False)

Calibrated Equal Odds PostProcessing

In [108]:
dataset_orig_train, dataset_orig_test,unprivileged_groups,privileged_groups = setDatasets('hispanic_ethnicity')
cost_constraint = 'fnr'
CPP = CalibratedEqOddsPostprocessing(privileged_groups = privileged_groups,
                                     unprivileged_groups = unprivileged_groups,
                                     cost_constraint=cost_constraint,
                                     seed=42)

In [109]:
dt_model = pickle.load(open('data/dtModel/dt_predictor.pkl','rb'))['classification']
lr_model = pickle.load(open('data/lrModel/lr_predictor.pkl','rb'))['classification']
svc_model = pickle.load(open('data/svcModel/svc_predictor.pkl','rb'))['classification']
y_train_pred_lr, y_test_pred_lr = getPredictions(dataset_orig_train.features,dataset_orig_test.features,lr_model,threshold=None)
y_train_pred_dt, y_test_pred_dt = getPredictions(dataset_orig_train.features,dataset_orig_test.features,dt_model,threshold=None)
y_train_pred_svc, y_test_pred_svc = getPredictions(dataset_orig_train.features, dataset_orig_test.features,svc_model,threshold=None)

In [110]:
dataset_orig_pred_train_lr = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_lr.labels = y_train_pred_lr.reshape(-1,1)

dataset_orig_pred_test_lr = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_lr.labels = y_test_pred_lr.reshape(-1,1)

dataset_orig_pred_train_svc = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_svc.labels = y_train_pred_svc.reshape(-1,1)

dataset_orig_pred_test_svc = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_svc.labels = y_test_pred_svc.reshape(-1,1)

dataset_orig_pred_train_dt = dataset_orig_train.copy(deepcopy=True)
dataset_orig_pred_train_dt.labels = y_train_pred_dt.reshape(-1,1)

dataset_orig_pred_test_dt = dataset_orig_test.copy(deepcopy=True)
dataset_orig_pred_test_dt.labels = y_test_pred_dt.reshape(-1,1)

In [111]:
CPP_lr = CPP.fit(dataset_orig_test,dataset_orig_pred_test_lr)
CPP_svc = CPP.fit(dataset_orig_test,dataset_orig_pred_test_svc)
CPP_dt = CPP.fit(dataset_orig_test,dataset_orig_pred_test_dt)

dataset_transf_test_lr = CPP_lr.predict(dataset_orig_pred_test_lr)
dataset_transf_test_svc = CPP_svc.predict(dataset_orig_pred_test_svc)
dataset_transf_test_dt = CPP_dt.predict(dataset_orig_pred_test_dt)

In [112]:
test_metric_lr  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_lr,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)


test_metric_svc  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_svc,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)

test_metric_dt  = ClassificationMetric(
    dataset_orig_test,
    dataset_transf_test_dt,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)

In [113]:
hisp_cpp_lr=getFairRslt(test_metric_lr)
hisp_cpp_svc=getFairRslt(test_metric_svc)
hisp_cpp_dt = getFairRslt(test_metric_dt)

In [107]:
race_cpp_lr=getFairRslt(test_metric_lr)
race_cpp_svc=getFairRslt(test_metric_svc)
race_cpp_dt = getFairRslt(test_metric_dt)

In [101]:
gender_cpp_lr=getFairRslt(test_metric_lr)
gender_cpp_svc=getFairRslt(test_metric_svc)
gender_cpp_dt = getFairRslt(test_metric_dt)

In [114]:
cpp_lr_df = pd.DataFrame({'gender':gender_cpp_lr,'race':race_cpp_lr,'hispanic_ethnicity':hisp_cpp_lr,'metrics':['SPD','EOD','AOD','DI']})
cpp_dt_df = pd.DataFrame({'gender':gender_cpp_dt,'race':race_cpp_dt,'hispanic_ethnicity':hisp_cpp_dt,'metrics':['SPD','EOD','AOD','DI']})
cpp_svc_df = pd.DataFrame({'gender':gender_cpp_svc,'race':race_cpp_svc,'hispanic_ethnicity':hisp_cpp_svc,'metrics':['SPD','EOD','AOD','DI']})

In [115]:
cpp_lr_df.to_csv('fairness/cpp_lr_df.csv',index=False)
cpp_dt_df.to_csv('fairness/cpp_dt_df.csv',index=False)
cpp_svc_df.to_csv('fairness/cpp_svc_df.csv',index=False)

In [149]:
pd.read_csv('fairness/cpp_svc_df.csv')

Unnamed: 0,gender,race,hispanic_ethnicity,metrics
0,-0.043,-0.051,-0.015,SPD
1,0.0,0.0,0.0,EOD
2,0.0,0.0,0.0,AOD
3,0.824,0.79,0.934,DI


In [150]:
pd.read_csv('fairness/roc_svc_df.csv')

Unnamed: 0,gender,race,hispanic_ethnicity,metrics
0,-0.043,-0.051,-0.015,SPD
1,0.0,0.0,0.0,EOD
2,0.0,0.0,0.0,AOD
3,0.824,0.79,0.934,DI


Disparate impact remover

In [34]:
DIR = DisparateImpactRemover()
data_transf_train = DIR.fit_transform(dataset_orig_train)

In [35]:
metric_train, metric_test = disparateImpactRemover(unprivileged_groups,privileged_groups,dataset_orig_train,dataset_orig_test, 'Decision Tree',repair_level=1.0)
fairRslt = getFairRslt(metric_test)
fairRslt

[-0.055, 0.003, -0.01, 0.786]

In [29]:
y_test_transf_pred = lr_lfr.predict(dataset_orig_test.features)
y_train_transf_pred = lr_lfr.predict(dataset_orig_train.features)
metric_train,metric_test=getClassMetricDataset(dataset_orig_train, dataset_orig_test, unprivileged_groups, privileged_groups, y_train_transf_pred, y_test_transf_pred)

In [36]:
_,metric_test_di_dt = disparateImpactRemover(unprivileged_groups,privileged_groups,dataset_orig_train,dataset_orig_test,'Decision Tree',repair_level=1.0)
fairRslt_di_dt =getFairRslt(metric_test_di_dt)

gaugeSPD_di_dt=getFairGauge(fairRslt_di_dt[0],-1,1)
gaugeEOD_di_dt=getFairGauge(fairRslt_di_dt[1],-1,1)
gaugeAOD_di_dt=getFairGauge(fairRslt_di_dt[2],-1,1)
gaugeDI_di_dt=getFairGauge(fairRslt_di_dt[3],0,2,flow=0.4,fhigh=0.63)

In [37]:
fairRslt_di_dt

[-0.057, -0.005, -0.014, 0.782]