In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import os
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer 
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from autoimpute.imputations import SingleImputer
from autoimpute.imputations import MultipleImputer
from numpy import nan
from numpy import isnan
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.datasets import StandardDataset
from aif360.datasets import BinaryLabelDataset
from aif360.explainers import MetricTextExplainer
from aif360.metrics import Metric, DatasetMetric, utils
from sklearn import preprocessing
from typing import List, Union, Dict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool
from functools import partial
import multiprocessing
import compas_loop_script
from compas_loop_script import nested_loop
from compas_loop_script import nested_loop2



In [None]:
# Load AIF360's preprocessed COMPAS dataset, passing ["sex"] to the loader
# (presumably the list of protected attributes — confirm against the AIF360 docs).
# NOTE(review): `compas` is not referenced by any later cell visible in this notebook;
# the analysis below works from 'compas_data.csv' instead — confirm this cell is still needed.
from aif360.datasets import CompasDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_compas
compas = load_preproc_data_compas(["sex"])

In [2]:
# Read the raw COMPAS extract from the working directory.
Compasdata = pd.read_csv('compas_data.csv')

# The two protected attributes are stored as pandas categoricals so that they
# survive the "drop plain string columns" filter applied just below.
for protected_col in ("sex", "race_class"):
    Compasdata[protected_col] = Compasdata[protected_col].astype('category')

# Numeric working set: every column whose dtype is not `object`.
Compasdata_Numerical = Compasdata.select_dtypes(exclude=['object'])

In [3]:
# Integer-encode the categorical protected attributes of the numeric working set.
# The fitted encoders and their class labels are kept so that integer codes can
# later be mapped back to the original category names.
categorical_features = ['race_class', 'sex']
data_encoded = Compasdata_Numerical.copy()

encoders = {}
categorical_names = {}

for feature in categorical_features:
    le = LabelEncoder()
    # Fit and encode in one step; the column is replaced by its integer codes.
    data_encoded[feature] = le.fit_transform(data_encoded[feature])
    encoders[feature] = le
    categorical_names[feature] = le.classes_

# Alias (same object, not a copy) used when building the AIF360 dataset.
data_perp_sex = data_encoded

In [4]:
# Position of the privileged category inside each encoder's `classes_` array.
# Because the columns were label-encoded with those same encoders, this position
# is also the integer code that the privileged group carries in the data.
# np.where(...)[0] yields an array of positions, not a scalar.
privileged_race = np.where(categorical_names['race_class'] == 'White')[0]
privileged_sex = np.where(categorical_names['sex'] == 'Female')[0]
# Wrap the encoded frame as an AIF360 StandardDataset: 'Reoffending_Likelihood' is the
# label with value 1 treated as the favorable class, and race/sex are the protected attributes.
# The order of `privileged_classes` matches the order of `protected_attribute_names`.
data_standard = StandardDataset(data_perp_sex, 
                               label_name='Reoffending_Likelihood', 
                               favorable_classes=[1], 
                               protected_attribute_names=['race_class', 'sex'], 
                               privileged_classes=[privileged_race, privileged_sex])
# The input frame should not contain missing values when it is converted to a StandardDataset.

In [6]:
def fair_metrics(dataset, pred, pred_is_dataset=False):
    """Compute group-fairness metrics for every protected attribute of a dataset.

    Parameters
    ----------
    dataset : AIF360 dataset holding the ground-truth labels.
    pred : predicted labels to evaluate. Either an array that is written into a copy
        of ``dataset`` as its ``labels``, or (when ``pred_is_dataset`` is True) an
        already-built prediction dataset that is used as-is.
    pred_is_dataset : bool, default False
        Set to True when ``pred`` is itself a dataset rather than a label array.

    Returns
    -------
    pandas.DataFrame
        One row named ``'objective'`` holding the ideal value of each metric
        (0, 0, 0, 1, 0), followed by one row per protected attribute (indexed by the
        attribute name, in ``protected_attribute_names`` order). Columns are
        ``statistical_parity_difference``, ``equal_opportunity_difference``,
        ``average_abs_odds_difference``, ``disparate_impact`` and ``theil_index``.
        Infinite values are replaced by 2.
    """
    if pred_is_dataset:
        dataset_pred = pred
    else:
        dataset_pred = dataset.copy()
        dataset_pred.labels = pred

    cols = ['statistical_parity_difference', 'equal_opportunity_difference', 'average_abs_odds_difference',  'disparate_impact', 'theil_index']
    obj_fairness = [[0,0,0,1,0]]

    # Collect the per-attribute frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
    frames = [pd.DataFrame(data=obj_fairness, index=['objective'], columns=cols)]

    for idx, attr in enumerate(dataset_pred.protected_attribute_names):
        # Only the first privileged / unprivileged value of each attribute is compared.
        privileged_groups = [{attr: dataset_pred.privileged_protected_attributes[idx][0]}]
        unprivileged_groups = [{attr: dataset_pred.unprivileged_protected_attributes[idx][0]}]

        # Metrics that need both the true and the predicted labels.
        classified_metric = ClassificationMetric(dataset,
                                                 dataset_pred,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)

        # Metrics that depend on the predicted labels only.
        metric_pred = BinaryLabelDatasetMetric(dataset_pred,
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)

        frames.append(pd.DataFrame([[metric_pred.mean_difference(),
                                     classified_metric.equal_opportunity_difference(),
                                     classified_metric.average_abs_odds_difference(),
                                     metric_pred.disparate_impact(),
                                     classified_metric.theil_index()]],
                                   columns=cols,
                                   index=[attr]))

    metrics_table = pd.concat(frames)

    # Ratios can be infinite when a denominator is zero; cap them at a sentinel of 2.
    return metrics_table.replace([-np.inf, np.inf], 2)

In [7]:
def get_fair_metrics(data, model, plot=True, model_aif=False):
    """Predict on ``data`` with ``model`` and return the fairness-metric table.

    ``model_aif`` selects the prediction interface: when True the model receives the
    whole dataset and its result's ``labels`` attribute is used; otherwise the model
    receives ``data.features`` and its return value is used directly.
    ``plot`` is accepted but not used by this function.
    """
    if model_aif:
        predicted_labels = model.predict(data).labels
    else:
        predicted_labels = model.predict(data.features)
    return fair_metrics(data, predicted_labels)

In [8]:
# Baseline run: performance and fairness metrics on the original numeric dataset,
# before any values are deleted or imputed. Every repetition refits three
# classifiers on the same train split and records one result row per classifier.
np.random.seed(123)
data_orig_train, data_orig_test = data_standard.split([0.7], shuffle=True)
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
# LinearSVC is wrapped in CalibratedClassifierCV so that predict_proba
# (needed for the AUC below) is available on all three models.
SV_Classifier = LinearSVC(max_iter=1000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)
Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]
results_pre_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Race', 'Statistical_Parity_Sex', 'Equal_Oppr_diff_Race', 'Equal_Oppr_diff_Sex', 
                    'average_abs_odds_diff_Race', 'average_abs_odds_diff_Sex', 'Disparate_Impact_Race', 'Disparate_Impact_Sex', 'Theil_Index_Race', 'Theil_Index_Sex']

# Fairness columns are emitted metric by metric, race before sex, matching the
# order of the *_Race / *_Sex names in results_pre_names.
fairness_metric_names = ['statistical_parity_difference', 'equal_opportunity_difference',
                         'average_abs_odds_difference', 'disparate_impact', 'theil_index']
fairness_attributes = ['race_class', 'sex']

# The held-out split never changes, so extract it once instead of on every fit.
X_test = data_orig_test.features
y_test = data_orig_test.labels.ravel()

num_repetitions = 100
# Rows are gathered in a plain list and turned into a DataFrame once at the end.
# Growing an empty DataFrame with .loc[len(df)] is slow and leaves every column
# with object dtype; building it in one go gives proper numeric dtypes.
rows_pre = []
for i in range(num_repetitions):
    for classifier in Classification_Models:
        log_reg_fit = classifier.fit(data_orig_train.features,
                                     data_orig_train.labels.ravel(),
                                     sample_weight=data_orig_train.instance_weights)
        y_pred = log_reg_fit.predict(X_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)

        # Probability of the positive class (column 1) drives the AUC.
        y_pred_proba_final = log_reg_fit.predict_proba(X_test)[:, 1]
        AUC = roc_auc_score(y_test, y_pred_proba_final)

        F1_score = metrics.f1_score(y_test, y_pred)
        # For binary labels the confusion matrix is [[TN, FP], [FN, TP]].
        confusion_matrix_pre = metrics.confusion_matrix(y_test, y_pred)
        TN, FP, FN, TP = confusion_matrix_pre.ravel()
        Sensitivity = TP / (TP + FN)
        Specificity = TN / (TN + FP)

        # Look fairness values up by label rather than by position, so the result
        # does not silently depend on the row/column order of the metric table.
        fair_final = get_fair_metrics(data_orig_test, log_reg_fit)
        fairness_values = [fair_final.loc[attr, metric_name]
                           for metric_name in fairness_metric_names
                           for attr in fairness_attributes]

        new_row_pre = ['Compas_Pre', 0, 0, 'None', i, classifier.__class__.__name__,
                       Accuracy, AUC, F1_score, Sensitivity, Specificity] + fairness_values
        rows_pre.append(new_row_pre)

results_pre = pd.DataFrame(rows_pre, columns=results_pre_names)

In [9]:
# Turn the AIF360 train/test splits back into plain pandas frames:
# one column per feature plus the label column appended last.
cols = list(data_orig_train.feature_names)
local_data_train = pd.DataFrame(data_orig_train.features, columns=cols)
local_data_test = pd.DataFrame(data_orig_test.features, columns=cols)
# `labels` is flattened to 1-D so that it can be stored as a single column.
local_data_train['Reoffending_Likelihood'] = data_orig_train.labels.ravel()
local_data_test['Reoffending_Likelihood'] = data_orig_test.labels.ravel()

In [17]:
# Configuration for the numeric imputation experiment.
cols = list(data_orig_train.feature_names)
Protected_Variables = ["sex", "Reoffending_Likelihood", "race_class"]
# Candidate columns = every feature except the protected attributes and the label.
# The original feature order is preserved on purpose: a set difference yields an
# order that changes between kernel sessions (string hashing is randomised), which
# would make the experiment non-reproducible despite the fixed seed.
cols1 = list(dict.fromkeys(c for c in cols if c not in set(Protected_Variables)))
col2 = local_data_train.columns

# Defining all the imputations that we are using on Numerical variables
Simple_Mean = SimpleImputer(missing_values= nan, strategy='mean')
knn = KNNImputer(n_neighbors=2)
#Most_Freq = SimpleImputer(missing_values= nan, strategy="most_frequent" )
Simple_Median = SimpleImputer(missing_values= nan, strategy='median')
Iterative = IterativeImputer(BayesianRidge())
linear_Auto = SingleImputer(strategy="interpolate")
lm_ft = SingleImputer(strategy="least squares")
stoch_ft = SingleImputer(strategy="stochastic")
# pmm_ft = SingleImputer(strategy="pmm")
norm_ft = SingleImputer(strategy="norm")

# All strategies, and separately the subset built on autoimpute's SingleImputer.
imputation_types = [Simple_Mean, knn, Simple_Median, Iterative, linear_Auto, lm_ft, stoch_ft, norm_ft]
imputation_type2 = [linear_Auto, lm_ft, stoch_ft, norm_ft]

LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
# LinearSVC is wrapped in CalibratedClassifierCV so that it exposes predict_proba.
SV_Classifier = LinearSVC(max_iter=1000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)

Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]

# Fractions passed to the worker as `percentage_list`.
percentage_list = [0.05,0.01,0.1]

# Column layout of the results frame; identical to the baseline results layout.
results_df_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Race', 'Statistical_Parity_Sex', 'Equal_Oppr_diff_Race', 'Equal_Oppr_diff_Sex', 
                    'average_abs_odds_diff_Race', 'average_abs_odds_diff_Sex', 'Disparate_Impact_Race', 'Disparate_Impact_Sex', 'Theil_Index_Race', 'Theil_Index_Sex']
results_df = pd.DataFrame(columns=results_df_names)

In [None]:
# Run the numeric imputation experiment in parallel: each value in `iterable`
# (1..100) is passed to nested_loop as its first positional argument, with all
# other inputs bound up front via functools.partial.
iterable = list(range(1, 101))
run_numeric_repetition = partial(nested_loop, cols1 = cols1, col2 = col2, local_data_train = local_data_train, 
                   data_orig_test = data_orig_test, percentage_list = percentage_list, imputation_types = imputation_types, 
                   imputation_type2 = imputation_type2, Classification_Models = Classification_Models, results_df = results_df, privileged_sex=privileged_sex, privileged_race=privileged_race)

# The pool is used as a context manager so that its 12 worker processes are shut
# down once the (blocking) map call has returned, instead of lingering in the kernel.
with multiprocessing.Pool(12) as mypool:
    list_of_results = mypool.map(run_numeric_repetition, iterable)

In [15]:
# Stack the per-repetition results returned by the worker pool into a single frame.
# No ignore_index is passed, so each chunk keeps its own row labels and the
# resulting index is not unique across repetitions.
results_df = pd.concat(list_of_results)
# Bare expression: display the combined frame as the cell output.
results_df

Unnamed: 0,dataset_name,num_columns_imputed,percentage_deleted,imputation_strategy,repetition,classification_algorithm,accuracy,auc,F1_score,Sensitivity,...,Statistical_Parity_Race,Statistical_Parity_Sex,Equal_Oppr_diff_Race,Equal_Oppr_diff_Sex,average_abs_odds_diff_Race,average_abs_odds_diff_Sex,Disparate_Impact_Race,Disparate_Impact_Sex,Theil_Index_Race,Theil_Index_Sex
0,Compas,1,0.05,SimpleImputer,1,LogisticRegression,0.856351,0.929204,0.866121,0.864424,...,-0.084356,-0.153950,-0.019557,-0.037311,0.030076,0.095082,0.855216,0.763556,0.107975,0.107975
1,Compas,1,0.05,SimpleImputer,1,RandomForestClassifier,0.884527,0.929321,0.885740,0.964322,...,-0.070819,-0.094579,-0.013611,-0.022360,0.008770,0.013578,0.861467,0.825028,0.111245,0.111245
2,Compas,1,0.05,SimpleImputer,1,CalibratedClassifierCV,0.866051,0.925633,0.873583,0.889908,...,-0.085535,-0.094800,-0.010211,-0.006735,0.031050,0.024408,0.849947,0.839235,0.106380,0.106380
3,Compas,1,0.01,SimpleImputer,1,LogisticRegression,0.857275,0.929432,0.866407,0.870540,...,-0.082889,-0.165690,-0.021189,-0.041622,0.027805,0.108679,0.856353,0.747432,0.109173,0.109173
4,Compas,1,0.01,SimpleImputer,1,RandomForestClassifier,0.885450,0.927137,0.886551,0.966361,...,-0.070107,-0.098743,-0.013611,-0.022360,0.007741,0.018696,0.862482,0.818153,0.110879,0.110879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Compas,11,0.01,SimpleImputer,1,RandomForestClassifier,0.876212,0.930279,0.878843,0.942915,...,-0.077265,-0.108160,-0.027938,-0.024111,0.015632,0.033112,0.853278,0.807768,0.113426,0.113426
95,Compas,11,0.01,SimpleImputer,1,CalibratedClassifierCV,0.865589,0.924513,0.872200,0.898063,...,-0.088892,-0.095393,-0.018457,-0.011449,0.033525,0.023757,0.842510,0.836182,0.109783,0.109783
96,Compas,11,0.10,SimpleImputer,1,LogisticRegression,0.862818,0.929976,0.871595,0.876656,...,-0.080799,-0.129380,-0.013252,-0.030172,0.025686,0.063518,0.859634,0.793500,0.105231,0.105231
97,Compas,11,0.10,SimpleImputer,1,RandomForestClassifier,0.878060,0.934086,0.880000,0.951070,...,-0.081356,-0.108934,-0.025911,-0.023438,0.020374,0.033681,0.844684,0.804686,0.114095,0.114095


In [11]:
# Categorical counterpart of the numeric pipeline above.
# Start from every column that is not int64, then bring back the label and `age`
# columns from the full frame so they are present in this working set.
Compasdata_Categorical = Compasdata.select_dtypes(exclude=['int64']).assign(
    Reoffending_Likelihood=Compasdata["Reoffending_Likelihood"],
    age=Compasdata["age"],
)

In [12]:
# Integer-encode every categorical column of the categorical working set.
# The fitted encoders and their class labels are kept so that integer codes can
# later be mapped back to the original category names.
categorical_features = ['sex', 'Age_class', 'race_class', 'c_charge_degree', 'score_text', 'v_score_text']
data_encoded = Compasdata_Categorical.copy()

encoders = {}
categorical_names = {}

for feature in categorical_features:
    le = LabelEncoder()
    # Fit and encode in one step; the column is replaced by its integer codes.
    data_encoded[feature] = le.fit_transform(data_encoded[feature])
    encoders[feature] = le
    categorical_names[feature] = le.classes_

# Alias (same object, not a copy) used when building the AIF360 dataset.
data_perp_sex = data_encoded

In [13]:
# Rebuild the AIF360 StandardDataset from the label-encoded categorical frame,
# replacing the numeric `data_standard` defined earlier in the notebook.
# NOTE(review): `privileged_race` / `privileged_sex` are not recomputed here; they are
# the values derived from the encoders fitted in the numeric section. This assumes the
# re-fitted 'race_class' and 'sex' encoders assign the same integer codes — confirm.
data_standard = StandardDataset(data_perp_sex, 
                               label_name='Reoffending_Likelihood', 
                               favorable_classes=[1], 
                               protected_attribute_names=['race_class', 'sex'], 
                               privileged_classes=[privileged_race, privileged_sex])

In [14]:
# Baseline run on the categorical dataset: performance and fairness metrics before
# any values are deleted or imputed. Every repetition refits three classifiers on
# the same train split and records one result row per classifier.
np.random.seed(123)
data_orig_train, data_orig_test = data_standard.split([0.7], shuffle=True)
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
# LinearSVC is wrapped in CalibratedClassifierCV so that predict_proba
# (needed for the AUC below) is available on all three models.
SV_Classifier = LinearSVC(max_iter=1000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)
Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]
results_pre_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Race', 'Statistical_Parity_Sex', 'Equal_Oppr_diff_Race', 'Equal_Oppr_diff_Sex', 
                    'average_abs_odds_diff_Race', 'average_abs_odds_diff_Sex', 'Disparate_Impact_Race', 'Disparate_Impact_Sex', 'Theil_Index_Race', 'Theil_Index_Sex']

# Fairness columns are emitted metric by metric, race before sex, matching the
# order of the *_Race / *_Sex names in results_pre_names.
fairness_metric_names = ['statistical_parity_difference', 'equal_opportunity_difference',
                         'average_abs_odds_difference', 'disparate_impact', 'theil_index']
fairness_attributes = ['race_class', 'sex']

# The held-out split never changes, so extract it once instead of on every fit.
X_test = data_orig_test.features
y_test = data_orig_test.labels.ravel()

num_repetitions = 100
# Rows are gathered in a plain list and turned into a DataFrame once at the end.
# Growing an empty DataFrame with .loc[len(df)] is slow and leaves every column
# with object dtype; building it in one go gives proper numeric dtypes.
rows_pre_cat = []
for i in range(num_repetitions):
    for classifier in Classification_Models:
        log_reg_fit = classifier.fit(data_orig_train.features,
                                     data_orig_train.labels.ravel(),
                                     sample_weight=data_orig_train.instance_weights)
        y_pred = log_reg_fit.predict(X_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)

        # Probability of the positive class (column 1) drives the AUC.
        y_pred_proba_final = log_reg_fit.predict_proba(X_test)[:, 1]
        AUC = roc_auc_score(y_test, y_pred_proba_final)

        F1_score = metrics.f1_score(y_test, y_pred)
        # For binary labels the confusion matrix is [[TN, FP], [FN, TP]].
        confusion_matrix_pre = metrics.confusion_matrix(y_test, y_pred)
        TN, FP, FN, TP = confusion_matrix_pre.ravel()
        Sensitivity = TP / (TP + FN)
        Specificity = TN / (TN + FP)

        # Look fairness values up by label rather than by position, so the result
        # does not silently depend on the row/column order of the metric table.
        fair_final = get_fair_metrics(data_orig_test, log_reg_fit)
        fairness_values = [fair_final.loc[attr, metric_name]
                           for metric_name in fairness_metric_names
                           for attr in fairness_attributes]

        new_row_pre = ['Compas_Pre_Cat', 0, 0, 'None', i, classifier.__class__.__name__,
                       Accuracy, AUC, F1_score, Sensitivity, Specificity] + fairness_values
        rows_pre_cat.append(new_row_pre)

results_pre_cat = pd.DataFrame(rows_pre_cat, columns=results_pre_names)

In [15]:
# Turn the categorical AIF360 train/test splits back into plain pandas frames:
# one column per feature plus the label column appended last.
cols = list(data_orig_train.feature_names)
local_data_train = pd.DataFrame(data_orig_train.features, columns=cols)
local_data_test = pd.DataFrame(data_orig_test.features, columns=cols)
# `labels` is flattened to 1-D so that it can be stored as a single column.
local_data_train['Reoffending_Likelihood'] = data_orig_train.labels.ravel()
local_data_test['Reoffending_Likelihood'] = data_orig_test.labels.ravel()

In [16]:
# Configuration for the categorical imputation experiment.
cols = list(data_orig_train.feature_names)
Protected_Variables = ["sex", "Reoffending_Likelihood", "race_class"]
# Candidate columns = every feature except the protected attributes and the label.
# The original feature order is preserved on purpose: a set difference yields an
# order that changes between kernel sessions (string hashing is randomised), which
# would make the experiment non-reproducible despite the fixed seed.
cols1 = list(dict.fromkeys(c for c in cols if c not in set(Protected_Variables)))
col2 = local_data_train.columns

# For categorical variables we use only two types of imputation strategies
knn = KNNImputer(n_neighbors=2)
Most_Freq = SimpleImputer(missing_values= nan, strategy="most_frequent" )
imputation_types = [knn, Most_Freq]

LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
# LinearSVC is wrapped in CalibratedClassifierCV so that it exposes predict_proba.
SV_Classifier = LinearSVC(max_iter=1000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)

Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]

# Fractions passed to the worker as `percentage_list`.
percentage_list = [0.05,0.01,0.1]

# Column layout of the results frame; identical to the baseline results layout.
results_df_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Race', 'Statistical_Parity_Sex', 'Equal_Oppr_diff_Race', 'Equal_Oppr_diff_Sex', 
                    'average_abs_odds_diff_Race', 'average_abs_odds_diff_Sex', 'Disparate_Impact_Race', 'Disparate_Impact_Sex', 'Theil_Index_Race', 'Theil_Index_Sex']
results_df5 = pd.DataFrame(columns=results_df_names)

In [17]:
# Run the categorical imputation experiment in parallel: each value in `iterable`
# (1..100) is passed to nested_loop2 as its first positional argument, with all
# other inputs bound up front via functools.partial.
# NOTE(review): `imputation_type2` is not defined in the categorical configuration
# cell; the value passed here is whatever the numeric configuration cell left in the
# kernel, so this cell fails on a fresh kernel unless that cell has been run first.
iterable = list(range(1, 101))
run_categorical_repetition = partial(nested_loop2, cols1 = cols1, col2 = col2, local_data_train = local_data_train, 
                   data_orig_test = data_orig_test, percentage_list = percentage_list, imputation_types = imputation_types, 
                   imputation_type2 = imputation_type2, Classification_Models = Classification_Models, results_df5 = results_df5, privileged_sex=privileged_sex, privileged_race=privileged_race)

# The pool is used as a context manager so that its 15 worker processes are shut
# down once the (blocking) map call has returned, instead of lingering in the kernel.
with multiprocessing.Pool(15) as mypool:
    list_of_results_2 = mypool.map(run_categorical_repetition, iterable)

[    dataset_name num_columns_imputed  percentage_deleted imputation_strategy  \
 0     Compas_Cat                   1                0.05          KNNImputer   
 1     Compas_Cat                   1                0.05          KNNImputer   
 2     Compas_Cat                   1                0.05          KNNImputer   
 3     Compas_Cat                   1                0.05       most_frequent   
 4     Compas_Cat                   1                0.05       most_frequent   
 ..           ...                 ...                 ...                 ...   
 175   Compas_Cat                   5                0.10          KNNImputer   
 176   Compas_Cat                   5                0.10          KNNImputer   
 177   Compas_Cat                   5                0.10       most_frequent   
 178   Compas_Cat                   5                0.10       most_frequent   
 179   Compas_Cat                   5                0.10       most_frequent   
 
     repetition classifica

In [None]:
# Stack the per-repetition categorical results returned by the worker pool into one frame.
# No ignore_index is passed, so each chunk keeps its own row labels.
results_df5 = pd.concat(list_of_results_2)

In [18]:
# Combine the numeric and categorical runs: baseline (pre-imputation) rows in
# Results_Pre, imputation-experiment rows in Results_Post.
# Both inputs of each concat share the same column layout; row labels are kept as-is.
Results_Pre = pd.concat([results_pre, results_pre_cat])
Results_Post = pd.concat([results_df, results_df5])

In [21]:
# Bare expression: display the categorical imputation results as the cell output.
results_df5

Unnamed: 0,dataset_name,num_columns_imputed,percentage_deleted,imputation_strategy,repetition,classification_algorithm,accuracy,auc,F1_score,Sensitivity,...,Statistical_Parity_Race,Statistical_Parity_Sex,Equal_Oppr_diff_Race,Equal_Oppr_diff_Sex,average_abs_odds_diff_Race,average_abs_odds_diff_Sex,Disparate_Impact_Race,Disparate_Impact_Sex,Theil_Index_Race,Theil_Index_Sex


In [19]:
# Persist the baseline and post-imputation results to the working directory.
# to_csv is called with its defaults, so the (non-unique) row index is written
# as the first, unnamed column of each file.
Results_Pre.to_csv('Compas_Pre.csv')
Results_Post.to_csv('Compas_Post.csv')