In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import os
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer 
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from autoimpute.imputations import SingleImputer
from autoimpute.imputations import MultipleImputer
from numpy import nan
from numpy import isnan
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.datasets import StandardDataset
from aif360.datasets import BinaryLabelDataset
from aif360.explainers import MetricTextExplainer
from aif360.metrics import Metric, DatasetMetric, utils
from sklearn import preprocessing
from typing import List, Union, Dict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import warnings
from multiprocessing import Pool
from functools import partial
import multiprocessing
import german_loop_script
from german_loop_script import nested_loop
from german_loop_script import nested_loop2
warnings.filterwarnings('ignore')



In [3]:
# Load the German credit data, store the two sensitive attributes as
# pandas categoricals, and keep every column that is not of object dtype.
Germandata = pd.read_csv('germandata.csv').astype(
    {"sex": "category", "Age_Metric": "category"}
)
Germandata_Numerical = Germandata.select_dtypes(exclude=['object'])

In [4]:
# Label-encode the sensitive attributes on a copy of the numerical frame.
categorical_features = ['sex', 'Age_Metric']
data_encoded = Germandata_Numerical.copy()

# Per-column lookups filled by the loop below:
#   categorical_names[col] -> the encoder's class labels (position == encoded value)
#   encoders[col]          -> the fitted LabelEncoder itself
categorical_names = {}
encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one call
    data_encoded[feature] = le.fit_transform(data_encoded[feature])
    encoders[feature] = le
    categorical_names[feature] = le.classes_

# Alias (same object, not a copy) used when the AIF360 dataset is built
data_perp_sex = data_encoded

In [5]:
# Wrap the encoded frame in an AIF360 StandardDataset.
# The privileged classes are the encoded values of 'male' and 'Adult', i.e.
# the positions of those labels in the LabelEncoder class arrays.
privileged_sex = np.flatnonzero(categorical_names['sex'] == 'male')
privileged_age = np.flatnonzero(categorical_names['Age_Metric'] == 'Adult')
data_standard = StandardDataset(
    data_perp_sex,
    label_name='credit',
    favorable_classes=[1],
    protected_attribute_names=['sex', 'Age_Metric'],
    privileged_classes=[privileged_sex, privileged_age],
)

In [6]:
def fair_metrics(dataset, pred, pred_is_dataset=False):
    """Compute fairness metrics for every protected attribute of a dataset.

    Parameters
    ----------
    dataset
        AIF360 dataset holding the reference labels.
    pred
        Predicted labels. When ``pred_is_dataset`` is True this is already a
        dataset carrying the predictions and is used as-is; otherwise a copy
        of ``dataset`` is made and its ``labels`` are replaced by ``pred``.
    pred_is_dataset : bool, default False
        Selects how ``pred`` is interpreted (see above).

    Returns
    -------
    pandas.DataFrame
        One column per metric (statistical parity difference, equal
        opportunity difference, average absolute odds difference, disparate
        impact, Theil index). The first row is labelled ``'objective'`` and
        holds the reference values ``0, 0, 0, 1, 0``; it is followed by one
        row per protected attribute, in the order of
        ``protected_attribute_names``. Infinite values are replaced by 2.
    """
    if pred_is_dataset:
        dataset_pred = pred
    else:
        dataset_pred = dataset.copy()
        dataset_pred.labels = pred

    cols = ['statistical_parity_difference', 'equal_opportunity_difference', 'average_abs_odds_difference',  'disparate_impact', 'theil_index']

    # Rows are collected in a list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = [pd.DataFrame(data=[[0, 0, 0, 1, 0]], index=['objective'], columns=cols)]

    for idx, attr in enumerate(dataset_pred.protected_attribute_names):
        # The first listed privileged / unprivileged value defines each group.
        privileged_groups = [{attr: dataset_pred.privileged_protected_attributes[idx][0]}]
        unprivileged_groups = [{attr: dataset_pred.unprivileged_protected_attributes[idx][0]}]

        # ClassificationMetric compares reference and predicted labels
        # (equal opportunity, average odds, Theil index);
        # BinaryLabelDatasetMetric looks at the predictions alone
        # (mean difference, disparate impact).
        classified_metric = ClassificationMetric(dataset,
                                                 dataset_pred,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
        metric_pred = BinaryLabelDatasetMetric(dataset_pred,
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)

        rows.append(pd.DataFrame([[metric_pred.mean_difference(),
                                   classified_metric.equal_opportunity_difference(),
                                   classified_metric.average_abs_odds_difference(),
                                   metric_pred.disparate_impact(),
                                   classified_metric.theil_index()]],
                                 columns=cols,
                                 index=[attr]))

    result = pd.concat(rows)

    # Ratios can come out infinite; cap them at 2.
    return result.replace([-np.inf, np.inf], 2)

In [7]:
# Fairness metrics for the predictions of a fitted classification model
def get_fair_metrics(data, model, plot=True, model_aif=False):
    """Predict on ``data`` with ``model`` and return the fairness metric table.

    ``model_aif`` selects the prediction call: when True the model is given
    the dataset itself and the ``labels`` of its output are used; otherwise
    it is given ``data.features`` and its return value is used directly.
    ``plot`` is accepted but not used by this function.
    The result is whatever ``fair_metrics(data, pred)`` returns.
    """
    if model_aif:
        pred = model.predict(data).labels
    else:
        pred = model.predict(data.features)
    return fair_metrics(data, pred)

In [8]:
# Baseline on the original numerical dataset (before any imputation):
# three classifiers are each fitted num_repetitions times, and every fit
# contributes one row of performance and fairness metrics to results_pre.
np.random.seed(123)
# The train/test split is drawn once, so every repetition shares it.
data_orig_train, data_orig_test = data_standard.split([0.7], shuffle=True)
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
SV_Classifier = LinearSVC(max_iter=10000)
# predict_proba is called below for the AUC; LinearSVC does not provide it,
# so the SVM is wrapped in a calibrated classifier.
SV_Classifier = CalibratedClassifierCV(SV_Classifier)
Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]
results_pre_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Sex', 'Statistical_Parity_Age', 'Equal_Oppr_diff_Sex', 'Equal_Oppr_diff_Age', 
                    'average_abs_odds_diff_Sex', 'average_abs_odds_diff_Age', 'Disparate_Impact_Sex', 'Disparate_Impact_Age', 'Theil_Index_Sex', 'Theil_Index_Age']

num_repetitions = 100

# Loop-invariant test inputs
X_test_credit = data_orig_test.features
y_test_credit = data_orig_test.labels.ravel()

# Records are collected in a list and turned into a DataFrame once, instead
# of growing the frame one row at a time.
pre_rows = []
for i in range(num_repetitions):
    for classifier in Classification_Models:
        log_reg_fit = classifier.fit(data_orig_train.features,
                                     data_orig_train.labels.ravel(),
                                     sample_weight=data_orig_train.instance_weights)
        Germancredit_pred = log_reg_fit.predict(X_test_credit)
        Germancredit_pred_proba_final = log_reg_fit.predict_proba(X_test_credit)[:, 1]

        # Rows of the confusion matrix are true classes, columns are predictions
        confusion_matrix_pre = metrics.confusion_matrix(y_test_credit, Germancredit_pred)
        TN, FP, FN, TP = confusion_matrix_pre.ravel()

        # Fairness rows are read by label rather than by position
        fair_final = get_fair_metrics(data_orig_test, log_reg_fit)
        fair_sex = fair_final.loc['sex']
        fair_age = fair_final.loc['Age_Metric']

        pre_rows.append({
            'dataset_name': 'German_Pre',
            'num_columns_imputed': 0,
            'percentage_deleted': 0,
            'imputation_strategy': 'None',
            'repetition': i,
            'classification_algorithm': classifier.__class__.__name__,
            'accuracy': metrics.accuracy_score(y_test_credit, Germancredit_pred),
            'auc': roc_auc_score(y_test_credit, Germancredit_pred_proba_final),
            'F1_score': metrics.f1_score(y_test_credit, Germancredit_pred),
            'Sensitivity': TP / (TP + FN),
            'Specificity': TN / (TN + FP),
            'Statistical_Parity_Sex': fair_sex['statistical_parity_difference'],
            'Statistical_Parity_Age': fair_age['statistical_parity_difference'],
            'Equal_Oppr_diff_Sex': fair_sex['equal_opportunity_difference'],
            'Equal_Oppr_diff_Age': fair_age['equal_opportunity_difference'],
            'average_abs_odds_diff_Sex': fair_sex['average_abs_odds_difference'],
            'average_abs_odds_diff_Age': fair_age['average_abs_odds_difference'],
            'Disparate_Impact_Sex': fair_sex['disparate_impact'],
            'Disparate_Impact_Age': fair_age['disparate_impact'],
            'Theil_Index_Sex': fair_sex['theil_index'],
            'Theil_Index_Age': fair_age['theil_index'],
        })

results_pre = pd.DataFrame(pre_rows, columns=results_pre_names)

In [9]:
# Convert the AIF360 train/test splits back to DataFrames (features plus the
# 'credit' label) so that they can be used for the imputations.
cols = list(data_orig_train.feature_names)
local_data_train = pd.DataFrame(data=data_orig_train.features, columns=cols)
local_data_test = pd.DataFrame(data=data_orig_test.features, columns=cols)
local_data_train['credit'] = data_orig_train.labels.ravel()
local_data_test['credit'] = data_orig_test.labels.ravel()

# Feature columns minus the protected/target ones. The filter keeps the
# original column order; a set difference would give an order that changes
# between kernel sessions.
Protected_Variables = ["sex", "age", "credit", "Age_Metric"]
cols1 = [c for c in cols if c not in Protected_Variables]
col2 = local_data_train.columns

# Imputation strategies used on the numerical variables.
# (The "most_frequent" and "pmm" strategies are not used here.)
Simple_Mean = SimpleImputer(missing_values=nan, strategy='mean')
knn = KNNImputer(n_neighbors=2)
Simple_Median = SimpleImputer(missing_values=nan, strategy='median')
Iterative = IterativeImputer(BayesianRidge())
linear_Auto = SingleImputer(strategy="interpolate")
lm_ft = SingleImputer(strategy="least squares")
stoch_ft = SingleImputer(strategy="stochastic")
norm_ft = SingleImputer(strategy="norm")

imputation_types = [Simple_Mean, knn, Simple_Median, Iterative, linear_Auto, lm_ft, stoch_ft, norm_ft]
# The autoimpute-based imputers, listed separately
imputation_type2 = [linear_Auto, lm_ft, stoch_ft, norm_ft]

# Fresh, unfitted classifiers for the imputation experiment
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
SV_Classifier = LinearSVC(max_iter=10000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)

Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]

# Fractions of the data to delete at random
percentage_list = [0.05, 0.01, 0.1]

results_df_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Sex', 'Statistical_Parity_Age', 'Equal_Oppr_diff_Sex', 'Equal_Oppr_diff_Age', 
                    'average_abs_odds_diff_Sex', 'average_abs_odds_diff_Age', 'Disparate_Impact_Sex', 'Disparate_Impact_Age', 'Theil_Index_Sex', 'Theil_Index_Age']
# Empty result frame handed to the worker function
results_df = pd.DataFrame(columns=results_df_names)

In [None]:
# Run nested_loop in parallel over 10 worker processes.
# Each value of `iterable` is passed as the first positional argument of
# nested_loop (presumably the repetition number - confirm in
# german_loop_script); every other argument is bound once with partial.
iterable = list(range(1, 101))
run_nested_loop = partial(nested_loop, cols1 = cols1, col2 = col2, local_data_train = local_data_train,
                          data_orig_test = data_orig_test, percentage_list = percentage_list, imputation_types = imputation_types,
                          imputation_type2 = imputation_type2, Classification_Models = Classification_Models, results_df = results_df,
                          privileged_sex=privileged_sex, privileged_age=privileged_age)

# The context manager shuts the worker processes down once map() has
# returned, so re-running the cell does not leak processes.
with multiprocessing.Pool(10) as mypool:
    list_of_results = mypool.map(run_nested_loop, iterable)

In [None]:
# Stack the per-task result frames row-wise; each frame keeps its own index.
results_df = pd.concat(list_of_results, axis=0, ignore_index=False)

In [10]:
# Same pipeline as above, now for the categorical variables: drop the int64
# columns, then attach the 'credit' and 'age' columns from the full frame.
Germandata_Categorical = Germandata.select_dtypes(exclude=['int64']).assign(
    credit=Germandata["credit"],
    age=Germandata["age"],
)

In [11]:
# Label-encode every categorical column on a copy of the categorical frame.
categorical_features = ['sex', 'Age_Metric', 'status', 'credit_history', 'purpose', 'savings', 'employment', 'other_debtors', 'property', 'installment_plans', 'housing', 'skill_level', 'telephone', 'foreign_worker']
data_encoded = Germandata_Categorical.copy()

# Per-column lookups filled by the loop below:
#   categorical_names[col] -> the encoder's class labels (position == encoded value)
#   encoders[col]          -> the fitted LabelEncoder itself
categorical_names = {}
encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one call
    data_encoded[feature] = le.fit_transform(data_encoded[feature])
    encoders[feature] = le
    categorical_names[feature] = le.classes_

# Alias (same object, not a copy) used when the AIF360 dataset is built
data_perp_sex = data_encoded

In [12]:
# AIF360 dataset for the categorical frame. privileged_sex and privileged_age
# are not recomputed in this cell; they must already exist in the kernel.
data_standard = StandardDataset(
    data_perp_sex,
    label_name='credit',
    favorable_classes=[1],
    protected_attribute_names=['sex', 'Age_Metric'],
    privileged_classes=[privileged_sex, privileged_age],
)

In [13]:
# Baseline on the categorical dataset (before any imputation):
# three classifiers are each fitted num_repetitions times, and every fit
# contributes one row of performance and fairness metrics to results_pre_cat.
np.random.seed(123)
# The train/test split is drawn once, so every repetition shares it.
data_orig_train, data_orig_test = data_standard.split([0.7], shuffle=True)
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
SV_Classifier = LinearSVC(max_iter=10000)
# predict_proba is called below for the AUC; LinearSVC does not provide it,
# so the SVM is wrapped in a calibrated classifier.
SV_Classifier = CalibratedClassifierCV(SV_Classifier)
Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]
results_pre_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Sex', 'Statistical_Parity_Age', 'Equal_Oppr_diff_Sex', 'Equal_Oppr_diff_Age', 
                    'average_abs_odds_diff_Sex', 'average_abs_odds_diff_Age', 'Disparate_Impact_Sex', 'Disparate_Impact_Age', 'Theil_Index_Sex', 'Theil_Index_Age']

num_repetitions = 100

# Loop-invariant test inputs
X_test_credit = data_orig_test.features
y_test_credit = data_orig_test.labels.ravel()

# Records are collected in a list and turned into a DataFrame once, instead
# of growing the frame one row at a time.
pre_cat_rows = []
for i in range(num_repetitions):
    print('repetition: ', i)
    for classifier in Classification_Models:
        log_reg_fit = classifier.fit(data_orig_train.features,
                                     data_orig_train.labels.ravel(),
                                     sample_weight=data_orig_train.instance_weights)
        Germancredit_pred = log_reg_fit.predict(X_test_credit)
        Germancredit_pred_proba_final = log_reg_fit.predict_proba(X_test_credit)[:, 1]

        # Rows of the confusion matrix are true classes, columns are predictions
        confusion_matrix_pre = metrics.confusion_matrix(y_test_credit, Germancredit_pred)
        TN, FP, FN, TP = confusion_matrix_pre.ravel()

        # Fairness rows are read by label rather than by position
        fair_final = get_fair_metrics(data_orig_test, log_reg_fit)
        fair_sex = fair_final.loc['sex']
        fair_age = fair_final.loc['Age_Metric']

        pre_cat_rows.append({
            'dataset_name': 'German_Pre_Cat',
            'num_columns_imputed': 0,
            'percentage_deleted': 0,
            'imputation_strategy': 'None',
            'repetition': i,
            'classification_algorithm': classifier.__class__.__name__,
            'accuracy': metrics.accuracy_score(y_test_credit, Germancredit_pred),
            'auc': roc_auc_score(y_test_credit, Germancredit_pred_proba_final),
            'F1_score': metrics.f1_score(y_test_credit, Germancredit_pred),
            'Sensitivity': TP / (TP + FN),
            'Specificity': TN / (TN + FP),
            'Statistical_Parity_Sex': fair_sex['statistical_parity_difference'],
            'Statistical_Parity_Age': fair_age['statistical_parity_difference'],
            'Equal_Oppr_diff_Sex': fair_sex['equal_opportunity_difference'],
            'Equal_Oppr_diff_Age': fair_age['equal_opportunity_difference'],
            'average_abs_odds_diff_Sex': fair_sex['average_abs_odds_difference'],
            'average_abs_odds_diff_Age': fair_age['average_abs_odds_difference'],
            'Disparate_Impact_Sex': fair_sex['disparate_impact'],
            'Disparate_Impact_Age': fair_age['disparate_impact'],
            'Theil_Index_Sex': fair_sex['theil_index'],
            'Theil_Index_Age': fair_age['theil_index'],
        })

results_pre_cat = pd.DataFrame(pre_cat_rows, columns=results_pre_names)

In [14]:
# Rebuild plain DataFrames from the AIF360 splits: one column per feature,
# plus the label stored under 'credit'.
cols = list(data_orig_train.feature_names)
local_data_train = pd.DataFrame(data_orig_train.features, columns=cols).assign(
    credit=data_orig_train.labels.ravel()
)
local_data_test = pd.DataFrame(data_orig_test.features, columns=cols).assign(
    credit=data_orig_test.labels.ravel()
)

In [15]:
cols = list(data_orig_train.feature_names)
# Feature columns minus the protected/target ones. The filter keeps the
# original column order; a set difference would give an order that changes
# between kernel sessions.
Protected_Variables = ["sex", "age", "credit", "Age_Metric"]
cols1 = [c for c in cols if c not in Protected_Variables]
col2 = local_data_train.columns

# For categorical variables only two imputation strategies are used
knn = KNNImputer(n_neighbors=2)
Most_Freq = SimpleImputer(missing_values=nan, strategy="most_frequent")
imputation_types = [knn, Most_Freq]

# Fresh, unfitted classifiers for the imputation experiment
LR_Classifier = LogisticRegression()
RF_Classifier = RandomForestClassifier()
SV_Classifier = LinearSVC(max_iter=10000)
SV_Classifier = CalibratedClassifierCV(SV_Classifier)

Classification_Models = [LR_Classifier, RF_Classifier, SV_Classifier]

# Fractions of the data to delete at random
percentage_list = [0.05, 0.01, 0.1]

results_df_names = ['dataset_name', 'num_columns_imputed', 'percentage_deleted', 'imputation_strategy', 'repetition', 
                    'classification_algorithm', 'accuracy', 'auc', 'F1_score', 'Sensitivity', 'Specificity', 
                    'Statistical_Parity_Sex', 'Statistical_Parity_Age', 'Equal_Oppr_diff_Sex', 'Equal_Oppr_diff_Age', 
                    'average_abs_odds_diff_Sex', 'average_abs_odds_diff_Age', 'Disparate_Impact_Sex', 'Disparate_Impact_Age', 'Theil_Index_Sex', 'Theil_Index_Age']
# Empty result frame handed to the worker function
results_df5 = pd.DataFrame(columns=results_df_names)

In [None]:
# Run nested_loop2 in parallel over 10 worker processes.
# Each value of `iterable` is passed as the first positional argument of
# nested_loop2 (presumably the repetition number - confirm in
# german_loop_script); every other argument is bound once with partial.
# NOTE(review): imputation_type2 is not defined in the categorical cells;
# this relies on the value left in the kernel by the numerical section.
iterable = list(range(1, 101))
run_nested_loop2 = partial(nested_loop2, cols1 = cols1, col2 = col2, local_data_train = local_data_train,
                           data_orig_test = data_orig_test, percentage_list = percentage_list, imputation_types = imputation_types,
                           imputation_type2 = imputation_type2, Classification_Models = Classification_Models, results_df5 = results_df5,
                           privileged_sex=privileged_sex, privileged_age=privileged_age)

# The context manager shuts the worker processes down once map() has
# returned, so re-running the cell does not leak processes.
with multiprocessing.Pool(10) as mypool:
    list_of_results_categorical = mypool.map(run_nested_loop2, iterable)

In [None]:
# Stack the per-task categorical result frames row-wise; each frame keeps its own index.
results_df5 = pd.concat(list_of_results_categorical, axis=0, ignore_index=False)

In [21]:
# Combine the numerical and categorical runs: baseline results first,
# post-imputation results second. Row indices are kept as they are.
Results_Pre = pd.concat([results_pre, results_pre_cat], axis=0, ignore_index=False)
Results_Post = pd.concat([results_df, results_df5], axis=0, ignore_index=False)

In [22]:
# Write both result tables to CSV files in the working directory
for frame, csv_path in ((Results_Pre, 'German_Pre.csv'), (Results_Post, 'German_Post.csv')):
    frame.to_csv(csv_path)