In [1]:
from helpers import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from CustomLogisticRegression import CustomLogisticRegression as CLR
import sklearn.discriminant_analysis as DA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# Compute discrimination

In [2]:
def computeDiscrimination(X_test, prediction, sensitiveAttr):
    X_test = X_test.assign(prediction = prediction)
    protectedGroup = X_test[X_test[sensitiveAttr] == 1]
    unprotectedGroup = X_test[X_test[sensitiveAttr] == 0]
    proportionOfProtected = protectedGroup['prediction'].sum() / protectedGroup[sensitiveAttr].count()
    proportionOfUnprotected = unprotectedGroup['prediction'].sum() / protectedGroup[sensitiveAttr].count()
    discrim = abs(proportionOfProtected - proportionOfUnprotected)
    print(protectedGroup['prediction'].sum())
    print(protectedGroup[sensitiveAttr].count())
    print(unprotectedGroup['prediction'].sum())
    print(protectedGroup[sensitiveAttr].count())
    return discrim
    

# Run Helper

In [3]:
def run(X, y, X_test, y_test, SA):
#     kfold = model_selection.KFold(n_splits=5, random_state=7)
    kfold = model_selection.KFold(n_splits=5)
    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

        model.fit(X, y)
        prediction_test = model.predict(X_test)

        acc_score_val = np.mean(cv_result)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score = computeDiscrimination(X_test, prediction_test, SA)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], acc_score_val))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0} discrimination: {1}'.format(names[i], discr_score))
        

# Models to Run

In [20]:
models = []
names = ['LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=3000, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=10000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=20000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=30000, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
         'Custom Logistic Regression(lr=0.1, max_itr=100)',
         'Custom Logistic Regression(lr=0.1, max_itr=500)',
         'Custom Logistic Regression(lr=0.1, max_itr=1500)',
         'Custom Logistic Regression(lr=0.01, max_itr=100)',
         'Custom Logistic Regression(lr=0.01, max_itr=500)',
         'Custom Logistic Regression(lr=0.01, max_itr=1500)',
         'Custom Logistic Regression(lr=0.001, max_itr=100)',
         'Custom Logistic Regression(lr=0.001, max_itr=500)',
         'Custom Logistic Regression(lr=0.001, max_itr=1500)',
         'Custom Logistic Regression(lr=0.0001, max_itr=100)',
         'Custom Logistic Regression(lr=0.0001, max_itr=500)',
         'Custom Logistic Regression(lr=0.0001, max_itr=1500)',
#          'Linear Discriminant Analysis',
#          'Quadratic Discriminant Analysis',
#          'Random Forest',
#          'Neural Network',
#          'Gaussian NB',
#          'Bernoulli NB',
#          'Complement NB',
#          'Multinomial NB',
#          'Random Forest',
#          'K Neighbors Classifier',
#          'SVM',
#          'LinearSVC'
]

models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=2100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=500, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=1500, penalty='none'))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=1000, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=2000, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=3000, C=1e10))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=10000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=20000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=30000, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=2100, penalty='none'))
models.append(CLR(0.1, 100))
models.append(CLR(0.1, 500))
models.append(CLR(0.1, 1500))
models.append(CLR(0.01, 100))
models.append(CLR(0.01, 500))
models.append(CLR(0.01, 1500))
models.append(CLR(0.001, 100))
models.append(CLR(0.001, 500))
models.append(CLR(0.001, 1500))
models.append(CLR(0.0001, 100))
models.append(CLR(0.0001, 500))
models.append(CLR(0.0001, 1500))
# models.append(DA.LinearDiscriminantAnalysis())
# models.append(DA.QuadraticDiscriminantAnalysis())
# models.append(RandomForestClassifier(n_estimators=100))
# models.append(MLPClassifier())
# models.append(GaussianNB())
# models.append(BernoulliNB())
# models.append(ComplementNB())
# models.append(MultinomialNB())
# models.append(DecisionTreeClassifier())
# models.append(KNeighborsClassifier(n_neighbors=50))
# models.append(SVC())
# models.append(LinearSVC())

# Adult Dataset

In [26]:
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test = df_adult_test.iloc[:, :-1]
y_adult_test = df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

percentage of corrupt rows: 7.4%
percentage of corrupt rows in testing: 7.5%


In [28]:
X_adult_test.shape

(15060, 14)

# expanding

In [65]:
X_adult_all = X_adult.append(X_adult_test)
X_adult_all_expand = pd.get_dummies(X_adult_all)
X_expand = X_adult_all_expand[0:X_adult.shape[0]]
X_expand_test = X_adult_all_expand[X_adult.shape[0]:]

# LabelEncoder

In [None]:
# encoders = {"workclass": preprocessing.LabelEncoder(), 
#             "education": preprocessing.LabelEncoder(), 
#             "marital-status": preprocessing.LabelEncoder(), 
#             "occupation": preprocessing.LabelEncoder(), 
#             "relationship": preprocessing.LabelEncoder(), 
#             "race": preprocessing.LabelEncoder(), 
#             "sex": preprocessing.LabelEncoder(), 
#             "native-country": preprocessing.LabelEncoder()}

# X_encoded = encode(X_adult, encoders)
# X_encoded_test = encode(X_adult_test, encoders)

In [66]:
run(X_expand, y_adult, X_expand_test, y_adult_test, 'sex')

295
4913
1106
4913
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7905974424045293
test: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7926294820717131
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination: 0.16507225727661307
295
4913
1106
4913
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7905974424045293
test: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7926294820717131
LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination: 0.16507225727661307
295
4913
1106
4913
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.7905974424045293
test: LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.792629482071713



385
4913
2661
4913
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10): 0.8479546073079055
test: LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10): 0.849003984063745
LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10) discrimination: 0.46326073682067986






392
4913
2673
4913
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10): 0.8472583757943161
test: LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10): 0.8476095617529881
LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10) discrimination: 0.46427844494199066






392
4913
2673
4913
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10): 0.8472583757943161
test: LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10): 0.8476095617529881
LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10) discrimination: 0.46427844494199066
295
4913
1096
4913
----------------------------------------
val: LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10): 0.7903322233902748
test: LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10): 0.7930278884462152
LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10) discrimination: 0.16303684103399146
295
4913
1096
4913
----------------------------------------
val: LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10): 0.7903322233902748
test: LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10): 0.7930278884462152
LR(solver=liblinear, fit_intercep

KeyboardInterrupt: 

# German Dataset

In [22]:
# load german dataset
df_german = load_german('datasets/german/german.data')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

X_german.loc[X_german['Age'] <= 25, 'Age'] = 1
X_german.loc[X_german['Age'] > 25, 'Age'] = 0

# One hot encoder
X_german_encoded = pd.get_dummies(X_german)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_german_encoded, y_german, test_size=0.3, shuffle=False)
# X_train, X_test, y_train, y_test = train_test_split(X_german, y_german, test_size=0.3, random_state=42)

In [24]:
run(X_train, y_train, X_test, y_test, 'Age')

25
58
55
58
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7385714285714287
test: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7566666666666667
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination: 0.5172413793103449
25
58
55
58
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7385714285714287
test: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7566666666666667
LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination: 0.5172413793103449
25
58
55
58
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.7385714285714287
test: LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.7566666666666667
LR(solver=lbfgs, fit_



27
58
60
58
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10): 0.7542857142857142
test: LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10): 0.7666666666666667
LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10) discrimination: 0.5689655172413794




27
58
60
58
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10): 0.7542857142857142
test: LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10): 0.7666666666666667
LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10) discrimination: 0.5689655172413794




27
58
60
58
----------------------------------------
val: LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10): 0.7542857142857142
test: LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10): 0.7666666666666667
LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10) discrimination: 0.5689655172413794
29
58
55
58
----------------------------------------
val: LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10): 0.7457142857142858
test: LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10): 0.7633333333333333
LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10) discrimination: 0.4482758620689655
29
58
55
58
----------------------------------------
val: LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10): 0.7457142857142858
test: LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10): 0.7633333333333333
LR(solver=liblinear, fit_intercept=False, max_iter=2000,



0
58
0
58
----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7042857142857144
test: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.69
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination: 0.0




0
58
0
58
----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7042857142857144
test: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.69
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination: 0.0


  return 1.0 / (1 + np.exp(-a))


0
58
0
58
----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.7042857142857144
test: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.69
LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10) discrimination: 0.0
0.0
58
0.0
58
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=100): 0.39285714285714285
test: Custom Logistic Regression(lr=0.1, max_itr=100): 0.69
Custom Logistic Regression(lr=0.1, max_itr=100) discrimination: 0.0
58.0
58
242.0
58
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=500): 0.45
test: Custom Logistic Regression(lr=0.1, max_itr=500): 0.31
Custom Logistic Regression(lr=0.1, max_itr=500) discrimination: 3.1724137931034484
58.0
58
242.0
58
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=1500): 0.5471428571428573
test: Custom Logistic