In [1]:
from helpers import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Compute discrimination

In [2]:
def computeDiscrimination(X_test, prediction, sensitiveAttr):
    X_test = X_test.assign(prediction = prediction)
    protectedGroup = X_test[X_test[sensitiveAttr] == 1]
    unprotectedGroup = X_test[X_test[sensitiveAttr] == 0]
    proportionOfProtected = protectedGroup['prediction'].sum() / protectedGroup[sensitiveAttr].count()
    proportionOfUnprotected = unprotectedGroup['prediction'].sum() / protectedGroup[sensitiveAttr].count()
    discrim = abs(proportionOfProtected - proportionOfUnprotected)
#     discrim = abs(proportionOfUnprotected - proportionOfProtected)
    return discrim
    

# Run Helper

In [3]:
def run(X, y, X_test, y_test, SA, test_split=0.3):
    kfold = model_selection.KFold(n_splits=5, random_state=7)
    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

        model.fit(X, y)
        prediction_test = model.predict(X_test)

        acc_score_val = np.mean(cv_result)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score = computeDiscrimination(X_test, prediction_test, SA)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], acc_score_val))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0}: {1}'.format(names[i], discr_score))
        

# Models to Run

In [4]:
models = []
names = ['LR', 'Random Forest', 'Neural Network', 'GaussianNB', 'DecisionTreeClassifier', 'SVM']
# names = ['LR', 'DecisionTreeClassifier']

models.append((LogisticRegression(C=1000000)))
models.append((RandomForestClassifier(n_estimators=100)))
models.append((MLPClassifier()))
models.append((GaussianNB()))
models.append((DecisionTreeClassifier()))
models.append((SVC()))

# Adult Dataset

In [21]:
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test = df_adult_test.iloc[:, :-1]
y_adult_test = df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

percentage of corrupt rows: 7.4%
percentage of corrupt rows in testing: 7.5%


# expanding

In [None]:
# X_expand = pd.get_dummies(X_adult)
# run(X_expand, y_adult)

# LabelEncoder

In [22]:
encoders = {"workclass": preprocessing.LabelEncoder(), 
            "education": preprocessing.LabelEncoder(), 
            "marital-status": preprocessing.LabelEncoder(), 
            "occupation": preprocessing.LabelEncoder(), 
            "relationship": preprocessing.LabelEncoder(), 
            "race": preprocessing.LabelEncoder(), 
            "sex": preprocessing.LabelEncoder(), 
            "native-country": preprocessing.LabelEncoder()}

X_encoded = encode(X_adult, encoders)
X_encoded_test = encode(X_adult_test, encoders)

In [23]:
run(X_encoded, y_adult, X_encoded_test, y_adult_test, 'sex')



----------------------------------------
val: LR: 0.7886746347783917
test: LR: 0.7847941567065073
LR: 0.21717891308772644
----------------------------------------
val: Random Forest: 0.8534582508405361
test: Random Forest: 0.849003984063745
Random Forest: 0.4600040708324852




----------------------------------------
val: Neural Network: 0.7793241284341955
test: Neural Network: 0.7890438247011953
Neural Network: 0.37879096275188273
----------------------------------------
val: GaussianNB: 0.7885751048279556
test: GaussianNB: 0.7885790172642763
GaussianNB: 0.20435579075921023
----------------------------------------
val: DecisionTreeClassifier: 0.8041911507047226
test: DecisionTreeClassifier: 0.80199203187251
DecisionTreeClassifier: 0.532871972318339




----------------------------------------
val: SVM: 0.751176805641474
test: SVM: 0.7547144754316069
SVM: 0.004477915733767555


# German Dataset

In [19]:
# load german dataset
df_german = load_german('datasets/german/german.data-numeric')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

X_german.loc[X_german['Age'] <= 25, 'Age'] = 1
X_german.loc[X_german['Age'] > 25, 'Age'] = 0

# X_train, X_test, y_train, y_test = train_test_split(X_german, y_german, test_size=1, random_state=0)
X_train, X_test, y_train, y_test = X_german, X_german, y_german, y_german

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
X_german.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Age,...,14,15,16,17,18,19,20,21,22,23
0,1,6,4,12,5,5,3,4,1,0,...,1,0,0,1,0,0,1,0,0,1
1,2,48,2,60,1,3,2,2,1,1,...,1,0,0,1,0,0,1,0,0,1
2,4,12,4,21,1,4,3,3,1,0,...,1,0,0,1,0,0,1,0,1,0
3,1,42,2,79,1,4,3,4,2,0,...,1,0,0,0,0,0,0,0,0,1
4,1,24,3,49,1,3,3,4,4,0,...,1,1,0,1,0,0,0,0,0,1


In [20]:
run(X_train, y_train, X_test, y_test, 'Age')



----------------------------------------
val: LR: 0.764
test: LR: 0.783
LR: 0.46842105263157896
----------------------------------------
val: Random Forest: 0.765
test: Random Forest: 1.0
Random Forest: 0.736842105263158




----------------------------------------
val: Neural Network: 0.779
test: Neural Network: 0.847
Neural Network: 0.5631578947368421
----------------------------------------
val: GaussianNB: 0.7190000000000001
test: GaussianNB: 0.74
GaussianNB: 0.5157894736842105
----------------------------------------
val: DecisionTreeClassifier: 0.689
test: DecisionTreeClassifier: 1.0
DecisionTreeClassifier: 0.736842105263158
----------------------------------------
val: SVM: 0.741
test: SVM: 0.9
SVM: 0.5157894736842106




In [None]:
# X_german[X_german['Age'] <= 25 & X_german['gender'] == 'Female'] = 1 # protected
X_german['Age'] <= 25 # protected
# print(X_german.head())
# X_german[X_german['Age'] <= 25] = 1 # protected
# X_german[X_german['Age'] > 25] = 0 # not protected
# run(X_german, y_german, 'Age')