In [1]:
from helpers import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Run Helper

In [2]:
def run(X, y, X_test, y_test, test_split=0.3):
    """Cross-validate, refit and score every estimator in the global ``models`` list.

    For each estimator this prints two accuracies, labelled with the entry at
    the same position in the global ``names`` list:

    * ``val``  -- mean accuracy over 5-fold cross-validation on ``(X, y)``.
    * ``test`` -- accuracy on ``(X_test, y_test)`` after refitting on all of
      ``(X, y)``.

    Parameters
    ----------
    X, y
        Training features and labels, passed to ``cross_val_score`` and ``fit``.
    X_test, y_test
        Held-out features and labels used only for the final accuracy.
    test_split : float, optional
        Not used by this function; kept so existing calls remain valid.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The estimator objects stored in ``models`` are fitted in place.
    """
    # No random_state here: KFold only uses it when shuffle=True, and newer
    # scikit-learn releases raise ValueError if a seed is given while shuffle
    # is left at its default of False. The folds are the same contiguous,
    # unshuffled splits the seeded call produced.
    kfold = model_selection.KFold(n_splits=5)

    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(
            model, X, y, cv=kfold, scoring='accuracy'
        )

        # Refit on the whole training set before scoring the held-out data;
        # cross_val_score works on clones and leaves `model` unfitted.
        model.fit(X, y)
        prediction_test = model.predict(X_test)

        acc_score_val = np.mean(cv_result)
        acc_score_test = accuracy_score(y_test, prediction_test)
        print('-' * 40)
        print('val: {0}: {1}'.format(names[i], acc_score_val))
        print('test: {0}: {1}'.format(names[i], acc_score_test))

# Models to Run

In [3]:
# `names` and `models` are parallel lists: names[i] is the label that run()
# prints for models[i]. To enable another estimator, move its label and its
# constructor out of the commented lists below, keeping both in the same order.
names = ['LR']
# other labels: 'Random Forest', 'Neural Network', 'GaussianNB', 'DecisionTreeClassifier', 'SVM'

models = [LogisticRegression(solver='lbfgs', max_iter=1000)]
# other estimators, in the same order as the labels above:
# RandomForestClassifier(n_estimators=100)
# MLPClassifier()
# GaussianNB()
# DecisionTreeClassifier()
# SVC()

# Adult Dataset

In [4]:
# Training data: every column except the last is a feature, the last is the label.
# NOTE(review): `pct` is presumably the fraction of rows load_adult kept as
# valid (the message reports 1 - pct as corrupt) -- confirm in helpers.
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult, y_adult = df_adult.iloc[:, :-1], df_adult.iloc[:, -1]
print('percentage of corrupt rows in training: {0:.1f}%'.format((1 - pct) * 100))

# Test data, split into features and label the same way. `pct` is overwritten
# here, so afterwards it refers to the test file.
df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test, y_adult_test = df_adult_test.iloc[:, :-1], df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1 - pct) * 100))

percentage of corrupt rows in training: 7.4%
percentage of corrupt rows in testing: 7.5%


In [5]:
# X_age = pd.get_dummies(X_adult['age'])
# X_age.head()

In [6]:
# X_age = X_age.join(y_adult)
# X_age.head()

In [7]:
# ll = {}
# ll_ptr = []
# for _, col in enumerate(X_age):
#     if col != 'salary':
#         n_bad = len(X_age[(X_age[col] == 1) & (X_age['salary'] == 0)])
#         n_good = len(X_age[(X_age[col] == 1) & (X_age['salary'] == 1)])
#         ll[col] = {'bad': n_bad, 'good': n_good}
        
#         ll_ptr.append(n_bad/(n_bad + n_good))

# plt.hist(ll_ptr, bins=None, density=True, normed=None, histtype='bar')
# plt.show()

# Expanding (one-hot encoding with pd.get_dummies)

In [8]:
# X_expanded = pd.get_dummies(X_adult)
# X_expanded_test = pd.get_dummies(X_adult_test)
# run(X_expanded, y_adult, X_expanded_test, y_adult_test)

# LabelEncoder

In [9]:
# One independent LabelEncoder per column named below, keyed by column name.
encoders = {
    column: preprocessing.LabelEncoder()
    for column in (
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "native-country",
    )
}

# The same encoder objects are passed for both splits.
# NOTE(review): whether encode() refits them on the second call is not visible
# here -- confirm in helpers that train and test share one label mapping.
X_encoded = encode(X_adult, encoders)
X_encoded_test = encode(X_adult_test, encoders)

In [10]:
# Evaluate the configured models on the label-encoded Adult data.
run(X=X_encoded, y=y_adult, X_test=X_encoded_test, y_test=y_adult_test)

----------------------------------------
val: LR: 0.7845302702949868
test: LR: 0.7835989375830014


# German Dataset

In [11]:
# Load the German dataset; the last column is the label, the rest are features.
df_german = load_german('datasets/german/german.data-numeric')
X_german, y_german = df_german.iloc[:, :-1], df_german.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_german,
    y_german,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Evaluate the configured models on the German train/test split.
run(X=X_train, y=y_train, X_test=X_test, y_test=y_test)

----------------------------------------
val: LR: 0.77
test: LR: 0.765


# Discrimination

In [13]:
# SA = 'Age'
# s = 'Young'
# positive = 0

# X_german_s = X_german.copy()
# X_german_s[X_german_s['Age'] <= 25] = 1 # protected
# X_german_s[X_german_s['Age'] > 25] = 0 # not protected

# conf_aged = 0
# conf_young = 0
# for index, p in enumerate(y_german):
#     if p == positive:
#         if X_german_s.loc[index, 'Age'] == 0:
#             conf_aged += 1
#         else: # X_german_s.loc[index, 'Age'] == 1 (i.e. protected)
#             conf_young += 1

# conf_aged /= len(X_german_s[X_german_s['Age'] == 0])
# conf_young /= len(X_german_s[X_german_s['Age'] == 1])
# print(conf_aged, conf_young)
# print('DISC = {0:.4f}'.format(conf_aged - conf_young))

In [14]:
# SA = 'Age'
# s = 'Young'
# positive = 0

# X_german_s = X_german.copy()
# X_german_s[X_german_s['Age'] <= 25] = 1 # protected
# X_german_s[X_german_s['Age'] > 25] = 0 # not protected

# X_train, X_test, y_train, y_test = train_test_split(X_german_s, y_german, test_size=0.3, random_state=42)

# lr = LogisticRegression()
# lr.fit(X_train, y_train)
# pred = lr.predict(X_test)

# conf_aged = 0
# conf_young = 0
# for index, p in enumerate(pred):
#     if p == positive:
#         if X_german_s.loc[index, 'Age'] == 0:
#             conf_aged += 1
#         else: # X_german_s.loc[index, 'Age'] == 1 (i.e. protected)
#             conf_young += 1

# conf_aged /= len(X_german_s[X_german_s['Age'] == 0])
# conf_young /= len(X_german_s[X_german_s['Age'] == 1])
# print(conf_aged, conf_young)
# print('DISC = {0:.4f}'.format(conf_aged - conf_young))