In [1]:
from helpers import *

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from CustomLogisticRegression import CustomLogisticRegression as CLR
import sklearn.discriminant_analysis as DA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# Compute discrimination

In [11]:
def computeDiscrimination(X, prediction, sensitiveAttr):
#     X = X.assign(prediction = prediction)
    X = np.append(X, prediction.reshape(-1, 1), axis=1)
    protectedGroup = X[X[:, sensitiveAttr] == 1]
    unprotectedGroup = X[X[:, sensitiveAttr] == 0]
    protectedCount = np.count_nonzero(protectedGroup[:, sensitiveAttr] == 1)
    unprotectedCount = X.shape[0] - protectedCount
    proportionOfProtected = protectedGroup[:, -1].sum() / protectedCount
    proportionOfUnprotected = unprotectedGroup[:,-1].sum() /unprotectedCount
    discrim = abs(proportionOfProtected - proportionOfUnprotected)
    return discrim
    

# Run Helper

In [None]:
def run(X, y, X_test, y_test, SA):
#     kfold = model_selection.KFold(n_splits=5, random_state=7)
    kfold = model_selection.StratifiedKFold(n_splits=5)
    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

        model.fit(X, y)
        prediction_test = model.predict(X_test)

        acc_score_val = np.mean(cv_result)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score = computeDiscrimination(X_test, prediction_test, SA)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], acc_score_val))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0} discrimination: {1}'.format(names[i], discr_score))
        

# Runner with the data splitting according to the report

In [None]:
def runAdultWithSplitting(X, y, X_test, y_test, SA):
    X_splits = np.array_split(X, 5)
    y_splits = np.array_split(y, 5)
    for x_split, y_split in zip(X_splits, y_splits):
        val_amount = math.floor((1/3) * x_split.shape[0]) # 1/3 of 1 split set
        X_val = x_split[0:val_amount]
        y_val = y_split[0:val_amount]
        X_train = x_split[val_amount:]
        y_train = y_split[val_amount:]
#         splitSize = x_split.shape[0]
#         X_train = x_split[0:splitSize-val_amount]
#         y_train = y_split[0:splitSize-val_amount]
#         X_val = x_split[splitSize-val_amount:]
#         y_val = y_split[splitSize-val_amount:]
        for i, model in enumerate(models):
            model.fit(X_train, y_train)
            prediction_val = model.predict(X_val)
            acc_score_val = np.mean(y_val == prediction_val)
            prediction_test = model.predict(X_test)
            acc_score_test = accuracy_score(y_test, prediction_test)
            discr_score = computeDiscrimination(X_test, prediction_test, SA)
            print ('-'*40)
            print ('val: {0}: {1}'.format(names[i], acc_score_val))
            print ('test: {0}: {1}'.format(names[i], acc_score_test))
            print('{0} discrimination: {1}'.format(names[i], discr_score))
        print ('-'*80)
        print ('-'*80)

In [15]:
def runGermanWithSplitting(X, y, SA): # SA - sensitive attribute
    X_test = np.empty_like(X[:0])
    y_test = np.empty_like(y[:0])
    val_acc = []
    val_discr = []
    X_splits = np.array_split(X, 5)
    y_splits = np.array_split(y, 5)
    for i, model in enumerate(models):
        valStartPos = 0
        valEndPos = 0
        for x_split, y_split in zip(X_splits, y_splits):
            valEndPos += math.floor(0.2 * x_split.shape[0]) # 20% of 1 split set
            train_amount = math.floor(0.5 * x_split.shape[0]) # 50% of 1 split set (round)
            X_val = x_split[valStartPos:valEndPos] # 20% of the split
            y_val = y_split[valStartPos:valEndPos]
            # the remaining 80% of the split
            remaining_X_subset = np.array(x_split[:valStartPos])
            remaining_X_subset = np.append(remaining_X_subset, x_split[valEndPos:], axis=0)
            remaining_y_subset = np.array(y_split[:valStartPos])
            remaining_y_subset = np.append(remaining_y_subset, y_split[valEndPos:], axis=0)
            X_train = remaining_X_subset[0:train_amount] # 50% of 1 split set (test set)
            y_train = remaining_y_subset[0:train_amount]
            X_test = np.append(X_test, remaining_X_subset[train_amount:], axis=0)
            y_test = np.append(y_test, remaining_y_subset[train_amount:], axis=0)
            model.fit(X_train, y_train)        
            prediction_val = model.predict(X_val)
            acc_score_val = np.mean(y_val == prediction_val)
            val_acc.append(acc_score_val)
            discr_score_val = computeDiscrimination(X_val, prediction_val, SA)
            val_discr.append(discr_score_val)
            # slide the start position of the validation set            
            valStartPos += math.floor(0.2 * x_split.shape[0]) # 20% of 1 split set
        # calculate the mean of accuracy and discrimination based on validation dataset    
        val_acc_score = np.mean(val_acc)
        val_discr_acc_score = np.mean(val_discr)
        
        prediction_test = model.predict(X_test)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score_test = computeDiscrimination(X_test, prediction_test, SA)
        delta = acc_score_test - discr_score_test
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], val_acc_score))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0} discrimination val: {1}'.format(names[i], val_discr_acc_score))
        print('{0} discrimination test: {1}'.format(names[i], discr_score_test))
        print('delta: {0}'.format(delta))
        print ('-'*100)
        print ('-'*100)

# Models to Run

In [4]:
models = []
names = ['LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=2000, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=3000, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
         'Custom Logistic Regression(lr=0.1, max_itr=100)',
         'Custom Logistic Regression(lr=0.1, max_itr=500)',
         'Custom Logistic Regression(lr=0.1, max_itr=1500)',
         'Custom Logistic Regression(lr=0.01, max_itr=100)',
         'Custom Logistic Regression(lr=0.01, max_itr=500)',
         'Custom Logistic Regression(lr=0.01, max_itr=1500)',
         'Custom Logistic Regression(lr=0.001, max_itr=100)',
         'Custom Logistic Regression(lr=0.001, max_itr=500)',
         'Custom Logistic Regression(lr=0.001, max_itr=1500)',
         'Custom Logistic Regression(lr=0.0001, max_itr=100)',
         'Custom Logistic Regression(lr=0.0001, max_itr=500)',
         'Custom Logistic Regression(lr=0.0001, max_itr=1500)',
         'Regularized logistic regression(): '
#          'Linear Discriminant Analysis',
#          'Quadratic Discriminant Analysis',
#          'Random Forest',
#          'Neural Network',
#          'Gaussian NB',
#          'Bernoulli NB',
#          'Complement NB',
#          'Multinomial NB',
#          'Random Forest',
#          'K Neighbors Classifier',
#          'SVM',
#          'LinearSVC'
]

models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=2100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=500, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=1500, penalty='none'))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=1000, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=2000, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=3000, C=1e10))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=1000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=3000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=5000, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=2100, penalty='none'))
models.append(CLR(0.1, 100))
models.append(CLR(0.1, 500))
models.append(CLR(0.1, 1500))
models.append(CLR(0.01, 100))
models.append(CLR(0.01, 500))
models.append(CLR(0.01, 1500))
models.append(CLR(0.001, 100))
models.append(CLR(0.001, 500))
models.append(CLR(0.001, 1500))
models.append(CLR(0.0001, 100))
models.append(CLR(0.0001, 500))
models.append(CLR(0.0001, 1500))
models.append(LogisticRegression())
# models.append(DA.LinearDiscriminantAnalysis())
# models.append(DA.QuadraticDiscriminantAnalysis())
# models.append(RandomForestClassifier(n_estimators=100))
# models.append(MLPClassifier())
# models.append(GaussianNB())
# models.append(BernoulliNB())
# models.append(ComplementNB())
# models.append(MultinomialNB())
# models.append(DecisionTreeClassifier())
# models.append(KNeighborsClassifier(n_neighbors=50))
# models.append(SVC())
# models.append(LinearSVC())

# Adult Dataset

In [None]:
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test = df_adult_test.iloc[:, :-1]
y_adult_test = df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

# expanding

In [None]:
# X_adult['sex'] = X_adult['sex'].map({'Female': 1, 'Male': 0}).astype(int)
# X_adult_test['sex'] = X_adult_test['sex'].map({'Female': 1, 'Male': 0}).astype(int)

X_adult, X_adult_test = encode_adult(X_adult, X_adult_test)

X_adult_all = X_adult.append(X_adult_test)
X_adult_all_expand = pd.get_dummies(X_adult_all)

# X_expand, X_expand_test = encode_adult(X_expand, X_expand_test)
X_adult_expand = X_adult_all_expand[0:X_adult.shape[0]]
X_adult_expand_test = X_adult_all_expand[X_adult.shape[0]:]

# LabelEncoder

In [None]:
# run(X_expand, y_adult, X_expand_test, y_adult_test, 'sex')
runAdultWithSplitting(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 'sex')

# German dataset

In [5]:
# load german dataset
df_german = load_german('datasets/german/german.data')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

# One hot encoder
X_german_encoded = pd.get_dummies(X_german)

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(X_german_encoded, y_german, test_size=0.3, shuffle=False)
# X_train, X_test, y_train, y_test = train_test_split(X_german, y_german, test_size=0.3, random_state=42)
# X_german, X_german_test = encode_german(X_german, X_german_test)
X_german_all_expand = encode_german_all(X_german_encoded)

# X_expand, X_expand_test = encode_adult(X_expand, X_expand_test)

In [16]:
# run(X_train, y_train, X_test, y_test, 'Age')
runGermanWithSplitting(X_german_all_expand, y_german, 12);

----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.655
test: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.7
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.37403285638579753
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.029550827423167836
delta: 0.6704491725768321
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.6549999999999999
test: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.7
LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination val: 0.374032



----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.639
test: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.68
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination val: 0.40058864294158414
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination test: 0.08156028368794332
delta: 0.5984397163120567
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.64
test: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.6833333333333333
LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10) discrimination val: 0.39400387635681755
LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10) discrimination test: 0.08510638297872342
delta: 0.5982269503546099
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10): 0.6408333333333333
test: LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10): 0.68
LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10) discrimination val: 0.388516570869512
LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10) discrimination test: 0.08156028368794332
delta: 0.5984397163120567
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.6415384615384615
test: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.6766666666666666
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.38278598821585247
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.08510638297872342
delta: 0.5915602836879432
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.6421428571428571
test: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.68
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination val: 0.37860666095960216
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination test: 0.08156028368794332
delta: 0.5984397163120567
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.6423333333333333
test: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.68
LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10) discrimination val: 0.37488293723587846
LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10) discrimination test: 0.08156028368794332
delta: 0.5984397163120567
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=100): 0.640625
test: Custom Logistic Regression(lr=0.1, max_itr=100): 0.6833333333333333
Custom Logistic Regression(lr=0.1, max_itr=100) discrimination val: 0.3636100750071338
Custom Logistic Regression(lr=0.1, max_itr=100) disc



# The most similar model

In [None]:
models = []
names = ['CLR(0.01, 100)',
]

models.append(CLR(0.01, 10000))

In [None]:
runAdultWithSplitting(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 'sex')

In [None]:
runGermanWithSplitting(X_german_all_expand, y_german, 12);