In [1]:
from helpers import *

import numpy as np
import pandas as pd
import math
import sys
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
import sklearn.discriminant_analysis as DA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# Compute discrimination

In [2]:
def computeDiscrimination(X, prediction, sensitiveAttr):
#     X = X.assign(prediction = prediction)
    X = np.append(X, prediction.reshape(-1, 1), axis=1)
    protectedGroup = X[X[:, sensitiveAttr] == 1]
    unprotectedGroup = X[X[:, sensitiveAttr] == 0]
    protectedCount = np.count_nonzero(protectedGroup[:, sensitiveAttr] == 1)
    unprotectedCount = X.shape[0] - protectedCount
    proportionOfProtected = protectedGroup[:, -1].sum() / protectedCount
    proportionOfUnprotected = unprotectedGroup[:,-1].sum() /unprotectedCount
    discrim = abs(proportionOfProtected - proportionOfUnprotected)
    return discrim
    

# Run Helper

In [126]:
# used when improving the baseline
def run(X, y, X_test, y_test, SA):
#     kfold = model_selection.KFold(n_splits=5, random_state=7)
    kfold = model_selection.StratifiedKFold(n_splits=5, random_state=42)
    for i, model in enumerate(models):
        cv_result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

        model.fit(X, y)
        prediction_test = model.predict(X_test)

        acc_score_val = np.mean(cv_result)
        acc_score_test = accuracy_score(y_test, prediction_test)
#         discr_score = computeDiscrimination(X_test, prediction_test, SA)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], acc_score_val))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
#         print('{0} discrimination: {1}'.format(names[i], discr_score))

# Runner with the data splitting according to the report

In [155]:
# used for reproducing the baseline
def runAdultWithSplitting(X, y, X_test, y_test, SA, seed):
    val_acc = []
    val_discr = []
    X_splits = np.array_split(X, 5)
    y_splits = np.array_split(y, 5)
    for i, model in enumerate(models):
        valStartPos = 0
        valEndPos = 0
        for x_split, y_split in zip(X_splits, y_splits):
            # shuffle the split
            x_split, y_split = shuffle(x_split, y_split, random_state=seed)
            valEndPos += math.floor((1/3) * x_split.shape[0]) # 1/3 of 1 split set
            X_val = x_split[valStartPos:valEndPos] # 1/3 of 1 split set
            y_val = y_split[valStartPos:valEndPos]
            # the remaining 2/3 of the split
            X_train = np.array(x_split[:valStartPos])
            X_train = np.append(X_train, x_split[valEndPos:], axis=0)
            y_train = np.array(y_split[:valStartPos])
            y_train = np.append(y_train, y_split[valEndPos:], axis=0)
            model.fit(X_train, y_train)
            prediction_val = model.predict(X_val)
            acc_score_val = np.mean(y_val == prediction_val)
            val_acc.append(acc_score_val)
            discr_score_val = computeDiscrimination(X_val, prediction_val, SA)
            val_discr.append(discr_score_val)
            # slide the start position of the validation set            
            valStartPos += math.floor(0.2 * x_split.shape[0]) # 20% of 1 split set
            # check if valStartPos within the boundaries of the split
            if (valStartPos > x_split.shape[0]):
                valStartPos = 0
        # calculate the mean of accuracy and discrimination based on validation dataset
        val_acc_score = np.mean(val_acc)
        val_discr_acc_score = np.mean(val_discr)
        val_delta_score = val_acc_score - val_discr_acc_score
        
        prediction_test = model.predict(X_test)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score_test = computeDiscrimination(X_test, prediction_test, SA)
        delta = abs(acc_score_test - discr_score_test)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], val_acc_score))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0} discrimination val: {1}'.format(names[i], val_discr_acc_score))
        print('val-delta: {0}'.format(val_delta_score))
        print('{0} discrimination test: {1}'.format(names[i], discr_score_test))
        print('test-delta: {0}'.format(delta))
        print ('-'*100)
        print ('-'*100)

In [146]:
# used for reproducing the baseline
def runGermanWithSplitting(X, y, SA, seed): # SA - sensitive attribute
    X_test = np.empty_like(X[:0])
    y_test = np.empty_like(y[:0])
    val_acc = []
    val_discr = []
    X_splits = np.array_split(X, 5)
    y_splits = np.array_split(y, 5)
    for i, model in enumerate(models):
        valStartPos = 0
        valEndPos = 0
        for x_split, y_split in zip(X_splits, y_splits):
            x_split, y_split = shuffle(x_split, y_split, random_state=seed)
            valEndPos += math.floor(0.2 * x_split.shape[0]) # 20% of 1 split set
            train_amount = math.floor(0.5 * x_split.shape[0]) # 50% of 1 split set (round)
            X_val = x_split[valStartPos:valEndPos] # 20% of the split
            y_val = y_split[valStartPos:valEndPos]
            # the remaining 80% of the split
            remaining_X_subset = np.array(x_split[:valStartPos])
            remaining_X_subset = np.append(remaining_X_subset, x_split[valEndPos:], axis=0)
            remaining_y_subset = np.array(y_split[:valStartPos])
            remaining_y_subset = np.append(remaining_y_subset, y_split[valEndPos:], axis=0)
            X_train = remaining_X_subset[0:train_amount] # 50% of 1 split set (test set)
            y_train = remaining_y_subset[0:train_amount]
            X_test = np.append(X_test, remaining_X_subset[train_amount:], axis=0)
            y_test = np.append(y_test, remaining_y_subset[train_amount:], axis=0)
            model.fit(X_train, y_train)        
            prediction_val = model.predict(X_val)
            acc_score_val = np.mean(y_val == prediction_val)
            val_acc.append(acc_score_val)
            discr_score_val = computeDiscrimination(X_val, prediction_val, SA)
            val_discr.append(discr_score_val)
            # slide the start position of the validation set            
            valStartPos += math.floor(0.2 * x_split.shape[0]) # 20% of 1 split set
        # calculate the mean of accuracy and discrimination based on validation dataset    
        val_acc_score = np.mean(val_acc)
        val_discr_acc_score = np.mean(val_discr)
        val_delta_score = val_acc_score - val_discr_acc_score
        
        prediction_test = model.predict(X_test)
        acc_score_test = accuracy_score(y_test, prediction_test)
        discr_score_test = computeDiscrimination(X_test, prediction_test, SA)
        delta = abs(acc_score_test - discr_score_test)
        print ('-'*40)
        print ('val: {0}: {1}'.format(names[i], val_acc_score))
        print ('test: {0}: {1}'.format(names[i], acc_score_test))
        print('{0} discrimination val: {1}'.format(names[i], val_discr_acc_score))
        print('val-delta: {0}'.format(val_delta_score))
        print('{0} discrimination test: {1}'.format(names[i], discr_score_test))
        print('test-delta: {0}'.format(delta))
        print ('-'*100)
        print ('-'*100)

# Models to Run

In [153]:
models = []
names = ['LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=lbfgs, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=100, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=500, penalty=none, C=1e10)',
         'LR(solver=newton-cg, fit_intercept=False, max_iter=1500, penalty=none, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=500, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=1000, C=1e10)',
         'LR(solver=liblinear, fit_intercept=False, max_iter=1500, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10)',
         'LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10)',
         'LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10)',
#          'Linear Discriminant Analysis',
#          'Quadratic Discriminant Analysis',
#          'Random Forest',
#          'Neural Network',
#          'Gaussian NB',
#          'Bernoulli NB',
#          'Complement NB',
#          'Multinomial NB',
#          'Random Forest',
#          'K Neighbors Classifier',
#          'SVM',
#          'LinearSVC'
]

models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='lbfgs', fit_intercept=False, max_iter=2100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=100, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=500, penalty='none'))
models.append(LogisticRegression(solver='newton-cg', fit_intercept=False, max_iter=1500, penalty='none'))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=500, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=1000, C=1e10))
models.append(LogisticRegression(solver='liblinear', fit_intercept=False, max_iter=1500, C=1e10))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=1000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=3000, penalty='none'))
models.append(LogisticRegression(solver='sag', fit_intercept=False, max_iter=5000, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=700, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=1400, penalty='none'))
models.append(LogisticRegression(solver='saga', fit_intercept=False, max_iter=2100, penalty='none'))
# models.append(DA.LinearDiscriminantAnalysis())
# models.append(DA.QuadraticDiscriminantAnalysis())
# models.append(RandomForestClassifier(n_estimators=100))
# models.append(MLPClassifier())
# models.append(GaussianNB())
# models.append(BernoulliNB())
# models.append(ComplementNB())
# models.append(MultinomialNB())
# models.append(DecisionTreeClassifier())
# models.append(KNeighborsClassifier(n_neighbors=50))
# models.append(SVC())
# models.append(LinearSVC())

# Adult Dataset

In [156]:
df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test = df_adult_test.iloc[:, :-1]
y_adult_test = df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

percentage of corrupt rows: 7.4%
percentage of corrupt rows in testing: 7.5%


# expanding

In [157]:
# used for method 2 of improving
# X_adult['sex'] = X_adult['sex'].map({'Female': 1, 'Male': 0}).astype(int)
# X_adult_test['sex'] = X_adult_test['sex'].map({'Female': 1, 'Male': 0}).astype(int)

# used for everything else
X_adult, X_adult_test = encode_adult(X_adult, X_adult_test)

X_adult_all = X_adult.append(X_adult_test)
X_adult_all_expand = pd.get_dummies(X_adult_all)

# X_expand, X_expand_test = encode_adult(X_expand, X_expand_test)
X_adult_expand = X_adult_all_expand[0:X_adult.shape[0]]
X_adult_expand_test = X_adult_all_expand[X_adult.shape[0]:]

In [158]:
# improving
# run(X_expand, y_adult, X_expand_test, y_adult_test, 'sex')

# baseline
runAdultWithSplitting(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 3, 42) #index 3 is Sex attribute

----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.8308163464247915
test: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.8314741035856573
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.18961826980029373
val-delta: 0.6411980766244978
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.21392728198153538
test-delta: 0.617546821604122
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.8308163464247915
test: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.8314741035856573
LR(solver=lbfgs, fit_interc



----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.8308959088315541
test: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.8315405046480744
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination val: 0.18980286513171166
val-delta: 0.6410930436998424
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination test: 0.21423581393411018
test-delta: 0.6173046907139642
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.8309067582506581
test: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.8314741035856573
LR(solver=sag, fit_intercept=False



----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.8309157994332449
test: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.8316069057104913
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.18987791987199804
val-delta: 0.6410378795612468
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.21413726263816063
test-delta: 0.6174696430723308
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.830922903219563
test: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.8315405046480744
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination val: 0.18989233689034377
val-delta: 0.6410305663292192
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination test: 0.21423581393411018
test-delta: 0.6173046907139642
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.8309290598343722
test: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.8315405046480744
LR(solver=saga, fit_intercept

  return 1.0 / (1 + np.exp(-a))


----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=100): 0.8258589037511962
test: Custom Logistic Regression(lr=0.1, max_itr=100): 0.7303452855245683
Custom Logistic Regression(lr=0.1, max_itr=100) discrimination val: 0.1945600353436407
val-delta: 0.6312988684075556
Custom Logistic Regression(lr=0.1, max_itr=100) discrimination test: 0.36967277138420207
test-delta: 0.3606725141403663
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=500): 0.8215913452036552
test: Custom Logistic Regression(lr=0.1, max_itr=500): 0.7178618857901726
Custom Logistic Regression(lr=0.1, max_itr=500) discrimination val: 0.20152099861531245
val-delta: 0.6200703465883427
Custom Logistic Regression(lr=0.1, max_itr=500) discrimination test

# German dataset

In [159]:
# load german dataset
df_german = load_german('datasets/german/german.data')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

# One hot encoder
X_german_encoded = encode_german_all(X_german)
X_german_all_expand = pd.get_dummies(X_german_encoded)

In [161]:
# improving
# run(X_train, y_train, X_test, y_test, 'Age')

# baseline
runGermanWithSplitting(X_german_all_expand, y_german, 12, 0) #index 12 is Age attribute

----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.675
test: LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.68
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.3488076311605724
val-delta: 0.32619236883942765
LR(solver=lbfgs, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.03664302600472813
test-delta: 0.6433569739952719
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.675
test: LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.68
LR(solver=lbfgs, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discr



----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.65
test: LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10): 0.69
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination val: 0.33810770046064165
val-delta: 0.3118922995393584
LR(solver=sag, fit_intercept=False, max_iter=1000, penalty=none, C=1e10) discrimination test: 0.04018912529550833
test-delta: 0.6498108747044916
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.6513636363636363
test: LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10): 0.69
LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10) discrimination val: 0.3376692200221612
val-delta: 0.31369441634147505
LR(solver=sag, fit_intercept=False, max_iter=3000, penalty=none, C=1e10) discrimination test: 0.04018912529550833
test-delta: 0.6498108747044916
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10): 0.6525
test: LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10): 0.69
LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10) discrimination val: 0.3373038196567609
val-delta: 0.3151961803432391
LR(solver=sag, fit_intercept=False, max_iter=5000, penalty=none, C=1e10) discrimination test: 0.04018912529550833
test-delta: 0.6498108747044916
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.651923076923077
test: LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10): 0.6866666666666666
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination val: 0.3362056800970828
val-delta: 0.31571739682599426
LR(solver=saga, fit_intercept=False, max_iter=700, penalty=none, C=1e10) discrimination test: 0.03664302600472813
test-delta: 0.6500236406619385
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.6521428571428572
test: LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10): 0.69
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination val: 0.3352644176173588
val-delta: 0.31687843952549843
LR(solver=saga, fit_intercept=False, max_iter=1400, penalty=none, C=1e10) discrimination test: 0.04018912529550833
test-delta: 0.6498108747044916
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.653
test: LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10): 0.69
LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10) discrimination val: 0.3351324174853587
val-delta: 0.3178675825146413
LR(solver=saga, fit_intercept=False, max_iter=2100, penalty=none, C=1e10) discrimination test: 0.04018912529550833
test-delta: 0.6498108747044916
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: Custom Logistic Regression(lr=0.1, max_itr=100): 0.6546875
test: Custom Logistic Regression(lr=0.1, max_itr=100): 0.6966666666666667
Custom Logistic Regression(lr=0.1, max_itr=100) discrimination val: 0.33736700501406386
val-delta: 0.3173204949859361

# The most similar model

In [103]:
models = []
names = ['CLR(solver=liblinear, max_iter=500, C=1e40, penalty=l1)'
]
# sys.float_info.max
models.append(LogisticRegression(solver='liblinear', max_iter=500, C=1e40, penalty='l1'))

In [104]:
runAdultWithSplitting(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 3, 17) #index 3 is Sex attribute

----------------------------------------
val: CLR(0.01, 100): 0.8471829851584929
test: CLR(0.01, 100): 0.8474103585657371
CLR(0.01, 100) discrimination val: 0.17561248648945355
val-delta: 0.6715704986690394
CLR(0.01, 100) discrimination test: 0.1937732711594276
test-delta: 0.6536370874063095
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


In [73]:
# minizing discrimination and maximizing delta
for seed in range(41):
    runAdultWithSplitting(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 3, seed) #index 3 is Sex attribute

----------------------------------------
val: CLR(0.01, 100): 0.828270566367026
test: CLR(0.01, 100): 0.8322709163346613
CLR(0.01, 100) discrimination val: 0.20268083284904606
val-delta: 0.62558973351798
CLR(0.01, 100) discrimination test: 0.21313887161393902
test-delta: 0.6191320447207223
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.8296634789723083
test: CLR(0.01, 100): 0.8316733067729084
CLR(0.01, 100) discrimination val: 0.1887886982376536
val-delta: 0.6408747807346546
CLR(0.01, 100) discrimination test: 0.21795304124023707
test-delta: 0.6137202655326713
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
------------

----------------------------------------
val: CLR(0.01, 100): 0.8335676568724036
test: CLR(0.01, 100): 0.8331341301460823
CLR(0.01, 100) discrimination val: 0.18390783770751695
val-delta: 0.6496598191648867
CLR(0.01, 100) discrimination test: 0.1959567851464
test-delta: 0.6371773449996823
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.8261818932856485
test: CLR(0.01, 100): 0.8316069057104913
CLR(0.01, 100) discrimination val: 0.19135922809739675
val-delta: 0.6348226651882518
CLR(0.01, 100) discrimination test: 0.19662481970960125
test-delta: 0.6349820860008901
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
------------

----------------------------------------
val: CLR(0.01, 100): 0.8319967867143351
test: CLR(0.01, 100): 0.8319389110225763
CLR(0.01, 100) discrimination val: 0.1827637965016449
val-delta: 0.6492329902126902
CLR(0.01, 100) discrimination test: 0.20780869678979735
test-delta: 0.6241302142327789
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.8304451150368124
test: CLR(0.01, 100): 0.8320053120849934
CLR(0.01, 100) discrimination val: 0.20095915036729473
val-delta: 0.6294859646695177
CLR(0.01, 100) discrimination test: 0.21389956004157967
test-delta: 0.6181057520434137
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
---------

In [74]:
# minizing discrimination and maximizing delta
for seed in range(41):
    runGermanWithSplitting(X_german_all_expand, y_german, 12, seed) #index 12 is Age attribute

----------------------------------------
val: CLR(0.01, 100): 0.625
test: CLR(0.01, 100): 0.69
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.12907919351486175
test-delta: 0.5609208064851382
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.7449999999999999
test: CLR(0.01, 100): 0.5866666666666667
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.3017543859649123
test-delta: 0.2849122807017544
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.655
test: CLR(0.01, 100): 0.653333

  
  
  
  
  


----------------------------------------
val: CLR(0.01, 100): 0.655
test: CLR(0.01, 100): 0.6866666666666666
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.09140859140859137
test-delta: 0.5952580752580753
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.64
test: CLR(0.01, 100): 0.6633333333333333
CLR(0.01, 100) discrimination val: 0.6358974358974357
val-delta: 0.004102564102564266
CLR(0.01, 100) discrimination test: 0.04528985507246375
test-delta: 0.6180434782608696
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.6

  
  


----------------------------------------
val: CLR(0.01, 100): 0.6649999999999999
test: CLR(0.01, 100): 0.6633333333333333
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.18221830985915488
test-delta: 0.48111502347417845
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.67
test: CLR(0.01, 100): 0.6133333333333333
CLR(0.01, 100) discrimination val: 0.25499582289055966
val-delta: 0.4150041771094404
CLR(0.01, 100) discrimination test: 0.4513172966781215
test-delta: 0.1620160366552118
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.0

  


----------------------------------------
val: CLR(0.01, 100): 0.675
test: CLR(0.01, 100): 0.6266666666666667
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.3838028169014085
test-delta: 0.2428638497652582
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: CLR(0.01, 100): 0.6250000000000001
test: CLR(0.01, 100): 0.71
CLR(0.01, 100) discrimination val: 0.44666666666666666
val-delta: 0.17833333333333345
CLR(0.01, 100) discrimination test: 0.19791666666666663
test-delta: 0.5120833333333333
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


  
  
  


----------------------------------------
val: CLR(0.01, 100): 0.735
test: CLR(0.01, 100): 0.5866666666666667
CLR(0.01, 100) discrimination val: 0.44328336433599597
val-delta: 0.291716635664004
CLR(0.01, 100) discrimination test: 0.3007234979553318
test-delta: 0.28594316871133485
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.635
test: CLR(0.01, 100): 0.64
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.007922535211267623
test-delta: 0.6320774647887324
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.619999999999999



----------------------------------------
val: CLR(0.01, 100): 0.6649999999999999
test: CLR(0.01, 100): 0.6
CLR(0.01, 100) discrimination val: 0.37709085603822445
val-delta: 0.28790914396177547
CLR(0.01, 100) discrimination test: 0.1473684210526316
test-delta: 0.4526315789473684
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.635
test: CLR(0.01, 100): 0.6133333333333333
CLR(0.01, 100) discrimination val: 0.4422402159244264
val-delta: 0.1927597840755736
CLR(0.01, 100) discrimination test: 0.2390779951755561
test-delta: 0.3742553381577772
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: CLR(0.01, 100): 0.635
test: CLR(0.01, 100): 0.6566666666666666
CLR(0.01, 100) discrimination val: 0.5327935222672064
val-delta: 0.10220647773279357
CLR(0.01, 100) discrimination test: 0.5559093463195355
test-delta: 0.10075732034713114
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.6299999999999999
test: CLR(0.01, 100): 0.6666666666666666
CLR(0.01, 100) discrimination val: 0.4601524601524602
val-delta: 0.1698475398475397
CLR(0.01, 100) discrimination test: 0.24675324675324684
test-delta: 0.4199134199134198
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


  


----------------------------------------
val: CLR(0.01, 100): 0.65
test: CLR(0.01, 100): 0.6133333333333333
CLR(0.01, 100) discrimination val: 0.4110661268556005
val-delta: 0.2389338731443995
CLR(0.01, 100) discrimination test: 0.03157894736842104
test-delta: 0.5817543859649122
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.675
test: CLR(0.01, 100): 0.6766666666666666
CLR(0.01, 100) discrimination val: 0.414327485380117
val-delta: 0.26067251461988306
CLR(0.01, 100) discrimination test: 0.08771929824561409
test-delta: 0.5889473684210526
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
-------------------------------------

  
  
  


----------------------------------------
val: CLR(0.01, 100): 0.625
test: CLR(0.01, 100): 0.64
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.25087108013937287
test-delta: 0.38912891986062714
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.6649999999999999
test: CLR(0.01, 100): 0.66
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.18496382510223341
test-delta: 0.4750361748977666
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.67
test: CLR(0.01, 100): 0.6566666666666666
CL

  


----------------------------------------
val: CLR(0.01, 100): 0.655
test: CLR(0.01, 100): 0.6066666666666667
CLR(0.01, 100) discrimination val: nan
val-delta: nan
CLR(0.01, 100) discrimination test: 0.512426422498365
test-delta: 0.09424024416830168
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.62
test: CLR(0.01, 100): 0.6166666666666667
CLR(0.01, 100) discrimination val: 0.29103536735115687
val-delta: 0.3289646326488431
CLR(0.01, 100) discrimination test: 0.2878819372271877
test-delta: 0.328784729439479
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------




----------------------------------------
val: CLR(0.01, 100): 0.655
test: CLR(0.01, 100): 0.6566666666666666
CLR(0.01, 100) discrimination val: 0.516661195608564
val-delta: 0.13833880439143598
CLR(0.01, 100) discrimination test: 0.1733266733266734
test-delta: 0.4833399933399932
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------
val: CLR(0.01, 100): 0.655
test: CLR(0.01, 100): 0.5866666666666667
CLR(0.01, 100) discrimination val: 0.4417678812415654
val-delta: 0.21323211875843462
CLR(0.01, 100) discrimination test: 0.3892857142857143
test-delta: 0.19738095238095238
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
------------------------------------

  
  
  


# Improving the results

In [140]:
models = []
names = ['LR(solver=liblinear, max_iter=500, C=1e40, penalty=l1)',
         'LR(solver=lbfgs)',
         'LR(solver=newton-cg)',
         'LR(solver=liblinear)',
         'LR(solver=sag)',
         'LR(solver=saga)',
         'Linear Discriminant Analysis',
         'Quadratic Discriminant Analysis',
         'Random Forest',
         'Neural Network',
         'Gaussian NB',
         'Bernoulli NB',
         'Complement NB',
         'Multinomial NB',
         'DecisionTreeClassifier',
         'K Neighbors Classifier',
         'SVM',
         'LinearSVC'
]

models.append(LogisticRegression(solver='liblinear', max_iter=500, C=1e40, penalty='l1'))
models.append(LogisticRegression(solver='lbfgs'))
models.append(LogisticRegression(solver='newton-cg'))
models.append(LogisticRegression(solver='liblinear'))
models.append(LogisticRegression(solver='sag'))
models.append(LogisticRegression(solver='saga'))
models.append(DA.LinearDiscriminantAnalysis())
models.append(DA.QuadraticDiscriminantAnalysis())
models.append(RandomForestClassifier(n_estimators=100))
models.append(MLPClassifier())
models.append(GaussianNB())
models.append(BernoulliNB())
models.append(ComplementNB())
models.append(MultinomialNB())
models.append(DecisionTreeClassifier())
models.append(KNeighborsClassifier(n_neighbors=50))
models.append(SVC())
models.append(LinearSVC())

In [141]:
# df_adult, pct = load_adult('datasets/adult/adult.data')
# X_adult = df_adult.iloc[:, :-1]
# X_adult['sex'] = X_adult['sex'].map({'Female': 1, 'Male': 0}).astype(int)
# y_adult = df_adult.iloc[:, -1]
# print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

# df_adult_test, pct = load_adult('datasets/adult/adult.test')
# X_adult_test = df_adult_test.iloc[:, :-1]
# X_adult_test['sex'] = X_adult_test['sex'].map({'Female': 1, 'Male': 0}).astype(int)
# y_adult_test = df_adult_test.iloc[:, -1]
# print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

# X_adult_all = X_adult.append(X_adult_test)
# X_adult_all_expand = pd.get_dummies(X_adult_all)

# # X_expand, X_expand_test = encode_adult(X_expand, X_expand_test)
# X_adult_expand = X_adult_all_expand[0:X_adult.shape[0]]
# X_adult_expand_test = X_adult_all_expand[X_adult.shape[0]:]



df_adult, pct = load_adult('datasets/adult/adult.data')
X_adult = df_adult.iloc[:, :-1]
y_adult = df_adult.iloc[:, -1]
print('percentage of corrupt rows: {0:.1f}%'.format((1-pct)*100))

df_adult_test, pct = load_adult('datasets/adult/adult.test')
X_adult_test = df_adult_test.iloc[:, :-1]
y_adult_test = df_adult_test.iloc[:, -1]
print('percentage of corrupt rows in testing: {0:.1f}%'.format((1-pct)*100))

X_adult, X_adult_test = encode_adult(X_adult, X_adult_test)
X_adult_all = X_adult.append(X_adult_test)
X_adult_all_expand = pd.get_dummies(X_adult_all)
X_adult_expand = X_adult_all_expand[0:X_adult.shape[0]]
X_adult_expand_test = X_adult_all_expand[X_adult.shape[0]:]
X_adult_expand = X_adult_expand.drop(['sex'], axis=1)
X_adult_expand_test = X_adult_expand_test.drop(['sex'], axis=1)

percentage of corrupt rows: 7.4%
percentage of corrupt rows in testing: 7.5%


In [142]:
run(X_adult_expand, y_adult, X_adult_expand_test, y_adult_test, 3)

----------------------------------------
val: LR(solver=liblinear, max_iter=500, C=1e40, penalty=l1): 0.8336985201731215
test: LR(solver=liblinear, max_iter=500, C=1e40, penalty=l1): 0.8333333333333334




----------------------------------------
val: LR(solver=lbfgs): 0.8332675571346669
test: LR(solver=lbfgs): 0.8330677290836653
----------------------------------------
val: LR(solver=newton-cg): 0.8337648606533934
test: LR(solver=newton-cg): 0.8333997343957503
----------------------------------------
val: LR(solver=liblinear): 0.8336322346550535
test: LR(solver=liblinear): 0.8334661354581673
----------------------------------------
val: LR(solver=sag): 0.8337648606533934
test: LR(solver=sag): 0.8333997343957503




----------------------------------------
val: LR(solver=saga): 0.8337648606533934
test: LR(solver=saga): 0.8333997343957503




----------------------------------------
val: Linear Discriminant Analysis: 0.8313444142084657
test: Linear Discriminant Analysis: 0.8325365205843294




----------------------------------------
val: Quadratic Discriminant Analysis: 0.4758008015147285
test: Quadratic Discriminant Analysis: 0.6585657370517928
----------------------------------------
val: Random Forest: 0.8185800579087207
test: Random Forest: 0.8183266932270916




----------------------------------------
val: Neural Network: 0.820270742590378
test: Neural Network: 0.8132802124833998
----------------------------------------
val: Gaussian NB: 0.5310659028465877
test: Gaussian NB: 0.5434926958831341
----------------------------------------
val: Bernoulli NB: 0.7895367619128643
test: Bernoulli NB: 0.7899734395750332
----------------------------------------
val: Complement NB: 0.7634771990191259
test: Complement NB: 0.7646746347941568
----------------------------------------
val: Multinomial NB: 0.80734077417791
test: Multinomial NB: 0.8090969455511288
----------------------------------------
val: DecisionTreeClassifier: 0.8055835511815708
test: DecisionTreeClassifier: 0.8061752988047809
----------------------------------------
val: K Neighbors Classifier: 0.8293551946632194
test: K Neighbors Classifier: 0.8274900398406374




----------------------------------------
val: SVM: 0.8273325932290636
test: SVM: 0.8298140770252324
----------------------------------------
val: LinearSVC: 0.8332674252104324
test: LinearSVC: 0.8342629482071713


In [143]:
# load german dataset
df_german = load_german('datasets/german/german.data')
X_german = df_german.iloc[:, :-1]
y_german = df_german.iloc[:, -1]

# One hot encoder
X_german_encoded = encode_german_all(X_german)
X_german_all_expand = pd.get_dummies(X_german_encoded)
X_german_all_expand = X_german_all_expand.drop([12], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_german_all_expand, y_german, test_size=0.33, random_state=42)

In [144]:
run(X_german_train, y_train, X_test, y_test, 12)

----------------------------------------
val: LR(solver=liblinear, max_iter=500, C=1e40, penalty=l1): 0.713381796115496
test: LR(solver=liblinear, max_iter=500, C=1e40, penalty=l1): 0.7696969696969697
----------------------------------------
val: LR(solver=lbfgs): 0.7238521677327647
test: LR(solver=lbfgs): 0.7606060606060606
----------------------------------------
val: LR(solver=newton-cg): 0.7238521677327647
test: LR(solver=newton-cg): 0.7606060606060606
----------------------------------------
val: LR(solver=liblinear): 0.7253336492142463
test: LR(solver=liblinear): 0.7606060606060606




----------------------------------------
val: LR(solver=sag): 0.7238521677327647
test: LR(solver=sag): 0.7606060606060606




----------------------------------------
val: LR(solver=saga): 0.7238521677327647
test: LR(solver=saga): 0.7606060606060606
----------------------------------------
val: Linear Discriminant Analysis: 0.7103854994035669
test: Linear Discriminant Analysis: 0.7727272727272727
----------------------------------------
val: Quadratic Discriminant Analysis: 0.6060502832537397
test: Quadratic Discriminant Analysis: 0.5212121212121212




----------------------------------------
val: Random Forest: 0.7492036891565564
test: Random Forest: 0.7363636363636363




----------------------------------------
val: Neural Network: 0.7194306662177832
test: Neural Network: 0.7515151515151515
----------------------------------------
val: Gaussian NB: 0.6358139128916819
test: Gaussian NB: 0.696969696969697
----------------------------------------
val: Bernoulli NB: 0.7148632775969774
test: Bernoulli NB: 0.7484848484848485
----------------------------------------
val: Complement NB: 0.6894785886773318
test: Complement NB: 0.703030303030303
----------------------------------------
val: Multinomial NB: 0.711855925053097
test: Multinomial NB: 0.7454545454545455
----------------------------------------
val: DecisionTreeClassifier: 0.6434091031891503
test: DecisionTreeClassifier: 0.6818181818181818
----------------------------------------
val: K Neighbors Classifier: 0.7104302214907084
test: K Neighbors Classifier: 0.706060606060606
----------------------------------------
val: SVM: 0.7104523331546113
test: SVM: 0.7090909090909091




----------------------------------------
val: LinearSVC: 0.713381796115496
test: LinearSVC: 0.7696969696969697
