# Import Libraries

In [68]:
import os

import numpy as np
import pandas as pd
import fairlearn.moments as moments
import fairlearn.classred as red
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define Classifier (Learner) Class

In [69]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

class LogisticRegressionLearner():
    """Sample-weighted logistic regression exposing ``fit(X, Y, W)`` / ``predict(X)``.

    An instance is handed to ``red.expgrad`` as the base learner in this
    notebook. A fresh ``LogisticRegression`` is built on every ``fit`` call,
    so refitting never reuses state from a previous fit.

    Parameters
    ----------
    C, solver, tol, penalty, verbose, max_iter
        Forwarded unchanged to ``sklearn.linear_model.LogisticRegression``.
        The defaults are the values this notebook previously hard-coded.

    Attributes
    ----------
    model : LogisticRegression or None
        The fitted estimator; ``None`` until ``fit`` has been called.
    """

    def __init__(self, C=0.3, solver='liblinear', tol=1e-3, penalty='l1',
                 verbose=1, max_iter=100000):
        self.C = C
        self.solver = solver
        self.tol = tol
        self.penalty = penalty
        self.verbose = verbose
        self.max_iter = max_iter
        self.model = None

    def fit(self, X, Y, W=None):
        """Fit a new logistic regression on ``X`` / ``Y``.

        ``W`` is passed through as ``sample_weight``; ``None`` (the default)
        means no per-sample weights are supplied.
        """
        self.model = LogisticRegression(
            C=self.C,
            solver=self.solver,
            tol=self.tol,
            penalty=self.penalty,
            verbose=self.verbose,
            max_iter=self.max_iter,
        )
        self.model.fit(X, Y, sample_weight=W)

    def predict(self, X):
        """Return the fitted model's predictions for ``X`` (requires a prior ``fit``)."""
        return self.model.predict(X)

# Get Files and Inputs

In [70]:
import os

# List the files available in the input data directory.
input_dir = '../input/fair-classification'
print(os.listdir(input_dir))

['test_no_income.csv', 'test_sample0.csv', 'train.csv']


In [71]:
# df: labelled training data (includes the 'income' target).
# submission_df: the test set without 'income'; predictions for it are written out at the end.
df = pd.read_csv('../input/fair-classification/train.csv')
submission_df = pd.read_csv('../input/fair-classification/test_no_income.csv')

In [72]:
# Training frame dimensions as (rows, columns).
df.shape

(35988, 15)

In [73]:
# Preview the first training rows to see which columns hold strings vs. numbers.
df.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital gain,capital loss,hours per week,native-country,income
0,44,Local-gov,100479,Assoc-acdm,12,Married-civ-spouse,Prof-specialty,Husband,0,0,0,0,48,United-States,0
1,57,Private,477867,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,0,0,0,0,40,United-States,0
2,21,Private,410439,Some-college,10,Never-married,Sales,Own-child,0,0,0,0,24,United-States,0
3,21,Private,195532,HS-grad,9,Never-married,Adm-clerical,Unmarried,0,1,0,0,40,United-States,0
4,57,Self-emp-inc,125000,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,0,0,0,0,35,United-States,1


In [74]:
# Test frame dimensions as (rows, columns).
submission_df.shape

(8997, 14)

In [75]:
# Preview the first test rows.
submission_df.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital gain,capital loss,hours per week,native-country
0,56,Local-gov,264436,Some-college,10,Divorced,Exec-managerial,Not-in-family,0,1,0,0,40,United-States
1,37,Private,184556,Some-college,10,Divorced,Tech-support,Unmarried,0,1,0,0,40,United-States
2,64,Local-gov,96076,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,0,0,0,0,40,United-States
3,36,Federal-gov,192150,HS-grad,9,Divorced,Adm-clerical,Unmarried,0,1,0,0,40,United-States
4,30,?,147215,Some-college,10,Never-married,?,Own-child,0,1,0,0,30,United-States


In [76]:
# encode string data
# get_dummies is applied to each frame separately, so the two results can end up
# with different dummy columns; the next cell reconciles them.
df = pd.get_dummies(df)
submission_df = pd.get_dummies(submission_df)

In [77]:
# Columns the training frame has but the test frame lacks (this includes 'income').
missing_cols = set(df.columns).difference(submission_df.columns)

# Add every missing column as a constant 0.
submission_df = submission_df.assign(**{col: 0 for col in missing_cols})

# Keep the training column order, minus the target; test-only columns are discarded.
feature_order = [col for col in df.columns if col != 'income']
submission_df = submission_df[feature_order]

# Organize data into training and test sets: features (X), sensitive attribute (A), targets (y)

In [78]:
# Fixed seed so the split, and every metric computed from it below, is
# reproducible after Restart Kernel -> Run All.
SPLIT_SEED = 42
train_df, test_df = train_test_split(df, test_size=0.25, random_state=SPLIT_SEED)

# X: features without the target ('income') and the sensitive attribute ('gender').
# A: the sensitive attribute on its own.  y: the target.
X_train, A_train, y_train = train_df.drop(['income','gender'], axis=1), train_df['gender'], train_df['income']
X_test, A_test, y_test = test_df.drop(['income','gender'], axis=1), test_df['gender'], test_df['income']

In [79]:
# Column counts of the two frames. submission_df still carries 'gender' here (it is
# only dropped right before predicting), so it has one more column than X_test.
submission_df.shape[1], X_test.shape[1]

(102, 101)

# Using fairlearn to find best model

In [80]:
# Base learner that expgrad fits repeatedly (once per "[LibLinear]" line in the output).
learner = LogisticRegressionLearner()

# NOTE(review): moments.DP() is presumably the demographic-parity constraint and eps
# the allowed violation — confirm against the installed fairlearn version.
parity_constraint = moments.DP()
res_tuple = red.expgrad(
    X_train,
    A_train,
    y_train,
    learner,
    cons=parity_constraint,
    eps=0.1,
)
res_tuple

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

ExgradResult(best_classifier=<function expgrad.<locals>.<lambda> at 0x000001D077437A60>, best_gap=0.0, classifiers=0     <__main__.LogisticRegressionLearner object at ...
1     <__main__.LogisticRegressionLearner object at ...
2     <__main__.LogisticRegressionLearner object at ...
3     <__main__.LogisticRegressionLearner object at ...
4     <__main__.LogisticRegressionLearner object at ...
5     <__main__.LogisticRegressionLearner object at ...
6     <__main__.LogisticRegressionLearner object at ...
7     <__main__.LogisticRegressionLearner object at ...
8     <__main__.LogisticRegressionLearner object at ...
9     <__main__.LogisticRegressionLearner object at ...
10    <__main__.LogisticRegressionLearner object at ...
11    <__main__.LogisticRegressionLearner object at ...
12    <__main__.LogisticRegressionLearner object at ...
13    <__main__.LogisticRegressionLearner object at ...
14    <__main__.LogisticRegressionLearner object at ...
dtype: object, weights=0     0.000000
1     0

In [81]:
# Turn the result tuple into a dict and pull out the callable that the cells
# below use for prediction.
res = res_tuple._asdict()
best_classifier = res['best_classifier']
best_classifier

<function fairlearn.classred.expgrad.<locals>.<lambda>(X)>

# Evaluate Results

In [82]:
# Side-by-side column listing; the only expected difference is 'gender', which
# submission_df keeps until it is dropped just before predicting.
X_test.columns, submission_df.columns

(Index(['Age', 'fnlwgt', 'education-num', 'race', 'capital gain',
        'capital loss', 'hours per week', 'workclass_?',
        'workclass_Federal-gov', 'workclass_Local-gov',
        ...
        'native-country_Portugal', 'native-country_Puerto-Rico',
        'native-country_Scotland', 'native-country_South',
        'native-country_Taiwan', 'native-country_Thailand',
        'native-country_Trinadad&Tobago', 'native-country_United-States',
        'native-country_Vietnam', 'native-country_Yugoslavia'],
       dtype='object', length=101),
 Index(['Age', 'fnlwgt', 'education-num', 'race', 'gender', 'capital gain',
        'capital loss', 'hours per week', 'workclass_?',
        'workclass_Federal-gov',
        ...
        'native-country_Portugal', 'native-country_Puerto-Rico',
        'native-country_Scotland', 'native-country_South',
        'native-country_Taiwan', 'native-country_Thailand',
        'native-country_Trinadad&Tobago', 'native-country_United-States',
        'native

In [83]:
# Make Predictions
# NOTE(review): the next cell cuts these values at 0.5, so best_classifier presumably
# returns scores in [0, 1] rather than hard labels — confirm.
predictions = best_classifier(X_test)

In [84]:
def indicator(predictions, threshold=0.5):
    """Binarise scores: values below ``threshold`` become 0, all others become 1.

    Works on anything that supports ``.copy()`` and boolean-mask assignment
    (NumPy arrays, pandas Series). The input is left untouched; a new object
    of the same type, dtype and (for a Series) index is returned. NaN entries
    match neither comparison and are therefore passed through unchanged.
    """
    # Work on a copy so the caller's raw scores are not overwritten.
    labels = predictions.copy()
    # Both masks are taken from the untouched input, so the result does not
    # depend on the order of the two assignments.
    labels[predictions < threshold] = 0
    labels[predictions >= threshold] = 1
    return labels
# Replace the raw outputs with hard 0/1 labels.
predictions = indicator(predictions)

In [85]:
# Accuracy of the thresholded predictions on the held-out split, in percent.
accuracy = 100 * accuracy_score(y_test, predictions)
print('Accuracy: ', round(accuracy, 2), '%.')

Accuracy:  85.77 %.


In [86]:
# X_pred: the held-out rows with the true 'income' column replaced by the
# model's predicted label.
X_pred = test_df.drop(columns='income').assign(income=predictions)

In [87]:
# Row masks for each value of the sensitive attribute and for a predicted positive.
gender_is_0 = X_pred['gender'] == 0
gender_is_1 = X_pred['gender'] == 1
predicted_positive = X_pred['income'] == 1

# Group sizes.
A0 = int(gender_is_0.sum())
A1 = int(gender_is_1.sum())

# Predicted positives within each group.
Y1A0 = int((predicted_positive & gender_is_0).sum())
Y1A1 = int((predicted_positive & gender_is_1).sum())

In [88]:
# Th: largest gap in predicted-positive rate between the two groups that goes unpenalised.
Th = 0.1

# DDP: absolute difference between the groups' predicted-positive rates.
DDP = np.abs((Y1A0 / A0) - (Y1A1 / A1))

# The penalty only applies once the gap exceeds the threshold.
if DDP > Th:
    bias = 7**(DDP - Th) - 1
else:
    bias = 0

score = accuracy - bias
print(f'Accuracy: {accuracy}\nBias: {bias}\nScore: {score}')

Accuracy: 85.7730354562632
Bias: 0
Score: 85.7730354562632


In [94]:
# 'gender' was not among the training features, so remove it before predicting.
submission_df_ = submission_df.drop(columns=['gender'])

# Score the unlabelled test rows and threshold them to hard 0/1 labels.
submission_predictions = indicator(best_classifier(submission_df_))

In [95]:
import datetime

# Current local time with the space and colons removed, so each run gets its own file.
timestamp = str(datetime.datetime.now()).replace(' ', '').replace(':', '')
file_name = '../output/test_sample_' + timestamp + '.csv'
file_name

'../output/test_sample_2019-04-14205832.655111.csv'

In [101]:
# Cast the thresholded predictions to int64 for the submission file.
submission = pd.Series(submission_predictions, dtype=np.int64)
submission

0       0
1       0
2       1
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      1
16      0
17      1
18      0
19      0
20      0
21      1
22      0
23      0
24      0
25      1
26      0
27      0
28      0
29      1
       ..
8967    0
8968    0
8969    0
8970    0
8971    1
8972    0
8973    0
8974    0
8975    0
8976    0
8977    1
8978    1
8979    0
8980    0
8981    0
8982    0
8983    0
8984    0
8985    1
8986    0
8987    0
8988    1
8989    0
8990    0
8991    1
8992    0
8993    0
8994    0
8995    0
8996    0
Length: 8997, dtype: int64

In [102]:
# Create the target directory first so the save does not fail with
# FileNotFoundError when the output folder does not exist yet.
output_dir = os.path.dirname(file_name)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# The Series index is written under the 'Id' header, the predictions under 'income'.
submission.to_csv(file_name, index_label='Id', header=['income'], index=True)