In [244]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the entries of that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['fair-classification']


In [245]:
# Print the entries of the fair-classification input directory.
print(os.listdir('../input/fair-classification'))

['test_no_income.csv', 'test_sample0.csv', 'train.csv']


In [246]:
# Paths of the two CSV files read by this notebook.
train_csv_path = '../input/fair-classification/train.csv'
submission_csv_path = '../input/fair-classification/test_no_income.csv'

# df: training data; submission_df: rows to predict for the submission.
df = pd.read_csv(train_csv_path)
submission_df = pd.read_csv(submission_csv_path)

In [247]:
# (rows, columns) of the training frame as loaded from train.csv.
df.shape

(35988, 15)

In [248]:
# Column labels of the submission frame as loaded from test_no_income.csv.
submission_df.columns

Index(['Age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital gain', 'capital loss', 'hours per week', 'native-country'],
      dtype='object')

In [249]:
# One-hot encode both frames with pd.get_dummies.
# Each frame is encoded independently, so their dummy columns can differ.
df, submission_df = (pd.get_dummies(frame) for frame in (df, submission_df))

In [250]:
# Column labels of the training frame after pd.get_dummies.
df.columns

Index(['Age', 'fnlwgt', 'education-num', 'race', 'gender', 'capital gain',
       'capital loss', 'hours per week', 'income', 'workclass_?',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=103)

In [251]:
# Columns present in the encoded training frame but absent from the encoded
# submission frame. Kept for inspection; note that it also contains 'income',
# which only exists in the training data.
missing_cols = set(df.columns) - set(submission_df.columns)

# Feature columns the model is trained on: every training column except the target.
feature_columns = df.columns.drop('income')

# Align the submission frame with the training features in a single step:
#   * columns missing from the submission are created and filled with 0,
#   * columns that only exist in the submission are discarded,
#   * the column order matches the training frame.
# This avoids inserting columns one by one (in arbitrary set order) and avoids
# temporarily adding a dummy 'income' column to the submission data.
submission_df = submission_df.reindex(columns=feature_columns, fill_value=0)

In [252]:
# Column labels of the submission frame after alignment with the training columns.
submission_df.columns

Index(['Age', 'fnlwgt', 'education-num', 'race', 'gender', 'capital gain',
       'capital loss', 'hours per week', 'workclass_?',
       'workclass_Federal-gov',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=102)

In [253]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# A fixed random_state makes the split, and therefore every metric computed
# from it, reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

In [254]:
# Separate the 'income' target from the feature columns in each split.
y_train = train_df['income']
X_train = train_df.drop(columns='income')

y_test = test_df['income']
X_test = test_df.drop(columns='income')

In [255]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hyper-parameters of the L1-penalised logistic regression (liblinear solver).
logreg_params = dict(
    penalty='l1',
    C=0.3,
    solver='liblinear',
    tol=1e-3,
    max_iter=100000,
    verbose=1,
)

model = LogisticRegression(**logreg_params)
# Left as the last expression so the cell displays the fitted estimator.
model.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.001, verbose=1, warm_start=False)

In [256]:
# Make predictions for the held-out test features with the fitted model.
predictions = model.predict(X_test)

In [257]:
# Coefficients of the fitted model.
parameters = model.coef_

# Accuracy on the test split, expressed as a percentage (0-100).
accuracy = 100 * accuracy_score(y_test, predictions)
print('Accuracy: ', round(accuracy, 2), '%.')

Accuracy:  84.88 %.


In [258]:
# X_pred holds the test features plus an 'income' column containing the
# model's predictions (not the true labels).
# assign() returns a new DataFrame, so X_test itself never gains the extra
# column and no column is set on a slice of another frame.
X_pred = X_test.assign(income=predictions)

In [259]:
# Boolean row masks for the two gender groups and for a predicted income of 1.
is_gender0 = X_pred['gender'] == 0
is_gender1 = X_pred['gender'] == 1
predicted_income1 = X_pred['income'] == 1

# A0 / A1: number of rows in each gender group.
A0 = int(is_gender0.sum())
A1 = int(is_gender1.sum())

# Y1A0 / Y1A1: rows in each gender group whose predicted income is 1.
Y1A0 = int((predicted_income1 & is_gender0).sum())
Y1A1 = int((predicted_income1 & is_gender1).sum())

In [260]:
# Th: gap in positive-prediction rate that is tolerated without penalty.
Th = 0.1

# Share of rows predicted income == 1 within each gender group.
positive_rate_a0 = Y1A0 / A0
positive_rate_a1 = Y1A1 / A1

# DDP: absolute difference between the two groups' positive-prediction rates.
DDP = np.abs(positive_rate_a0 - positive_rate_a1)

# The penalty grows exponentially (base 7) with the excess over Th and is
# zero when the gap stays within the threshold.
if DDP > Th:
    bias = 7**(DDP - Th) - 1
else:
    bias = 0

# NOTE: accuracy is on a 0-100 scale while bias is derived from rates in [0, 1].
score = accuracy - bias
print(f'Accuracy: {accuracy}\nBias: {bias}\nScore: {score}')

Accuracy: 84.88385017227965
Bias: 0.14955211043346384
Score: 84.73429806184619


In [261]:
# Predict with the fitted model on the column-aligned submission features.
submission_predictions = model.predict(submission_df)

In [262]:
import datetime

# Current timestamp as text, with the spaces and colons stripped out.
timestamp = str(datetime.datetime.now()).translate(str.maketrans('', '', ' :'))

# Output path: fixed prefix + timestamp + extension.
file_name = ''.join(['../output/test_sample_', timestamp, '.csv'])
file_name

'../output/test_sample_2019-04-14011152.627752.csv'

In [263]:
# Wrap the predictions in a Series; its index is written out as the 'Id'
# column when the Series is exported with to_csv.
submission = pd.Series(submission_predictions)
submission

0       0
1       0
2       1
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      1
16      0
17      1
18      0
19      0
20      0
21      1
22      0
23      0
24      0
25      1
26      0
27      0
28      0
29      1
       ..
8967    0
8968    0
8969    0
8970    0
8971    1
8972    0
8973    0
8974    0
8975    0
8976    0
8977    1
8978    1
8979    0
8980    0
8981    0
8982    0
8983    0
8984    0
8985    1
8986    0
8987    0
8988    0
8989    0
8990    0
8991    0
8992    0
8993    0
8994    0
8995    0
8996    0
Length: 8997, dtype: int64

In [264]:
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
output_dir = os.path.dirname(file_name)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the predictions with the index as the 'Id' column and the values
# under the 'income' header.
submission.to_csv(file_name, index_label='Id', header=['income'], index=True)