In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['fair-classification']


In [2]:
print(os.listdir('../input/fair-classification'))

['test_no_income.csv', 'test_sample0.csv', 'train.csv']


In [3]:
df = pd.read_csv('../input/fair-classification/train.csv')
submission_df = pd.read_csv('../input/fair-classification/test_no_income.csv')

In [4]:
df.shape

(35988, 15)

In [5]:
submission_df.columns

Index(['Age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital gain', 'capital loss', 'hours per week', 'native-country'],
      dtype='object')

In [6]:
df = pd.get_dummies(df)
submission_df = pd.get_dummies(submission_df)

In [7]:
df.columns

Index(['Age', 'fnlwgt', 'education-num', 'race', 'gender', 'capital gain',
       'capital loss', 'hours per week', 'income', 'workclass_?',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=103)

In [8]:
missing_cols = set(df.columns) - set(submission_df.columns)

for c in missing_cols:
    submission_df[c] = 0
    
# Ensure Ordering matches
submission_df = submission_df[df.columns].drop('income', axis=1)

In [9]:
submission_df.columns

Index(['Age', 'fnlwgt', 'education-num', 'race', 'gender', 'capital gain',
       'capital loss', 'hours per week', 'workclass_?',
       'workclass_Federal-gov',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=102)

In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.25)

In [11]:
X_train, y_train = train_df.drop('income', axis=1), train_df['income']
X_test, y_test = test_df.drop('income', axis=1), test_df['income']

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Training
rf = RandomForestClassifier(n_estimators=1000, random_state=42, verbose=1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  1.5min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [25]:
# Make Predictions
predictions = rf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    5.5s finished


In [30]:
# Determine absolute error
errors = abs(predictions - y_test)

# Determine number of errors
num_correct = errors.value_counts()[0]
num_incorrect = errors.value_counts()[1]


# Determine accuracy
accuracy = (1 - (num_incorrect / num_correct))

In [31]:
# X_pred includes income columns that is predicted by the model
X_pred = X_test.loc[:]
X_pred['income'] = predictions
X_pred.head()

Unnamed: 0,Age,fnlwgt,education-num,race,gender,capital gain,capital loss,hours per week,workclass_?,workclass_Federal-gov,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
8862,44,216116,9,1,1,0,0,40,0,0,...,0,0,0,0,0,0,0,0,0,0
1702,27,142621,13,0,1,4101,0,40,0,0,...,0,0,0,0,0,0,1,0,0,0
12895,30,256970,14,0,1,0,0,35,0,0,...,0,0,0,0,0,0,1,0,0,1
17580,19,104112,9,1,0,0,0,30,0,0,...,0,0,0,0,0,0,0,0,0,0
10682,48,118889,9,0,0,2885,0,15,0,0,...,0,0,0,0,0,0,1,0,0,0


In [32]:
A0 = len(X_pred.loc[X_pred['gender'] == 0])
A1 = len(X_pred.loc[X_pred['gender'] == 1])

Y1A0 = len(X_pred.loc[(X_pred['income'] == 1) & (X_pred['gender'] == 0)])
Y1A1 = len(X_pred.loc[(X_pred['income'] == 1) & (X_pred['gender'] == 1)])

A0, A1, Y1A0, Y1A1

(5975, 3022, 1636, 256)

In [33]:
Th = 0.1
DDP = np.abs((Y1A0 / A0) - (Y1A1 / A1))
bias = (7**(DDP - Th) - 1) if DDP > Th else 0
score = accuracy - bias
print(f'Accuracy: {accuracy}\nBias: {bias}\nScore: {score}')

Accuracy: 0.8315584415584416
Bias: 0.18930806781385368
Score: 0.6422503737445879


In [None]:
# Evaluate

importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(X_train.columns, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse=True)
list_of_importances = [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

In [None]:
import datetime

file_name = '../output/test_sample_' + str(datetime.datetime.now()).replace(' ', '').replace(':','') + '.csv'
file_name

In [None]:
submission = pd.Series(predictions)
submission

In [None]:
submission.to_csv(file_name, index_label='Id', header=['income'], index=True)