# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import fairlearn.moments as moments
import fairlearn.classred as red
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define Regressor Class

# Get Files and Inputs

In [2]:
import os

# Show which files the competition dataset ships with, so the CSV names
# used by the loading cell can be checked against what is actually on disk.
input_files = os.listdir('../input/fair-classification')
print(input_files)

['test_no_income.csv', 'test_sample0.csv', 'train.csv']


In [3]:
# Labelled training rows (this frame carries the `income` target column).
train_csv = '../input/fair-classification/train.csv'
# Rows to predict for the submission; the file name says it has no income column.
submission_csv = '../input/fair-classification/test_no_income.csv'

df = pd.read_csv(train_csv)
submission_df = pd.read_csv(submission_csv)

In [4]:
# (rows, columns) of the training frame loaded from train.csv.
df.shape

(35988, 15)

In [5]:
# (rows, columns) of the submission frame loaded from test_no_income.csv.
submission_df.shape

(8997, 14)

In [6]:
# One-hot encode the string-valued columns of both frames. Each frame is
# encoded on its own, so the two results may end up with different columns.
df, submission_df = (pd.get_dummies(frame) for frame in (df, submission_df))

In [7]:
# Columns present in the encoded training frame but absent from the encoded
# submission frame (categories that never occur in the submission data, plus
# the `income` target). Kept as a variable so it can be inspected.
missing_cols = set(df.columns) - set(submission_df.columns)

# Give the submission frame exactly the training feature columns, in the
# training order, in a single step:
#   * columns it lacks are created and filled with 0,
#   * columns the training frame does not have are dropped,
#   * `income` is excluded because it is the target, not a feature.
# One reindex replaces inserting the missing columns one at a time and then
# reordering, which avoids repeated column insertion on a wide frame.
feature_columns = df.columns.drop('income')
submission_df = submission_df.reindex(columns=feature_columns, fill_value=0)

# Organize data into training and test sets: features (X), sensitive attribute (A), target (y)

In [8]:
# Fixed seed so the hold-out split is identical on every Restart & Run All;
# without it the accuracy, bias and score computed from this split would
# change from run to run.
SPLIT_SEED = 42

train_df, test_df = train_test_split(df, test_size=0.25, random_state=SPLIT_SEED)


def _split_xay(frame):
    """Split `frame` into (X, A, y).

    X is every column except `income` and `gender`, A is the `gender`
    column (the sensitive attribute), and y is the `income` column (the target).
    """
    features = frame.drop(['income', 'gender'], axis=1)
    return features, frame['gender'], frame['income']


X_train, A_train, y_train = _split_xay(train_df)
X_test, A_test, y_test = _split_xay(test_df)

In [9]:
# Column counts of the submission frame vs. the test features. At this point
# submission_df has only had `income` removed and still carries `gender`
# (it is dropped later, just before predicting), whereas X_test has had both
# removed -- so submission_df is expected to be exactly one column wider.
submission_df.shape[1], X_test.shape[1]

(102, 101)

# Using fairlearn to find best model

In [16]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

class LogisticRegressionLearner():
    """Sample-weighted logistic-regression learner with a `fit(X, Y, W)` / `predict(X)` interface.

    An instance of this class is what gets passed as the `learner` argument
    of `red.expgrad`. Every call to `fit` builds a brand-new estimator, so
    nothing is carried over from a previous fit.
    """

    def __init__(self):
        # No estimator exists until `fit` has been called.
        self.model = None

    def fit(self, X, Y, W):
        """Train a fresh L1-penalised liblinear model on (X, Y), weighting each sample by W."""
        hyperparams = dict(
            C=0.3,
            solver='liblinear',
            tol=1e-3,
            penalty='l1',
            verbose=1,  # makes every fit print a "[LibLinear]" tag to the cell output
            max_iter=100000,
        )
        self.model = LogisticRegression(**hyperparams)
        self.model.fit(X, Y, sample_weight=W)

    def predict(self, X):
        """Return the current model's predictions for X; `fit` must have been called first."""
        fitted = self.model
        return fitted.predict(X)


# Run fairlearn's exponentiated-gradient reduction on the training split.
# The learner is re-fitted by expgrad with different sample weights; the
# constraint object is moments.DP() (presumably demographic parity -- confirm
# against the fairlearn docs) and eps is passed as 0.1.
learner = LogisticRegressionLearner()
res_tuple = red.expgrad(
    X_train,
    A_train,
    y_train,
    learner,
    cons=moments.DP(),
    eps=0.1,
)

# expgrad returns a namedtuple; view it as a dict and pull out the callable
# that produces predictions for new feature rows.
res = res_tuple._asdict()
best_classifier = res['best_classifier']

# Raw (not yet thresholded) predictions for the hold-out features.
predictions = best_classifier(X_test)

def indicator(predictions, threshold=0.5):
    """Turn scores into hard 0/1 labels without modifying the input.

    Parameters
    ----------
    predictions : numpy.ndarray or pandas.Series
        Numeric scores to threshold.
    threshold : float, default 0.5
        Scores strictly below this become 0; scores at or above it become 1.

    Returns
    -------
    Same type as `predictions`
        A new object (a Series keeps its index) holding 0/1 values. Entries
        that compare false against the threshold both ways (e.g. NaN) are
        left unchanged.
    """
    # Work on a copy so the caller's array/Series is never overwritten.
    labels = predictions.copy()
    # Compute both masks before writing anything, so the 0s written for the
    # "below" entries can never be re-classified when threshold <= 0.
    below = labels < threshold
    at_or_above = labels >= threshold
    labels[below] = 0
    labels[at_or_above] = 1
    return labels

# Threshold the raw predictions into hard 0/1 labels and report accuracy
# on the hold-out split as a percentage.
predictions = indicator(predictions)
accuracy = accuracy_score(y_test, predictions) * 100
print('Accuracy: ', round(accuracy, 2), '%.')

# X_pred is the hold-out frame with the true income replaced by the income
# predicted by the model.
X_pred = test_df.drop('income', axis=1)
# Assign by POSITION, not by label. `predictions` follows X_test's row order,
# but test_df keeps the shuffled row labels of `df`. If `predictions` is a
# pandas Series carrying its own index, a plain column assignment would align
# on labels and pair rows with the wrong prediction (or NaN), corrupting the
# per-group counts below. For a NumPy array np.asarray is a no-op.
X_pred['income'] = np.asarray(predictions)

in_group0 = X_pred['gender'] == 0
in_group1 = X_pred['gender'] == 1
predicted_positive = X_pred['income'] == 1

# Group sizes, and the number of positive predictions inside each group.
A0 = int(in_group0.sum())
A1 = int(in_group1.sum())
Y1A0 = int((predicted_positive & in_group0).sum())
Y1A1 = int((predicted_positive & in_group1).sum())

# DDP is the absolute gap between the two groups' positive-prediction rates.
# Only the part of the gap above the threshold Th is penalised; at or below
# Th the bias term is 0.
Th = 0.1
DDP = np.abs((Y1A0 / A0) - (Y1A1 / A1))
bias = (7**(DDP - Th) - 1) if DDP > Th else 0
score = accuracy - bias
print(f'Accuracy: {accuracy}\nBias: {bias}\nScore: {score}')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Accuracy:  84.94 %.
Accuracy: 84.93942425252861
Bias: 0
Score: 84.93942425252861


# Submission

In [None]:
# The classifier was trained on features without `gender`, so remove that
# column from the submission features before predicting.
submission_df_ = submission_df.drop(columns=['gender'])
# Raw classifier output for the submission rows, then thresholded to 0/1.
submission_scores = best_classifier(submission_df_)
submission_predictions = indicator(submission_scores)

In [None]:
import datetime

# Build a unique output path from the current timestamp. Spaces and colons
# are stripped from the timestamp text before it is embedded in the name.
timestamp = str(datetime.datetime.now())
for unwanted in (' ', ':'):
    timestamp = timestamp.replace(unwanted, '')

file_name = '../output/test_sample_' + timestamp + '.csv'
file_name

In [None]:
# Wrap the thresholded submission predictions in an int64 Series so the
# CSV holds integer labels rather than floats.
submission = pd.Series(submission_predictions, dtype=np.int64)

In [None]:
# Write the predictions to `file_name`: the Series index is written under
# the header `Id` and the values under the header `income`.
submission.to_csv(file_name, index_label='Id', header=['income'], index=True)