In [1]:
##Importing libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

##Algorithms used for prediction
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

##sklearn tools used for standardizing, normalising, predicting and reporting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [3]:
##Reading dataset (you can find it easily at my github project folder)

dataset = pd.read_csv("../input/adult.csv")

In [4]:
##Peeking at data

dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
##Mapping income as a boolean

dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})

In [6]:
##Cleaning bad entries

dataset.replace('?',np.nan )

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,1
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,0
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,,1


In [7]:
##Since 'fnlwht' has no correlation in this test it will be dropped

dataset.drop(['fnlwgt'], axis = 1, inplace = True)

In [8]:
##Splitting the dataset

features = dataset['income']
outcome = dataset.drop(['income'], axis=1)

In [9]:
##Splitting the data into test and training

outcome_train, outcome_test, features_train, features_test = train_test_split(outcome, features, test_size = 0.3, random_state = 0)

In [10]:
##Standardizing the data set

cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

for feature in cat_features:
        label_encoder = preprocessing.LabelEncoder()
        outcome_train[feature] = label_encoder.fit_transform(outcome_train[feature])
        outcome_test[feature] = label_encoder.transform(outcome_test[feature])

In [11]:
##Using StandardScalar to normalise the dataset

standard_scalar = StandardScaler()
outcome_train = pd.DataFrame(standard_scalar.fit_transform(outcome_train), columns = outcome.columns)
outcome_test = pd.DataFrame(standard_scalar.transform(outcome_test), columns = outcome.columns)

In [12]:
outcome_train.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,0.101484,2.134215,-0.332263,1.133894,-0.402341,-0.60027,2.214196,0.39298,-1.43047,-0.145189,-0.217407,-1.662414,0.292864
1,0.028248,-1.279379,0.184396,-0.423425,-0.402341,0.109933,-0.89941,0.39298,0.699071,-0.145189,-0.217407,-0.200753,0.292864
2,0.247956,0.086059,1.217715,-0.034095,0.926666,-0.60027,-0.276689,0.39298,-1.43047,-0.145189,-0.217407,-0.038346,0.292864
3,-0.850587,-1.279379,0.184396,-0.423425,0.926666,-0.363535,0.968753,0.39298,0.699071,-0.145189,-0.217407,-0.038346,0.292864
4,-0.044989,-1.962098,0.442726,1.523223,-0.402341,-0.60027,-0.89941,0.39298,0.699071,-0.145189,-0.217407,-0.038346,0.292864


In [13]:
##Random forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(outcome_train, features_train)

accuracy_random_forest = round(random_forest.score(outcome_train, features_train) * 100, 2)

In [14]:
##Logistic Regression

logreg = LogisticRegression()
logreg.fit(outcome_train, features_train)

accuracy_log = round(logreg.score(outcome_train, features_train) * 100, 2)

In [15]:
##KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(outcome_train, features_train)

accuracy_knn = round(knn.score(outcome_train, features_train) * 100, 2)

In [16]:
##Decision Tree 

decision_tree = DecisionTreeClassifier()
decision_tree.fit(outcome_train, features_train)

accuracy_decision_tree = round(decision_tree.score(outcome_train, features_train) * 100, 2)

In [17]:
##Plotting the accuracy of the algorithms

results = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Random Forest', 'Decision Tree'],
    'Accuracy': [accuracy_knn, accuracy_log, accuracy_random_forest, accuracy_decision_tree]})

results.head()

Unnamed: 0,Model,Accuracy
0,KNN,89.83
1,Logistic Regression,82.63
2,Random Forest,98.12
3,Decision Tree,98.14


In [18]:
##Finding importance of each feature

importances = pd.DataFrame({'feature':outcome_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
age,0.213
capital.gain,0.124
hours.per.week,0.111
education.num,0.107
relationship,0.106
occupation,0.086
marital.status,0.075
workclass,0.051
capital.loss,0.04
education,0.033


## From here on out we will be using the chosen algo

In [36]:
##Random forest with hyperparameters

random_forest = RandomForestClassifier(n_estimators=100)

n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(random_forest, hyperF, cv = 3, verbose = 1, n_jobs = -1)
bestF = gridF.fit(outcome_train, features_train)

bestF.best_estimator_.get_params()

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 16.1min finished


{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 15,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [37]:
classes = bestF.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
balanced_accuracy = metrics.balanced_accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train)
average_precision = metrics.average_precision_score(classes, features_train)
f1_score = metrics.f1_score(classes, features_train)
recall = metrics.recall_score(classes, features_train)

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94     18302
           1       0.72      0.88      0.79      4490

   micro avg       0.91      0.91      0.91     22792
   macro avg       0.84      0.90      0.87     22792
weighted avg       0.92      0.91      0.91     22792



In [45]:
print('Model accuracy is',bestF.score(outcome_train, features_train))

Model accuracy is 0.9085644085644086


In [38]:
##Logistic Regression with hyperparameters

logreg = LogisticRegression()

solvers = ['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

hyperR = dict(solver = solvers, penalty = penalty, C = c_values)

gridR = GridSearchCV(logreg, hyperR, cv = 3, verbose = 1, n_jobs = -1)
bestR = gridR.fit(outcome_train, features_train)

bestR.best_estimator_.get_params()

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    1.0s finished


{'C': 0.1,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [39]:
classes = bestR.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
balanced_accuracy = metrics.balanced_accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train)
average_precision = metrics.average_precision_score(classes, features_train)
f1_score = metrics.f1_score(classes, features_train)
recall = metrics.recall_score(classes, features_train)

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89     19296
           1       0.46      0.72      0.56      3496

   micro avg       0.83      0.83      0.83     22792
   macro avg       0.70      0.78      0.73     22792
weighted avg       0.87      0.83      0.84     22792



In [44]:
print('Model accuracy is',bestR.score(outcome_train, features_train))

Model accuracy is 0.8263425763425764


In [47]:
##KNN with hyperparameters

knn = KNeighborsClassifier()

k_range = range(1, 31)
leaf_range = range(1, 50)

hyperK = dict(n_neighbors = k_range, leaf_size = leaf_range)

gridK = GridSearchCV(knn, hyperK, cv = 3, verbose = 1, n_jobs = -1)
bestK = gridK.fit(outcome_train, features_train)

bestK.best_estimator_.get_params()

Fitting 3 folds for each of 1470 candidates, totalling 4410 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 44.9min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 57.9min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 71.2min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 85.9min
[Parallel(n_jobs=-1)]: Done 4410 out of 4410 | elapsed: 92.5min finished


{'algorithm': 'auto',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 27,
 'p': 2,
 'weights': 'uniform'}

In [48]:
classes = bestK.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
balanced_accuracy = metrics.balanced_accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train)
average_precision = metrics.average_precision_score(classes, features_train)
f1_score = metrics.f1_score(classes, features_train)
recall = metrics.recall_score(classes, features_train)

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90     18320
           1       0.59      0.73      0.65      4472

   micro avg       0.85      0.85      0.85     22792
   macro avg       0.76      0.80      0.78     22792
weighted avg       0.86      0.85      0.85     22792



In [49]:
print('Model accuracy is',bestK.score(outcome_train, features_train))

Model accuracy is 0.8482800982800983


In [41]:
##Decision Tree with hyperparameters

decision_tree = DecisionTreeClassifier()

criterion = ['gini', 'entropy']
max_depth = range(1, 10)
min_samples_split = range(2, 10)
min_samples_leaf = range(1, 5)
max_leaf_nodes = range(2, 100)

hyperDT = dict(criterion = criterion, max_depth = max_depth, min_samples_split = min_samples_split,
               min_samples_leaf = min_samples_leaf, max_leaf_nodes = max_leaf_nodes)

gridDT = GridSearchCV(decision_tree, hyperDT, cv = 3, verbose = 1, n_jobs = -1)
bestDT = gridDT.fit(outcome_train, features_train)

bestDT.best_estimator_.get_params()

Fitting 3 folds for each of 56448 candidates, totalling 169344 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 4044 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 7544 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 12044 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 17544 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 24044 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 31544 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 40044 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 49544 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 60044 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 71544 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 84044 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 97544 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1120

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 9,
 'max_features': None,
 'max_leaf_nodes': 48,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [42]:
classes = bestDT.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
balanced_accuracy = metrics.balanced_accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train)
average_precision = metrics.average_precision_score(classes, features_train)
f1_score = metrics.f1_score(classes, features_train)
recall = metrics.recall_score(classes, features_train)

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91     18558
           1       0.60      0.78      0.68      4234

   micro avg       0.86      0.86      0.86     22792
   macro avg       0.77      0.83      0.80     22792
weighted avg       0.88      0.86      0.87     22792



In [46]:
print('Model accuracy is',bestDT.score(outcome_train, features_train))

Model accuracy is 0.8632853632853633


Accuracy has unexpectedly decreased. It's possible that my hyperparameter tuning is causing overffiting (?), or perhaps the low cross validation number (3) might be adversely affecting the tests.