In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
dataset = pd.read_csv('adult.csv') #read file with data set
# Let's see first 10 lines of our table just to know whith what we will work
dataset.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [3]:
# We can see that there are 32561 instances(lines) and 15 attributes(columns) in the data set.
dataset.shape

(32561, 15)

In [4]:
# This is all columns from data set
dataset.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [5]:
# We can see that our table has missing values "?"(for example line 0, line 2)
# We have to correct it, because if not our program will think that person 0 and person 2 
# have the same workclass which call "?" and have the same occupation also "?",
# I guess it is not true so I just change all "?" to NaN value
dataset[dataset == "?"] = np.nan
dataset.head(3)

  result = method(y)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K


In [6]:
# Ok now Thre is NaN 

In [7]:
# In this part of code we change NaN with the most frequent value
for col in dataset.columns:
    dataset[col].fillna(dataset[col].mode()[0], inplace=True)
dataset.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,United-States,>50K


In [8]:
dataset.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [9]:
# Above we see all types of columns I think "age" don't need such big number like int64,
# so I chage to smaller number int8
dataset.astype({'age':'int8', 'education.num':'int8','hours.per.week':'int8'}).dtypes


age                 int8
workclass         object
fnlwgt             int64
education         object
education.num       int8
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week      int8
native.country    object
income            object
dtype: object

In [10]:
# Classify columns to numerical group (comparable values) and categorical group (non comparable group)
CATEGORICAL = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
NUMERICAL = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']

In [11]:
# We do this becouse it is not comfortable to work with string values and we will hash(encrypt) it to numvers,
# the same values correspond to the same number, the difference values correspond to the different numbers
# we use function fit_transform to do this

In [12]:
for i in CATEGORICAL:
    dataset[i] = LabelEncoder().fit_transform(dataset[i])

In [13]:
# Lts's see what happen, there are only numbers now
dataset.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32181,50,3,187465,1,7,0,11,1,4,0,0,0,40,38,<=50K
27179,34,3,381153,11,9,2,3,0,4,1,0,0,40,38,>50K
15173,25,3,155320,9,13,4,3,3,4,1,0,0,45,38,<=50K
27970,40,3,116632,9,13,4,11,3,4,1,0,0,40,38,<=50K
2719,36,3,143486,11,9,2,13,0,4,1,7298,0,50,38,>50K
29108,32,3,99646,11,9,2,13,0,4,1,0,0,50,38,<=50K
558,57,3,98350,14,15,2,2,0,1,1,0,1902,40,29,>50K
28506,18,3,236272,11,9,4,0,3,4,0,0,0,35,38,<=50K
14017,20,3,223515,15,10,4,7,3,4,1,0,0,40,38,<=50K
30853,64,1,31993,5,4,2,10,0,4,1,0,0,10,38,<=50K


In [14]:
# We do the same with 'income' but manually
dataset['income'] = dataset['income'].apply(lambda x: 0 if(x == '<=50K') else 1)


In [15]:
# look 'income' was changed
dataset.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
12869,20,0,55233,15,10,4,0,2,4,0,0,0,40,38,0
5077,34,3,202046,11,9,2,2,0,4,1,0,0,35,38,1
7620,22,3,73203,11,9,4,0,3,4,0,0,0,40,38,0
23099,22,6,157332,15,10,4,7,3,4,0,0,0,15,38,0
29611,42,1,1125613,11,9,0,7,1,2,1,0,0,40,38,0


In [16]:
# Divide the table into input and output,  
y = dataset.pop('income') #output
X = dataset.copy() #input
# it is seems like function in math y(x), so we have x and must find y 

In [17]:
# Split data into separate training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, train_size=0.2)

In [18]:
X_train.shape, X_test.shape

((6512, 14), (26049, 14))

In [20]:
# METHOD 1 RANDOM FOREST
# I decided to do this task with random forest, so I create the object "random_forest"
model_random_forest = RandomForestClassifier()
model_random_forest.fit(X_train, y_train) # Build a forest of trees from the training set (X_train, y_train).

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
y_pred_random_forest = model_random_forest.predict(X_test) # Predict 

In [22]:
y_pred_prob_random_forest = model_random_forest.predict_proba(X_test) # Predict proba

In [23]:
# Use Accuracy Score
metrics.accuracy_score(y_test, y_pred_random_forest)

0.8513570578525087

In [24]:
metrics.roc_auc_score(y_test, y_pred_prob_random_forest[:, 1])

0.9004500954749193

In [25]:
# We also can use Confusion Matrix 
conf_matrix = metrics.confusion_matrix(y_test, y_pred_random_forest)
conf_matrix

array([[18428,  1317],
       [ 2555,  3749]])

In [26]:
# So we have confusion matrix which look like this:
#            Predicted 0   Predicted 1
#  Actual 0   18448           1334
#  Actual 1    2439           3828

tn = conf_matrix[0,0] # true negative
fp = conf_matrix[0,1] # false positive
fn = conf_matrix[1,0] # false negative
tp = conf_matrix[1,1] # true positive

In [27]:
# how often is the classifier correct
(tn+tp)/(tn+fp+fn+tp)

0.8513570578525087

In [28]:
# Aha! Have the same result. But the plus of confusion matrix is that we can calculate variety of metrix:
# For exapmle:
# When the actual value is positive, how often is the prediction correct?
print(tp/(tp+fn))
# When the actual value is negative, how often is the prediction correct?
print(tn/(tn+fp))

0.5947017766497462
0.9332995695112687


In [29]:
# We have the same value with accuracy_score and Confussion_matrix
# But let's see how many 1 and 0 in test set (y_test)
y_test.value_counts()

0    19745
1     6304
Name: income, dtype: int64

In [30]:
# So we have about 80% of 0, this is high imbalance 
# This means that even if we just print only 0 we will gave about 80% of correct answers
# Maybe we should use other method ???

In [31]:
# Let's try roc_auc_score
# AUC is useful even when there is high imbalance (unlike classification accuracy)
metrics.roc_auc_score(y_test, y_pred_random_forest)

0.7640006730805075

In [32]:
# Now we see other result

In [33]:
# METHOD 2 XGB
# Do the same as above but using another classifier algorithm
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)
y_pred_prob_xgb = model_xgb.predict_proba(X_test)


In [34]:
metrics.accuracy_score(y_test, y_pred_xgb)

0.861760528235249

In [35]:
metrics.roc_auc_score(y_test, y_pred_prob_xgb[:, 1])

0.9156058793076189

In [36]:
# Conclusions:
# When we use random forest classifier we have accuracy about 90%
# When we use XGBCLassifier we have accuracy about 92%
# So I think better to use XGBCLassifier in this task