In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [66]:
dataset = pd.read_csv('adult.csv') #read file with data set
# Let's see first 10 lines of our table just to know whith what we will work
dataset.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [67]:
# We can see that there are 32561 instances(lines) and 15 attributes(columns) in the data set.
dataset.shape

(32561, 15)

In [68]:
# This is all columns from data set
dataset.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [69]:
# We can see that our table has missing values "?"(for example line 0, line 2)
# We have to correct it, because if not our program will think that person 0 and person 2 
# have the same workclass which call "?" and have the same occupation also "?",
# I guess it is not true so I just change all "?" to NaN value
dataset[dataset == "?"] = np.nan
dataset.head(3)

  result = method(y)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K


In [70]:
# Ok now Thre is NaN 

In [84]:
# In this part of code we change NaN with the most frequent value
for col in dataset.columns:
    dataset[col].fillna(dataset[col].mode()[0], inplace=True)

In [85]:
dataset.dtypes

age                int64
workclass          int64
fnlwgt             int64
education          int64
education.num      int64
marital.status     int64
occupation         int64
relationship       int64
race               int64
sex                int64
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country     int64
income            object
dtype: object

In [86]:
# Above we see all types of columns I think "age" don't need such big number like int64,
# so I chage to smaller number int8
dataset.astype({'age':'int8', 'education.num':'int8','hours.per.week':'int8'}).dtypes


age                 int8
workclass          int64
fnlwgt             int64
education          int64
education.num       int8
marital.status     int64
occupation         int64
relationship       int64
race               int64
sex                int64
capital.gain       int64
capital.loss       int64
hours.per.week      int8
native.country     int64
income            object
dtype: object

In [87]:
# Classify columns to numerical group (comparable values) and categorical group (non comparable group)
CATEGORICAL = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
NUMERICAL = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']

In [88]:
# We do this becouse it is not comfortable to work with string values and we will hash(encrypt) it to numvers,
# the same values correspond to the same number, the difference values correspond to the different numbers
# we use function fit_transform to do this

In [89]:
for i in CATEGORICAL:
    dataset[i] = LabelEncoder().fit_transform(dataset[i])

In [90]:
# Lts's see what happen, there are only numbers now
dataset.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
6260,21,3,191460,1,7,4,7,4,2,0,0,0,40,38,<=50K
31324,53,5,187830,8,11,5,2,1,4,1,0,0,40,30,<=50K
24608,30,3,94413,15,10,0,13,1,4,1,0,0,30,38,<=50K
18502,25,3,50053,11,9,4,7,1,2,1,0,0,40,23,<=50K
1887,45,3,205100,9,13,2,3,0,4,1,15024,0,60,38,>50K
2207,36,3,201769,1,7,4,10,1,2,1,13550,0,40,38,>50K
18660,23,3,146399,9,13,4,9,1,4,1,0,0,55,38,<=50K
26960,39,6,171482,9,13,4,9,1,4,1,0,0,40,38,>50K
3504,73,3,242769,11,9,2,6,0,4,1,3471,0,40,8,<=50K
13009,29,5,144063,5,4,2,4,0,4,1,0,0,72,38,<=50K


In [15]:
# We do the same with 'income' but manually
dataset['income'] = dataset['income'].apply(lambda x: 0 if(x == '<=50K') else 1)


In [18]:
# look 'income' was changed
dataset.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1005,24,4,276851,11,9,0,11,3,4,0,0,1762,40,39,0
3699,40,4,220977,10,16,2,4,0,1,1,3103,0,40,19,1
26991,40,4,174395,11,9,2,3,0,4,1,0,0,40,39,0
21671,34,4,191930,15,10,2,7,0,2,1,0,0,40,39,0
26126,52,4,270728,5,4,2,8,0,4,1,0,0,48,5,0


In [60]:
# Divide the table into input and output,  
X = dataset.copy() #input
y = dataset.pop('income') #output
# it is seems like function in math y(x), so we have x and must find y 

In [21]:
# Split data into separate training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, train_size=0.2)

In [22]:
X_train.shape, X_test.shape

((6512, 14), (26049, 14))

In [23]:
# I decided to do this task with random forest, so I create the object "random_forest"
random_forest = RandomForestClassifier()


In [62]:
random_forest.fit(X_train, y_train) # Build a forest of trees from the training set (X_train, y_train).

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
y_pred_class = random_forest.predict(X_test) # Predict class for X

In [26]:
# Use Accuracy Score
metrics.accuracy_score(y_test, y_pred_class)

0.8551575876233253

In [34]:
# We also can use Confusion Matrix 
conf_matrix = metrics.confusion_matrix(y_test, y_pred_class)
conf_matrix

array([[18448,  1334],
       [ 2439,  3828]])

In [38]:
# So we have confusion matrix which look like this:
#            Predicted 0   Predicted 1
#  Actual 0   18448           1334
#  Actual 1    2439           3828

tn = conf_matrix[0,0] # true negative
fp = conf_matrix[0,1] # false positive
fn = conf_matrix[1,0] # false negative
tp = conf_matrix[1,1] # true positive

In [39]:
# how often is the classifier correct
(tn+tp)/(tn+fp+fn+tp)

0.8551575876233253

In [47]:
# Aha! Have the same result. But the plus of confusion matrix is that we can calculate variety of metrix:
# For exapmle:
# When the actual value is positive, how often is the prediction correct?
print(tp/(tp+fn))
# When the actual value is negative, how often is the prediction correct?
print(tn/(tn+fp))

0.610818573480134
0.932564958042665


In [44]:
# We have the same value with accuracy_score and Confussion_matrix
# But let's see how many 1 and 0 in test set (y_test)
y_test.value_counts()

0    19782
1     6267
Name: income, dtype: int64

In [45]:
# So we have about 80% of 0, this is high imbalance 
# This means that even if we just print only 0 we will gave about 80% of correct answers
# Maybe we should use other method ???

In [46]:
# Let's try roc_auc_score
# AUC is useful even when there is high imbalance (unlike classification accuracy)
metrics.roc_auc_score(y_test, y_pred_class)

0.7716917657613996

In [None]:
# Now we see other result