# Prediction on the original Adult data set

## Imports

Importing needed packages:

In [11]:
import pandas 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn import metrics

# turn off warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [12]:
# Read the cleaned train / test splits; the first CSV column is used as the row index.
train = pandas.read_csv('../cleaned/adult_data.csv', index_col=0)
test = pandas.read_csv('../cleaned/adult_test_data.csv', index_col=0)

# Remove the training rows whose native country is ' Holand-Netherlands'.
# NOTE(review): presumably this level is absent from the test split and would
# otherwise give the one-hot encoded train/test frames different columns -- confirm.
from_holand = train['Native-country'] == ' Holand-Netherlands'
train = train.loc[~from_holand]

# Preview the first rows of both splits.
for split in (train, test):
    display(split.head())

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
1,50,Self-emp-not-inc,226802.0,Bachelors,7.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
2,38,Private,89814.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,50.0,United-States,<=50K
3,53,Private,336951.0,11th,12.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,160323.0,Bachelors,10.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,7688.0,0.0,40.0,Cuba,<=50K
5,37,Private,103497.0,Masters,10.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,30.0,United-States,<=50K


The following attributes will be used as predictors:

Age, Workclass, Education, Occupation, Ethnicity, Sex, Hours-per-week and Native-country

The attribute 50k is the target attribute.

## Data preparation

First, we extract the predictors, the labels and the target for both the training and the test dataset, and apply one-hot encoding to the categorical attributes:

In [13]:
# Columns that are not used as predictors. '50k' is the target and is extracted separately.
dropped_columns = ['fnlwgt', 'Education-number', 'Marital-status', 'Relationship',
                   'Capital-gain', 'Capital-loss', '50k']
# Encoding of the raw target strings (note the leading space) as binary class labels.
target_encoding = {' <=50K': 0, ' >50K': 1}

train_predictors = train.drop(dropped_columns, axis=1)
train_target = train['50k'].astype('category')

test_predictors = test.drop(dropped_columns, axis=1)
test_target = test['50k'].astype('category')

## saving the labels (predictor names before one-hot encoding)
labels = list(train_predictors.columns)

## One-Hot Encoding
train_predictors = pandas.get_dummies(train_predictors)
test_predictors = pandas.get_dummies(test_predictors)

# get_dummies derives its columns from the category levels present in each frame,
# so the two splits can end up with different columns. Force the test matrix onto
# the training layout: levels missing from the test split become all-zero columns,
# and levels that only occur in the test split are dropped because the fitted
# models have no feature for them. When the columns already agree this is a no-op.
test_predictors = test_predictors.reindex(columns=train_predictors.columns, fill_value=0)

# New labels (column names after one-hot encoding, shared by both splits)
labels2 = list(train_predictors.columns)

## Transform targets
train_target = train_target.map(target_encoding).astype('category')
test_target = test_target.map(target_encoding).astype('category')

## Transform everything to np.array
train_predictors = np.array(train_predictors)
train_target = np.array(train_target)

test_predictors = np.array(test_predictors)
test_target = np.array(test_target)


## Random Forest

In [14]:
# Train a random forest with a fixed seed on the training split,
# then predict the class of every row in the test split.
classifier = RandomForestClassifier(random_state=42).fit(train_predictors, train_target)
prediction = classifier.predict(test_predictors)

# Element-wise gap between true and predicted labels; the reporting cell
# counts the non-zero entries as wrong predictions.
difference = np.subtract(test_target, prediction)

In [15]:
# Summarise the random forest results: number of predictions, errors and accuracy in percent.
rf_total = len(prediction)
rf_wrong = np.count_nonzero(difference)

print(f'Total number of predictions:{rf_total}')
print(f'Total number of wrong predictions:{rf_wrong}')
print(f'Accuracy: {((rf_total - rf_wrong) / rf_total) * 100} %')

Total number of predictions:15076
Total number of wrong predictions:2595
Accuracy: 82.78721146192623 %


In [16]:
# One row per one-hot encoded feature, holding the importance reported by the fitted forest.
importance_table = pandas.DataFrame(
    {'importance': classifier.feature_importances_},
    index=labels2,
)

# Rank from most to least important and show the top 20.
feature_importances = importance_table.sort_values(by='importance', ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Age,0.395989
Hours-per-week,0.183064
Occupation_ Exec-managerial,0.030902
Sex_ Female,0.026392
Education_ Bachelors,0.025674
Occupation_ Prof-specialty,0.024386
Sex_ Male,0.022948
Education_ Masters,0.021892
Education_ HS-grad,0.016009
Education_ Prof-school,0.015002


## Naive Bayes

In [17]:
# Fit a categorical naive Bayes model on the same one-hot encoded predictors
# and predict the test split.
# NOTE(review): the feature matrix also contains the numeric columns Age and
# Hours-per-week, which CategoricalNB will treat as category codes -- confirm
# this is intended.
model = CategoricalNB().fit(train_predictors, train_target)
naive_prediction = model.predict(test_predictors)

# Element-wise gap between predicted and true labels; the reporting cell
# counts the non-zero entries as wrong predictions.
naive_difference = np.subtract(naive_prediction, test_target)

In [18]:
# Summarise the naive Bayes results: number of predictions, errors and accuracy in percent.
nb_total = len(naive_prediction)
nb_wrong = np.count_nonzero(naive_difference)

print(f'Total number of predictions:{nb_total}')
print(f'Total number of wrong predictions:{nb_wrong}')
print(f'Accuracy: {((nb_total - nb_wrong) / nb_total) * 100} %')

# Cross-check with scikit-learn's own accuracy (reported as a fraction, not in percent).
nb_accuracy = metrics.accuracy_score(test_target, naive_prediction)
print("Accuracy:", nb_accuracy)


Total number of predictions:15076
Total number of wrong predictions:3281
Accuracy: 78.23693287344123 %
Accuracy: 0.7823693287344123
