# Prediction on the original Adult data set

## Imports

Importing needed packages:

In [1]:
import pandas 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# turn off warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Read the cleaned train/test CSV files, using their first column as the index.
train = pandas.read_csv('../cleaned/adult_data.csv', index_col=0)
test = pandas.read_csv('../cleaned/adult_test_data.csv', index_col=0)

# Keep only the training rows whose 'Native-country' is not ' Holand-Netherlands'.
# NOTE(review): presumably this value does not occur in the test set, which would
# give the two one-hot encoded frames different columns — confirm.
keep_rows = train['Native-country'].ne(' Holand-Netherlands')
train = train[keep_rows]

# Preview the first rows of both frames.
for frame in (train, test):
    display(frame.head())

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.


The following attributes will be used as predictors:

Age, Workclass, Education, Education-number, Occupation, Ethnicity, Sex, Hours-per-week and Native-country

The attribute 50k is the target attribute.

## Data preparation

First, we will extract the predictors, labels and the target for the train and for the test dataset and also apply One-Hot Encoding for categorical attributes:

In [3]:
# Columns removed from the predictor frames. '50k' is the target and is
# extracted separately below; every other column (including
# 'Education-number') stays in as a predictor.
dropped_columns = ['fnlwgt','Marital-status','Relationship','Capital-gain','Capital-loss','50k']

train_predictors = train.drop(dropped_columns, axis=1)
train_target = train['50k'].astype('category')

test_predictors = test.drop(dropped_columns, axis=1)
test_target = test['50k'].astype('category')


## One-Hot Encoding

enc = np.array(['Workclass', 'Education', 'Occupation', 'Ethnicity', 'Native-country'])

train_predictors = pandas.get_dummies(train_predictors, columns=enc)
test_predictors = pandas.get_dummies(test_predictors, columns=enc)

# get_dummies only creates columns for the categories present in the frame it
# is given, so train and test can end up with different columns. Align the
# test frame to the training layout: categories missing from the test data
# become all-zero columns, categories unseen during training are dropped.
# When both frames already have the same columns this is a no-op.
test_predictors = test_predictors.reindex(columns=train_predictors.columns, fill_value=0)

# Saving label names for later use (column order of the training matrix)
labels = list(train_predictors.columns)

## Transform binary categories to 0 and 1
# The test labels carry a trailing '.', hence the two different mappings.
train_target = train_target.map({' <=50K': 0, ' >50K':1}).astype('category')
test_target = test_target.map({' <=50K.':0, ' >50K.':1}).astype('category')
test_predictors['Sex'] = test_predictors['Sex'].map({' Male': 0, ' Female':1})
train_predictors['Sex'] = train_predictors['Sex'].map({' Male': 0, ' Female':1})

## Transform everything to np.array
train_predictors = np.array(train_predictors)
train_target = np.array(train_target)

test_predictors = np.array(test_predictors)
test_target = np.array(test_target)


## Random Forest

In [4]:
# Fit a 10-tree random forest (fixed seed) on the training data and predict the test set.
classifier = RandomForestClassifier(n_estimators=10, random_state=42).fit(
    train_predictors,
    train_target,
)
prediction = classifier.predict(test_predictors)

# Element-wise difference: non-zero wherever the prediction differs from the true label.
difference = test_target - prediction

In [5]:
# Summarise the random forest result: prediction count, error count, accuracy in percent.
rf_total = len(prediction)
rf_wrong = np.count_nonzero(difference)
print(f'Total number of predictions:{rf_total}')
print(f'Total number of wrong predictions:{rf_wrong}')
print(f'Accuracy: {((rf_total - rf_wrong) / rf_total) * 100} %')

Total number of predictions:15060
Total number of wrong predictions:3381
Accuracy: 77.54980079681275 %


In [6]:
# One row per predictor column, holding the importance reported by the fitted forest.
importance_table = pandas.DataFrame(
    data=classifier.feature_importances_,
    index=labels,
    columns=['importance'],
)

# Most important features first; show the top 20.
feature_importances = importance_table.sort_values('importance', ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Age,0.39322
Hours-per-week,0.179608
Education-number,0.073479
Sex,0.048585
Occupation_ Exec-managerial,0.027057
Occupation_ Prof-specialty,0.020601
Education_ Masters,0.015121
Workclass_ Private,0.014026
Occupation_ Other-service,0.013564
Education_ Bachelors,0.013535


## Naive Bayes

In [7]:
# Fit a categorical naive Bayes model on the training data and predict the test set.
# NOTE(review): the numeric predictor columns are passed to CategoricalNB
# unchanged — confirm that this is intended.
model = CategoricalNB().fit(train_predictors, train_target)
naive_prediction = model.predict(test_predictors)

# Element-wise difference: non-zero wherever the prediction differs from the true label.
naive_difference = naive_prediction - test_target

In [8]:
# Summarise the naive Bayes result; the last line cross-checks with sklearn's accuracy_score.
nb_total = len(naive_prediction)
nb_wrong = np.count_nonzero(naive_difference)
print(f'Total number of predictions:{nb_total}')
print(f'Total number of wrong predictions:{nb_wrong}')
print(f'Accuracy: {((nb_total - nb_wrong) / nb_total) * 100} %')
print("Accuracy:", metrics.accuracy_score(test_target, naive_prediction))


Total number of predictions:15060
Total number of wrong predictions:3297
Accuracy: 78.10756972111554 %
Accuracy: 0.7810756972111553


## KNN 

In [9]:
# Fit a 3-nearest-neighbours classifier on the training data and predict the test set.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(train_predictors, train_target)
knn_prediction = knn_model.predict(test_predictors)

# Element-wise difference: non-zero wherever the prediction differs from the true label.
knn_difference = knn_prediction - test_target

In [10]:
# Summarise the KNN result; the last line cross-checks with sklearn's accuracy_score.
knn_total = len(knn_prediction)
knn_wrong = np.count_nonzero(knn_difference)
print(f'Total number of predictions:{knn_total}')
print(f'Total number of wrong predictions:{knn_wrong}')
print(f'Accuracy: {((knn_total - knn_wrong) / knn_total) * 100} %')
print("Accuracy:", metrics.accuracy_score(test_target, knn_prediction))


Total number of predictions:15060
Total number of wrong predictions:3568
Accuracy: 76.30810092961488 %
Accuracy: 0.7630810092961487


## Voting Classifier

In [11]:
# Majority vote over the three classifiers. Each prediction is 0 or 1, so the
# per-sample sum is the number of models voting for class 1; the ensemble
# predicts 1 when at least two of the three agree (sum > 1).
# Vectorised with numpy instead of a manual index loop.
vote_counts = np.asarray(prediction) + np.asarray(naive_prediction) + np.asarray(knn_prediction)
voting_pred = (vote_counts > 1).astype(int)

# Element-wise difference: non-zero wherever the ensemble differs from the true label.
voting_diff = voting_pred - test_target

In [12]:
# Summarise the voting result; the last line cross-checks with sklearn's accuracy_score.
# The accuracy is computed from the voting predictions only (the original
# numerator used len(knn_prediction), a copy-paste leftover from the KNN cell).
vote_total = len(voting_pred)
vote_wrong = np.count_nonzero(voting_diff)
print('Total number of predictions:' + str(vote_total))
print('Total number of wrong predictions:' + str(vote_wrong))
print('Accuracy: ' + str(((vote_total - vote_wrong) / vote_total) * 100) + ' %')
print("Accuracy:", metrics.accuracy_score(test_target, voting_pred))


Total number of predictions:15060
Total number of wrong predictions:3202
Accuracy: 78.73837981407702 %
Accuracy: 0.7873837981407702
