# Prediction anonymized adult data set

## Imports

Importing needed packages:

In [1]:
import pandas 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn import metrics

# turn of warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
train= pandas.read_csv('../Anonymized_finished/train.csv', index_col=0);
test= pandas.read_csv('../Anonymized_finished/test.csv', index_col=0);

display(train.head())
display(test.head())

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,"[40, 60[","{Federal-gov, Local-gov, State-gov}",92141.0,"{Assoc-acdm, Assoc-voc}","[1, 17[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,40.0,United-States,<=50K
1,"[40, 60[","{Federal-gov, Local-gov, State-gov}",131302.0,"{Assoc-acdm, Assoc-voc}","[1, 17[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,44.0,United-States,<=50K
2,"[40, 60[","{Federal-gov, Local-gov, State-gov}",139161.0,"{Assoc-acdm, Assoc-voc}","[1, 17[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,1741.0,40.0,United-States,<=50K
3,"[20, 40[","{Federal-gov, Local-gov, State-gov}",255830.0,"{Assoc-acdm, Assoc-voc}","[1, 17[","{Divorced, Windowed, Never-married}",Adm-clerical,Own-child,Black,Female,0.0,0.0,45.0,United-States,<=50K
4,"[20, 40[","{Federal-gov, Local-gov, State-gov}",272986.0,"{Assoc-acdm, Assoc-voc}","[1, 17[","{Divorced, Windowed, Never-married}",Adm-clerical,Own-child,Black,Female,0.0,0.0,8.0,United-States,<=50K


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,"[20, 40[","{Federal-gov, Local-gov, State-gov]",205737.0,"{9th, 10th, 11th, 12th, HS-grad}","[1, 17[","{Divorced, Widowed, Never-married}",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K
1,"[20, 40[","{Federal-gov, Local-gov, State-gov]",334291.0,"{9th, 10th, 11th, 12th, HS-grad}","[1, 17[","{Divorced, Widowed, Never-married}",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K
2,"[20, 40[","{Federal-gov, Local-gov, State-gov]",138162.0,"{9th, 10th, 11th, 12th, HS-grad}","[1, 17[","{Divorced, Widowed, Never-married}",Adm-clerical,Unmarried,Black,Female,0.0,0.0,45.0,United-States,<=50K
3,"[40, 60[","{Federal-gov, Local-gov, State-gov]",186934.0,"{9th, 10th, 11th, 12th, HS-grad}","[1, 17[","{Divorced, Widowed, Never-married}",Adm-clerical,Unmarried,Black,Female,0.0,0.0,50.0,United-States,<=50K
4,"[40, 60[","{Federal-gov, Local-gov, State-gov]",116219.0,"{9th, 10th, 11th, 12th, HS-grad}","[1, 17[","{Divorced, Widowed, Never-married}",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K


Follwing attributes will be taken as predictor:

Age, Workclass, Education, Occupation, Ethnicity, Sex, Hours-per-week and Native-country

The attribute 50k is the target attribute.

## Data preparation

First, we will extract the predictors, labels and the target for the train and for the test dataset and also apply One-Hot Encoding for categorical attributes:

In [3]:
train_predictors = train.drop(['fnlwgt','Education-number','Marital-status','Relationship','Capital-gain','Capital-loss','50k'],axis=1)
train_target = train['50k'].astype('category')

test_predictors = test.drop(['fnlwgt','Education-number','Marital-status','Relationship','Capital-gain','Capital-loss','50k'],axis=1)
test_target = test['50k'].astype('category')


## saving the labels
labels= list(train_predictors.columns)

## One-Hot Encoding

train_predictors= pandas.get_dummies(train_predictors)
test_predictors= pandas.get_dummies(test_predictors)

labels2=list(train_predictors.columns)
## Transform targets
for c in train_target.cat.categories:
    print(c)
train_target = train_target.map({'<=50K':0, '>50K':1}).astype('category')
test_target = test_target.map({'<=50K':0, '>50K':1}).astype('category')
for c in train_target.cat.categories:
    print(c)
## Transform everythin to np.array
train_predictors=np.array(train_predictors)
train_target=np.array(train_target)

test_predictors=np.array(test_predictors)
test_target=np.array(test_target)



<=50K
>50K
0
1


## Random Forest

In [4]:
classifier = RandomForestClassifier(random_state=42)
classifier = classifier.fit(train_predictors,train_target)

prediction = classifier.predict(test_predictors)

difference = prediction - test_target

In [5]:
print('Total number of predictions:' + str(len(prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(difference))) 
print('Accuracy: '+ str(((len(prediction) - np.count_nonzero(difference))/len(prediction))*100)+' %')


Total number of predictions:11231
Total number of wrong predictions:2488
Accuracy: 77.84703054046834 %


In [6]:
feature_importances = pandas.DataFrame(classifier.feature_importances_,
                                   index = labels2,
                                    columns=['importance']).sort_values('importance',ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Hours-per-week,0.324865
"Education_{Bachelors, Masters}",0.066435
"Age_[40, 60[",0.064502
Occupation_Exec-managerial,0.05005
Sex_Female,0.047325
"Education_{9th, 10th, 11th, 12th, HS-grad}",0.046631
Occupation_Prof-specialty,0.042889
Sex_Male,0.04212
"Age_[20, 40[",0.036502
Occupation_Other-service,0.019428


## Naive Bayes

In [7]:
model = CategoricalNB()
model = model.fit(train_predictors,train_target)

naive_prediction= model.predict(test_predictors)
naive_difference = naive_prediction - test_target

In [8]:
print('Total number of predictions:' + str(len(naive_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(naive_difference))) 
print('Accuracy: '+ str(((len(naive_prediction) - np.count_nonzero(naive_difference))/len(naive_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, naive_prediction))


Total number of predictions:11231
Total number of wrong predictions:2578
Accuracy: 77.04567714362034 %
Accuracy: 0.7704567714362034
