# Prediction anonymized adult data set

## Imports

Importing needed packages:

In [13]:
import pandas 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

# turn of warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [14]:
train= pandas.read_csv('../Anonymized_finished/train02.csv', index_col=0);
test= pandas.read_csv('../Anonymized_finished/test02.csv', index_col=0);

display(train.head())
display(test.head())

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,"[40, 60[","{Federal-gov, Local-gov, State-gov}",92141.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,40.0,United-States,<=50K
1,"[40, 60[","{Federal-gov, Local-gov, State-gov}",131302.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,44.0,United-States,<=50K
2,"[40, 60[","{Federal-gov, Local-gov, State-gov}",139161.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Divorced, Windowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,1741.0,40.0,United-States,<=50K
3,"[20, 40[","{Federal-gov, Local-gov, State-gov}",255830.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Divorced, Windowed, Never-married}",Adm-clerical,Own-child,Black,Female,0.0,0.0,45.0,United-States,<=50K
4,"[20, 40[","{Federal-gov, Local-gov, State-gov}",272986.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Divorced, Windowed, Never-married}",Adm-clerical,Own-child,Black,Female,0.0,0.0,8.0,United-States,<=50K


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-number,Marital-status,Occupation,Relationship,Ethnicity,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,50k
0,"[40, 60[","{Federal-gov, Local-gov, State-gov}",96652.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Married-AF-spouse, Married-civ-spouse, Marrie...",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K.
1,"[40, 60[","{Federal-gov, Local-gov, State-gov}",265386.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Married-AF-spouse, Married-civ-spouse, Marrie...",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K.
2,"[40, 60[","{Federal-gov, Local-gov, State-gov}",47270.0,"{Assoc-acdm, Assoc-voc}","[10, 15[","{Married-AF-spouse, Married-civ-spouse, Marrie...",Adm-clerical,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K.
3,"[20, 40[","{Federal-gov, Local-gov, State-gov}",90872.0,"{Prof-school, Some-college}","[10, 15[","{Divorced, Widowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,40.0,United-States,<=50K.
4,"[20, 40[","{Federal-gov, Local-gov, State-gov}",243177.0,"{Prof-school, Some-college}","[10, 15[","{Divorced, Widowed, Never-married}",Adm-clerical,Not-in-family,Black,Female,0.0,0.0,40.0,United-States,<=50K.


Follwing attributes will be taken as predictor:

Age, Workclass, Education, Occupation, Ethnicity, Sex, Hours-per-week and Native-country

The attribute 50k is the target attribute.

## Data preparation

First, we will extract the predictors, labels and the target for the train and for the test dataset and also apply One-Hot Encoding for categorical attributes:

In [15]:
#Removing Rows with Laos because test predictors does not contain it -> Once Hot encoding 
train= train[train['Native-country']!='Laos']

train_predictors = train.drop(['fnlwgt','Marital-status','Relationship','Capital-gain','Capital-loss','50k'],axis=1)
train_target = train['50k'].astype('category')

test_predictors = test.drop(['fnlwgt','Marital-status','Relationship','Capital-gain','Capital-loss','50k'],axis=1)
test_target = test['50k'].astype('category')

## saving the labels
labels= list(train_predictors.columns)


## One-Hot Encoding
enc=np.array(["Age","Workclass","Education","Education-number","Occupation","Ethnicity","Native-country"])

train_predictors= pandas.get_dummies(train_predictors,columns=enc)
test_predictors= pandas.get_dummies(test_predictors,columns=enc)


#New Labels
labels2=list(train_predictors.columns)
labels3=list(test_predictors.columns)

## Transform targets
train_target = train_target.map({'<=50K': 0, '>50K':1}).astype('category')
test_target = test_target.map({'<=50K.':0, '>50K.':1}).astype('category')
test_predictors['Sex']=test_predictors['Sex'].map({'Male': 0, 'Female':1})
train_predictors['Sex']=train_predictors['Sex'].map({'Male': 0, 'Female':1})

## Transform everythin to np.array
train_predictors=np.array(train_predictors)
train_target=np.array(train_target)

test_predictors=np.array(test_predictors)
test_target=np.array(test_target)


## Random Forest
Train & predict with model

In [16]:
classifier = RandomForestClassifier(random_state=42)
classifier = classifier.fit(train_predictors,train_target)

prediction = classifier.predict(test_predictors)

difference = prediction - test_target

In [17]:
feature_importances = pandas.DataFrame(classifier.feature_importances_,
                                   index = labels2,
                                    columns=['importance']).sort_values('importance',ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Hours-per-week,0.319428
Sex,0.085941
"Age_[40, 60[",0.06471
"Education_{Bachelors, Masters}",0.057433
Occupation_Exec-managerial,0.041749
"Age_[20, 40[",0.038319
"Education-number_[15, 17[",0.028911
Occupation_Prof-specialty,0.02809
"Education-number_[5, 10[",0.026618
"Education-number_[10, 15[",0.022995


#### Prediction

In [18]:
print('Total number of predictions:' + str(len(prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(difference))) 
print('Accuracy: '+ str(((len(prediction) - np.count_nonzero(difference))/len(prediction))*100)+' %')


Total number of predictions:12077
Total number of wrong predictions:2681
Accuracy: 77.80077833899148 %


In [19]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,prediction)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,7836,1022
Actual Positive,1659,1560


In [20]:
#metrics
print(classification_report(test_target, prediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      8858
           1       0.60      0.48      0.54      3219

    accuracy                           0.78     12077
   macro avg       0.71      0.68      0.70     12077
weighted avg       0.77      0.78      0.77     12077



## Naive Bayes
Train & predict with model

In [21]:
model = CategoricalNB()
model = model.fit(train_predictors,train_target)

naive_prediction= model.predict(test_predictors)
naive_difference = naive_prediction - test_target

#### Prediction

In [22]:
print('Total number of predictions:' + str(len(naive_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(naive_difference))) 
print('Accuracy: '+ str(((len(naive_prediction) - np.count_nonzero(naive_difference))/len(naive_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, naive_prediction))


Total number of predictions:12077
Total number of wrong predictions:2892
Accuracy: 76.0536557091993 %
Accuracy: 0.7605365570919931


In [23]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,naive_prediction)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,7105,1753
Actual Positive,1139,2080


In [24]:
#metrics
print(classification_report(test_target, naive_prediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83      8858
           1       0.54      0.65      0.59      3219

    accuracy                           0.76     12077
   macro avg       0.70      0.72      0.71     12077
weighted avg       0.78      0.76      0.77     12077



## KNN 
Train & predict with model

In [25]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(train_predictors, train_target)

knn_prediction=knn_model.predict(test_predictors)
knn_difference = knn_prediction - test_target

#### Prediction

In [26]:
print('Total number of predictions:' + str(len(knn_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(knn_difference))) 
print('Accuracy: '+ str(((len(knn_prediction) - np.count_nonzero(knn_difference))/len(knn_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, knn_prediction))


Total number of predictions:12077
Total number of wrong predictions:3023
Accuracy: 74.96894924236152 %
Accuracy: 0.7496894924236152


In [27]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,knn_prediction)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,7694,1164
Actual Positive,1859,1360


In [28]:
#metrics
print(classification_report(test_target, knn_prediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      8858
           1       0.54      0.42      0.47      3219

    accuracy                           0.75     12077
   macro avg       0.67      0.65      0.65     12077
weighted avg       0.73      0.75      0.74     12077



## Voting Classifier
Train & predict with model

In [29]:
voting_pred=[]
i=0
while i < len(prediction):
    tmp=prediction[i]+naive_prediction[i]+knn_prediction[i]
    if tmp>1:
        voting_pred.append(1)
    else:
        voting_pred.append(0)
    i += 1
voting_diff=voting_pred-test_target

#### Prediction

In [30]:
print('Total number of predictions:' + str(len(voting_pred)))
print('Total number of wrong predictions:' + str(np.count_nonzero(voting_diff))) 
print('Accuracy: '+ str(((len(knn_prediction) - np.count_nonzero(voting_diff))/len(voting_pred))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, voting_pred))


Total number of predictions:12077
Total number of wrong predictions:2676
Accuracy: 77.84217934917612 %
Accuracy: 0.7784217934917612


In [31]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,voting_pred)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,7754,1104
Actual Positive,1572,1647


In [32]:
#metrics
print(classification_report(test_target, voting_pred,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      8858
           1       0.60      0.51      0.55      3219

    accuracy                           0.78     12077
   macro avg       0.72      0.69      0.70     12077
weighted avg       0.77      0.78      0.77     12077

