### Importing necessary libraries

In [1]:
import os
from io import StringIO

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import pydotplus
import sklearn.metrics
from IPython.display import Image
from pandas import Series, DataFrame
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



### Data Engineering

In [19]:
# Column labels for the raw file; the last one ('pred_age') is the column
# later used as the prediction target.
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'pred_age',
]

# Load the dataset: the file is read without a header row, so the labels
# are attached after reading.
data = pd.read_csv("adult.data.txt", header=None)
data.columns = columns

In [20]:
# No empty data
# data_clean = data.dropna()

In [21]:
# Preview the first five rows of the freshly loaded table
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,pred_age
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
# The 'age' column is removed because it simplifies classification too much.
# The frame is modified in place, so later cells see `data` without 'age'.
data.drop(labels=['age'], axis=1, inplace=True)

### Processing categorical variables

In [23]:
# Replace string-valued categorical columns with integer category codes.

# 'education' is dropped outright: the table still carries 'education-num',
# which already holds an integer representation of it.
data.drop(labels=['education'], axis=1, inplace=True)

categorical_vars = (
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
)

# For each categorical column: take it out of the frame, convert it to the
# pandas 'category' dtype, and put its integer codes back as the new first
# column named "<column>_cat".
for column in categorical_vars:
    encoded = data.pop(column).astype('category').cat.codes
    data.insert(0, "{}_cat".format(column), encoded)

# First 5 rows after encoding
data.head(n=5)

Unnamed: 0,native-country_cat,sex_cat,race_cat,relationship_cat,occupation_cat,marital-status_cat,workclass_cat,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,pred_age
0,39,1,4,1,1,4,7,77516,13,2174,0,40,<=50K
1,39,1,4,0,4,2,6,83311,13,0,0,13,<=50K
2,39,1,4,1,6,0,4,215646,9,0,0,40,<=50K
3,39,1,2,0,6,2,4,234721,7,0,0,40,<=50K
4,5,0,2,5,10,2,4,338409,13,0,0,40,<=50K


In [24]:
# Show the dtype of every column after the categorical encoding step
data.dtypes

native-country_cat      int8
sex_cat                 int8
race_cat                int8
relationship_cat        int8
occupation_cat          int8
marital-status_cat      int8
workclass_cat           int8
fnlwgt                 int64
education-num          int64
capital-gain           int64
capital-loss           int64
hours-per-week         int64
pred_age              object
dtype: object

In [25]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of `data`
data.describe()

Unnamed: 0,native-country_cat,sex_cat,race_cat,relationship_cat,occupation_cat,marital-status_cat,workclass_cat,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,36.718866,0.669205,3.665858,1.446362,6.57274,2.611836,3.868892,189778.4,10.080679,1077.648844,87.30383,40.437456
std,7.823782,0.470506,0.848806,1.606771,4.228857,1.506222,1.45596,105550.0,2.57272,7385.292085,402.960219,12.347429
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12285.0,1.0,0.0,0.0,1.0
25%,39.0,0.0,4.0,0.0,3.0,2.0,4.0,117827.0,9.0,0.0,0.0,40.0
50%,39.0,1.0,4.0,1.0,7.0,2.0,4.0,178356.0,10.0,0.0,0.0,40.0
75%,39.0,1.0,4.0,3.0,10.0,4.0,4.0,237051.0,12.0,0.0,0.0,45.0
max,41.0,1.0,4.0,5.0,14.0,6.0,8.0,1484705.0,16.0,99999.0,4356.0,99.0


### Modelling and Prediction

In [26]:
# Split into training and testing sets.
# Every column except the last one is a predictor; the last column is the target.
predictors = data.iloc[:, :-1]
targets = data.iloc[:, -1]

# 40% of the rows are held out for testing. A fixed random_state makes the
# shuffle deterministic, so the split (and every metric computed from it)
# is reproducible after Restart & Run All.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=42
)

In [27]:
# Report the shape of each split: training predictors and targets first,
# then testing predictors and targets.
for split_part in (pred_train, tar_train, pred_test, tar_test):
    print(split_part.shape)

(19536, 12)
(19536,)
(13025, 12)
(13025,)


### Building model on training data

In [28]:
# Decision tree trained on all predictors. A fixed random_state keeps the
# fitted tree (and therefore the reported metrics) reproducible between runs.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(pred_train, tar_train)

# Predicted labels for the held-out test rows
predictions = classifier.predict(pred_test)

### Testing model

In [29]:
# Confusion matrix comparing the held-out test labels with the model's predictions
sklearn.metrics.confusion_matrix(tar_test, predictions)

array([[8624, 1331],
       [1175, 1895]])

In [30]:
# Accuracy score of the predictions against the held-out test labels
sklearn.metrics.accuracy_score(tar_test, predictions)

0.80760076775431866

In [45]:
# Write the fitted tree to graph1.dot so it can be rendered with Graphviz
tree.export_graphviz(classifier, out_file="graph1.dot")

### Rendering image via Terminal
The notebook could not render this image inline, so the graph was rendered from the terminal (Terminal.app) instead.<br>
Command: `dot -Tpng graph1.dot -o graph1.png`<br><br>
The resulting image is located in the same directory as this notebook.

### Building new model on fewer features (pruning tree)

In [58]:
# Using less features
new_predictors = data[['race_cat', 'workclass_cat', 'education-num']]
new_pred_train, new_pred_test, new_tar_train, new_tar_test = train_test_split(new_predictors, targets, test_size=.4)

# Building new model
new_classifier=DecisionTreeClassifier()
new_classifier=classifier.fit(new_pred_train, new_tar_train)
new_predictions=classifier.predict(new_pred_test)

In [59]:
sklearn.metrics.confusion_matrix(new_tar_test, new_predictions)

array([[9452,  425],
       [2455,  693]])

In [60]:
sklearn.metrics.accuracy_score(new_tar_test, new_predictions)

0.77888675623800385

In [61]:
tree.export_graphviz(classifier, out_file="graph2.dot")