# Decision Tree

In [32]:
import pandas as pd
import numpy as np
import graphviz
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

## Data Preprocessing

In [33]:
df = pd.read_csv("../datasets/weather.nominal.csv")
df.head()

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [34]:
X = df.values[:,:-1]
y = df.values[:,-1]

In [35]:
# OneHotEncoder

enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X)
en_X = enc.transform(X).toarray()
print(X)
print(en_X)

[['sunny' 'hot' 'high' False]
 ['sunny' 'hot' 'high' True]
 ['overcast' 'hot' 'high' False]
 ['rainy' 'mild' 'high' False]
 ['rainy' 'cool' 'normal' False]
 ['rainy' 'cool' 'normal' True]
 ['overcast' 'cool' 'normal' True]
 ['sunny' 'mild' 'high' False]
 ['sunny' 'cool' 'normal' False]
 ['rainy' 'mild' 'normal' False]
 ['sunny' 'mild' 'normal' True]
 ['overcast' 'mild' 'high' True]
 ['overcast' 'hot' 'normal' False]
 ['rainy' 'mild' 'high' True]]
[[0. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
 [0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 0. 0. 0. 1. 0. 1.]
 [0. 0. 1. 0. 0. 1. 1. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 1. 0. 1. 1. 0.]
 [0. 0. 1. 0. 0. 1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 0. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 1. 1. 0. 0. 1.]]


In [36]:
# Check the encoded result
categories = []
for x in enc.categories_:
    categories += list(x)
print(categories)

['overcast', 'rainy', 'sunny', 'cool', 'hot', 'mild', 'high', 'normal', False, True]


## Building a Tree and Testing

In [37]:
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(en_X,y)
print(clf.classes_)

['no' 'yes']


In [38]:
test = [["overcast","mild","high",True]]
en_test = enc.transform(test).toarray()
print(en_test)
pred_y = clf.predict(en_test)
print(pred_y)
pred_prob = clf.predict_proba(en_test)
print(pred_prob)

[[1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]]
['yes']
[[0. 1.]]


## Cross-validation

In [39]:
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,en_X,y,cv=cv)

print(cv_results.mean())

0.55


## Visualize Tree

In [28]:
dot_data = tree.export_graphviz(clf, out_file=None,
                               feature_names = categories,
                               class_names = clf.classes_,
                               filled=True,
                               special_characters = True)
graph = graphviz.Source(dot_data)
graph

### Parameters
## feature_names : the list of attributes of data
## class_names : the list of lavels of data
## filled : graphviz colors the node if set True
## special_characters : show the special characters (i.e., <=)

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x7faa59c2adf0>

## Effect of Tree Size

In [49]:
clf = tree.DecisionTreeClassifier(criterion="entropy")
# clf.fit(en_X,y)
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,en_X,y,cv=cv)

print(cv_results.mean()) # mean accuracy
# print(clf.get_depth()) # classifier depth

0.5


In [50]:
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
# clf = clf.fit(en_X,y)
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,en_X,y,cv=cv)

print(cv_results.mean()) # mean accuracy
# print(clf.get_depth()) # classifier depth

0.55
