# Decision Tree


In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

## Data Preprocessing

In [2]:
df = pd.read_csv("../datasets/diabetes.csv")
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [5]:
X = df.values[:,:-1]
Y = df.values[:,-1]
print(X[:5])
print(Y[:5])

[[6 148 72 35 0 33.6 0.627 50]
 [1 85 66 29 0 26.6 0.351 31]
 [8 183 64 0 0 23.3 0.672 32]
 [1 89 66 23 94 28.1 0.167 21]
 [0 137 40 35 168 43.1 2.288 33]]
['tested_positive' 'tested_negative' 'tested_positive' 'tested_negative'
 'tested_positive']


## Building a Tree and Testing

In [6]:
# Train a Decision Tree
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X,Y)
print(clf.classes_)

### Parameters of DecisionTreeClassifier
## criterion : The function to measure the quality of a split. {"gini","entropy"}
## max_depth : The maximum depth of the tree
## min_samples_split : The minimum # of samples requires to split an internal node.
## min_samples_leaf : The minimum # of samples required to be at a leaf node.

['tested_negative' 'tested_positive']


In [7]:
# Output represents the predicted label of each data
test_X = [[5,120,65,30,100,24,0.3,40]]
print(test_X)
pred_y = clf.predict(test_X)
print(pred_y)

[[5, 120, 65, 30, 100, 24, 0.3, 40]]
['tested_negative']


In [8]:
# The probability of each label can also be shown

pred_prob = clf.predict_proba(test_X)
print(pred_prob)

[[1. 0.]]


## Cross-Validation

In [9]:
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,X,Y,cv=cv)

print(cv_results.mean())

0.7303485987696514


## Effect of Tree Size

In [10]:
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X,Y)
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,X,Y,cv=cv)

print(cv_results.mean()) # mean accuracy
print(clf.get_depth()) # classifier depth

0.7381408065618592
16


In [11]:
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
clf = clf.fit(X,Y)
cv = KFold(n_splits=10,
          shuffle=True,
          random_state=0)
cv_results = cross_val_score(clf,X,Y,cv=cv)

print(cv_results.mean()) # mean accuracy
print(clf.get_depth()) # classifier depth

0.7603041695146958
5
