In [2]:
from random import random
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import pydot # need to install

# Load the liver dataset and preview its first rows and column names.
df = pd.read_csv('data/liver.csv')
print(df.head())
print(df.columns)  # column names

# 'category' is the prediction target; every other column is a feature.
df_y = df['category']
df_X = df.drop(columns='category')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    df_X,
    df_y,
    test_size=0.3,
    random_state=1234,
)

# Baseline model: a decision tree with no depth limit.
model = DecisionTreeClassifier(random_state=1234).fit(train_X, train_y)
model.predict(test_X)

   category  mcv  alkphos  sgpt  sgot  gammagt  drinks
0         0   85       64    59    32       23     0.0
1         0   86       54    33    16       54     0.0
2         0   91       78    34    24       36     0.0
3         0   87       70    12    28       10     0.0
4         0   98       55    13    17       17     0.0
Index(['category', 'mcv', 'alkphos', 'sgpt', 'sgot', 'gammagt', 'drinks'], dtype='object')


array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

In [3]:
# Performance evaluation: mean accuracy of the fitted model on each split.
train_acc = model.score(train_X, train_y)
test_acc = model.score(test_X, test_y)

print('Train accuracy :', train_acc)
print('Test accuracy :', test_acc)

Train accuracy : 1.0
Test accuracy : 0.6442307692307693


#### - 결과를 보면 train accuracy(1.0)에 비해 test accuracy(약 0.64)가 훨씬 낮음. 즉 과적합(overfitting)이 일어남.
##### 이를 완화하기 위해 조율모수(hyperparameter) max_depth를 설정하여 decision tree의 최대 깊이를 제한함. 이를 통해 overfitting을 줄임.

In [4]:
# Refit with the tuning parameter max_depth=4, which caps the maximum depth of the decision tree.
model = DecisionTreeClassifier(max_depth=4, random_state=1234).fit(train_X, train_y)
pred_y = model.predict(test_X)


In [5]:
# Report mean accuracy of the current model on the training split, then the test split.
for caption, features, labels in (
    ('Train accuracy :', train_X, train_y),
    ('Test accuracy :', test_X, test_y),
):
    print(caption, model.score(features, labels))

Train accuracy : 0.8008298755186722
Test accuracy : 0.6923076923076923


##### 이전과 비교해서 test accuracy가 약 0.05 증가함(0.644 → 0.692). train accuracy(0.80)와의 차이도 줄어 overfitting이 완화됨.

In [1]:
# Disabled: export the fitted tree to 'tree_model.dot' and render it as 'decision_tree.png' via pydot.
# NOTE(review): class_names is given the plain string 'category'; export_graphviz documents it as a
# list of class names (or True) — confirm and pass the actual class labels before re-enabling.
# export_graphviz(model, out_file='tree_model.dot', feature_names = train_X.columns, class_names = 'category', rounded = True, proportion = False, precision = 2, filled = True)
# (graph,) = pydot.graph_from_dot_file('tree_model.dot', encoding='UTF-8')
# graph.write_png('decision_tree.png')