# 의사결정나무 실습

---

#####  8.7.5 wine 데이터를 활용하여 wine 종류를 구분하는 실습

# Dataset import
## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import datasets
# Load the wine dataset bundled with scikit-learn; later cells read its
# .feature_names, .data and .target attributes.
raw_wine = datasets.load_wine()

In [3]:
# Inspect the feature names in the dataset (a bare expression, so the
# notebook displays the value as the cell output).
raw_wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

## 피처, 타깃 데이터 지정

In [4]:
# Feature matrix (X) and class labels (y) taken from the loaded dataset.
X, y = raw_wine.data, raw_wine.target

## 트레이닝, 테스트 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split
# Split into training (_tn) and test (_te) sets. No test_size is given, so
# the library default applies; random_state=1 fixes the shuffle so the split
# is reproducible.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state = 1)

## 데이터 표준화

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
std_scale = StandardScaler().fit(X_tn)
X_tn_std, X_te_std = (std_scale.transform(split) for split in (X_tn, X_te))

# 의사결정나무 분류모델


## 모델링
### 데이터 학습

In [8]:
from sklearn import tree          # import the tree-model module
# Decision-tree classifier; random_state=0 fixes the estimator's randomness
# so the fitted tree is reproducible.
clf_tree = tree.DecisionTreeClassifier(random_state = 0)
# Train on the standardized training features. fit() is the last expression,
# so the notebook displays the returned estimator as the cell output.
clf_tree.fit(X_tn_std, y_tn)

DecisionTreeClassifier(random_state=0)

### 데이터 예측

In [9]:
# Predict class labels for the standardized test features and print them.
tree_pred = clf_tree.predict(X_te_std)
print(tree_pred)

[2 1 0 1 0 2 1 0 2 1 0 0 1 0 1 1 2 0 1 0 0 1 2 1 0 2 0 0 0 2 1 2 2 0 1 1 2
 0 1 0 0 1 2 0 0]


## 정확도 평가

먼저 매크로 평균(macro average) F1 스코어를 확인해 본다.

In [11]:
from sklearn.metrics import f1_score

# Macro-averaged F1: the F1 score is computed per class and the per-class
# scores are averaged with equal weight, regardless of class size.
f1 = f1_score(y_te, tree_pred, average = 'macro')
f1

0.9542846417846418

재현율(recall)과 정확도(accuracy)도 함께 확인해 본다.

In [12]:
from sklearn.metrics import recall_score

# Macro-averaged recall: per-class recall averaged with equal weight per class.
recall = recall_score(y_te, tree_pred, average = 'macro')
recall

0.9607843137254902

In [13]:
from sklearn.metrics import accuracy_score

# Accuracy: fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_te, tree_pred)
accuracy

0.9555555555555556

정확도가 약 0.956(45개 중 43개 정답)이므로, 테스트 데이터 2개가 잘못 분류된 것으로 보인다.

### confusion matrix 확인

confusion matrix를 통해 예측값과 실제값의 일치 정도를 확인해본다.

In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix: rows are the true classes, columns are the predicted
# classes, so off-diagonal entries count misclassified test samples.
conf_matrix = confusion_matrix(y_te, tree_pred)
conf_matrix

array([[18,  0,  0],
       [ 1, 15,  1],
       [ 0,  0, 10]])

### 분류 리포트 확인


In [15]:
from sklearn.metrics import classification_report

# Text report of per-class precision, recall, F1 score and support,
# followed by the overall accuracy and the macro/weighted averages.
class_rep = classification_report(y_te, tree_pred)
print(class_rep)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        18
           1       1.00      0.88      0.94        17
           2       0.91      1.00      0.95        10

    accuracy                           0.96        45
   macro avg       0.95      0.96      0.95        45
weighted avg       0.96      0.96      0.95        45

