In [1]:
# Load the wine dataset from the remote CSV and print a summary of its columns.
import pandas as pd

WINE_CSV_URL = 'https://bit.ly/wine_csv_data'

wine = pd.read_csv(WINE_CSV_URL)
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [2]:
# Define the model inputs (alcohol, sugar, pH columns) and the target
# (class column) as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [3]:
# Split the data into a training set (70%) and a test set (30%), then split
# the training set again into sub_input (80%) and val_input (20%).

from sklearn.model_selection import train_test_split

# train/test split
outer_split = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,
)
train_input, test_input, train_target, test_target = outer_split
print(train_input.shape, test_input.shape)

# sub/val split, taken from the training portion only
inner_split = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)
sub_input, val_input, sub_target, val_target = inner_split
print(sub_input.shape, val_input.shape)

(4547, 3) (1950, 3)
(3637, 3) (910, 3)


In [4]:
# Create a decision tree, cross-validate it on the sub-training set with
# cross_validate's default fold setting, and print the per-fold test scores.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

dt = DecisionTreeClassifier(random_state=42)
scores = cross_validate(estimator=dt, X=sub_input, y=sub_target)
print(scores['test_score'])

[0.85027473 0.87087912 0.86519945 0.82530949 0.85419532]


In [5]:
# Run cross-validation again with the number of folds set to 15 and print
# the per-fold test scores.
n_folds = 15

dt = DecisionTreeClassifier(random_state=42)
scores = cross_validate(estimator=dt, X=sub_input, y=sub_target, cv=n_folds)
print(scores['test_score'])

[0.85185185 0.87242798 0.85185185 0.84773663 0.85596708 0.83127572
 0.90123457 0.87190083 0.84297521 0.85123967 0.8553719  0.85123967
 0.87190083 0.85123967 0.84297521]


In [6]:
# Use grid search to find the best max_depth and min_samples_leaf values.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search:
#   max_depth        -> 2, 3, ..., 20
#   min_samples_leaf -> 20, 22, ..., 30
params = {
    'max_depth': range(2, 21),
    'min_samples_leaf': range(20, 31, 2),
}

dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(estimator=dt, param_grid=params, cv=5, n_jobs=-1)
gs.fit(sub_input, sub_target)

print(gs.best_estimator_)
print(gs.best_params_)


DecisionTreeClassifier(max_depth=8, min_samples_leaf=20, random_state=42)
{'max_depth': 8, 'min_samples_leaf': 20}


In [8]:
# Use randomized search, sampling 50 hyperparameter combinations, then report
# the best parameters, the best cross-validation score, and the test-set score.
from scipy.stats import randint, uniform  # NOTE: uniform is not used in this cell
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# scipy.stats.randint(low, high) excludes `high`, so these draw
# max_depth from 2..20 and min_samples_leaf from 20..30.
params = {
    'max_depth': randint(2, 21),
    'min_samples_leaf': randint(20, 31),
}
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter=50,
    n_jobs=-1,
    random_state=42,
    cv=5,
)

gs.fit(sub_input, sub_target)

print(gs.best_params_)
# best_score_ is the mean cross-validated score of the selected candidate, so
# it always corresponds to best_params_ (no need to scan cv_results_ by hand).
print(gs.best_score_)

# Score the search's best estimator on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 8, 'min_samples_leaf': 23}
0.858953701044485
0.8533333333333334
