<a href="https://colab.research.google.com/github/6kitty/IP01069/blob/main/IP01069_6_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **교차 검증과 그리드 서치**

# **검증 세트**

In [26]:
# Load the wine dataset into a DataFrame, then convert it to NumPy arrays.
import pandas as pd

wine = pd.read_csv('https://bit.ly/wine-date')

# Three feature columns form the model input; the 'class' column is the label.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the split
# reproducible across runs.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

print(train_input.shape, test_input.shape)

(5197, 3) (1300, 3)


In [28]:
# Split the training set once more: 20% becomes the validation set and the
# remainder (sub_*) is what the model is actually fitted on.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [29]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default hyperparameters on the reduced training subset.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Print accuracy on the training subset, the validation set and the test set,
# in that order.
# NOTE(review): the test set is scored here before any model selection has
# happened; presumably this is for illustration only -- confirm it is not used
# to pick hyperparameters.
for features, labels in (
    (sub_input, sub_target),
    (val_input, val_target),
    (test_input, test_target),
):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077
0.8569230769230769


# **교차 검증**

In [30]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the full training set with the default splitter.
# The result is a dict holding per-fold fit times, score times and test scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.01259899, 0.01208425, 0.01258898, 0.0122261 , 0.01191473]), 'score_time': array([0.0020535 , 0.00159359, 0.00161195, 0.00155663, 0.00154996]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [31]:
import numpy as np

# Average the per-fold validation scores into a single cross-validation score.
fold_scores = scores['test_score']
print(np.mean(fold_scores))

0.855300214703487


In [32]:
from sklearn.model_selection import StratifiedKFold

# Build the splitter inside this cell so that it runs on a fresh kernel.
# Previously `splitter` was referenced here without being defined anywhere
# above, so Restart & Run All failed with NameError at this point.
# Configuration: 10 stratified folds, shuffled before splitting, fixed seed.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


In [33]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

# 10-fold stratified cross-validation with shuffling; the seed fixes the fold
# assignment so the mean score is reproducible.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(dt, train_input, train_target, cv=splitter)

fold_scores = scores['test_score']
print(np.mean(fold_scores))

0.8574181117533719


# **하이퍼파라미터 튜닝**

In [34]:
from sklearn.model_selection import GridSearchCV

# Candidate values for a single hyperparameter; the grid search cross-validates
# every value and keeps the best one.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# n_jobs=-1 asks scikit-learn to run the fits in parallel on all available cores.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [35]:
# Take the best model found by the search and score it on the training set.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [36]:
# Show the hyperparameter setting selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [37]:
# Mean cross-validation score of each candidate tried by the grid search
# (one entry per candidate).
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [38]:
# Recover the best parameters by hand: locate the candidate with the highest
# mean cross-validation score and look up its parameter dict.
mean_scores = gs.cv_results_['mean_test_score']
best_index = np.argmax(mean_scores)
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [39]:
# Search grid over three hyperparameters at once. Every combination of the
# values below is evaluated, so the number of fits grows multiplicatively.
params = {
    # Floats from 0.0001 in steps of 0.0001; the stop value 0.001 is excluded
    # (subject to floating-point rounding in np.arange).
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    # Integers 5 through 19.
    'max_depth': range(5, 20, 1),
    # Integers 2, 12, 22, ..., 92.
    'min_samples_split': range(2, 100, 10),
}

In [40]:
# Run the exhaustive search over the multi-parameter grid, with the fits
# parallelised across all available cores (n_jobs=-1).
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [41]:
# Show the best combination found by the multi-parameter grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [42]:
# Highest mean cross-validation score among all candidates in the search.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


# **랜덤 서치**

In [44]:
from scipy.stats import randint, uniform

# Frozen discrete distribution over the integers 0..9 (the upper bound 10 is
# exclusive). No seed is set, so each run yields different draws.
rgen = randint(0, 10)
rgen.rvs(size=10)

array([9, 5, 3, 6, 1, 7, 8, 0, 7, 7])

In [46]:
# Tally how often each integer appears in 1,000 draws from `rgen`.
# Jupyter only echoes a cell's final expression, so this intermediate result
# is printed explicitly; as a bare expression it was computed and discarded.
print(np.unique(rgen.rvs(1000), return_counts=True))

# Frozen continuous uniform distribution on [0, 1]; draw ten samples from it.
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.26294996, 0.24733795, 0.74042599, 0.56217603, 0.42424762,
       0.56452745, 0.081195  , 0.34804406, 0.80453189, 0.78911697])

In [47]:
from sklearn.model_selection import RandomizedSearchCV

# Sampling distributions for the random search. Instead of a fixed grid, each
# iteration draws one value per hyperparameter from these distributions.
params = {
    # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. 0.0001..0.0011.
    'min_impurity_decrease': uniform(0.0001, 0.001),
    # randint(low, high) yields integers with `high` excluded.
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

# Evaluate 100 sampled parameter settings; random_state fixes both the sampled
# settings and the tree construction so the search is reproducible.
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

In [48]:
# Show the best hyperparameter setting sampled by the random search.
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [49]:
# Highest mean cross-validation score among the sampled settings.
print(np.max(gs.cv_results_['mean_test_score']))

0.8695428296438884


In [50]:
# Final evaluation: score the best model from the random search on the
# held-out test set.
dt = gs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


# **확인문제**
위의 RandomizedSearchCV 실습에서 DecisionTreeClassifier 클래스에 splitter='random' 매개변수를 추가하고 다시 훈련해 보세요. splitter 매개변수의 기본값은 'best'로 각 노드에서 최선의 분할을 찾습니다. 'random'이면 무작위로 분할한 다음 가장 좋은 것을 고릅니다. 테스트 세트의 성능이 어떻게 변했는지 확인하시오.

In [51]:
from sklearn.model_selection import RandomizedSearchCV

# Same sampling distributions as the earlier random search.
params = {
    # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. 0.0001..0.0011.
    'min_impurity_decrease': uniform(0.0001, 0.001),
    # randint(low, high) yields integers with `high` excluded.
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

# Exercise variant: the tree now uses splitter="random" instead of the default
# splitter; everything else about the search is unchanged.
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42, splitter="random"),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

In [52]:
# Best sampled setting and its mean cross-validation score for the
# random-splitter search.
print(gs.best_params_)
mean_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_scores))

# Score the selected model on the held-out test set for comparison with the
# default-splitter result.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077
