<a href="https://colab.research.google.com/github/ChoHyeonJun86/machine-learning-prac/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [1]:
import pandas as pd

# Load the wine dataset into a DataFrame from a shortened URL (needs network access).
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [3]:
# Show the first 5 rows of wine.
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [4]:
# Print the shape of wine; the first element is the total number of rows.
print(wine.shape)

(6497, 4)


In [58]:
# Summary statistics of wine (count, mean, std, min, quartiles, max for each column).
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [56]:
# Count the samples in each class (white wine vs. red wine).
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [5]:
# Feature matrix: the three input columns, converted to a NumPy array.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()

# Label vector taken from the 'class' column.
target = wine['class'].to_numpy()

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed makes the split repeatable.
outer_split = train_test_split(data, target, test_size=0.2, random_state=42)
train_input, test_input, train_target, test_target = outer_split

In [7]:
# Split the training portion again 4:1 into a sub-training set and a validation set.
inner_split = train_test_split(train_input, train_target, test_size=0.2, random_state=42)
sub_input, val_input, sub_target, val_target = inner_split

In [8]:
# Print the shapes of the sub-training and validation feature arrays.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [9]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default settings on the sub-training set.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Print accuracy on the sub-training set first, then on the validation set;
# a large gap between the two numbers indicates overfitting.
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077


## 교차 검증

In [10]:
from sklearn.model_selection import cross_validate

# Cross-validate the decision tree on the whole training set; no cv argument is
# passed, so the library's default splitting is used.
scores = cross_validate(dt, train_input, train_target)  # returns a dict of per-fold results
print(scores)  # keys: 'fit_time', 'score_time', 'test_score'

{'fit_time': array([0.00876737, 0.00841522, 0.0123992 , 0.01199675, 0.00812364]), 'score_time': array([0.00124454, 0.00179243, 0.00174117, 0.00128651, 0.00121021]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [11]:
import numpy as np

# Average the per-fold validation scores into a single cross-validation score.
print(np.mean(scores['test_score']))

0.855300214703487


In [12]:
from sklearn.model_selection import StratifiedKFold

# Same cross-validation, but with the splitter passed explicitly as a default
# StratifiedKFold(); the averaged score is printed for comparison with the
# previous cell.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [13]:
# 10-fold stratified cross-validation with shuffling; random_state fixes the shuffle.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
# Mean validation score over the 10 folds.
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [14]:
from sklearn.model_selection import GridSearchCV  # grid search with cross-validation

# Five candidate values for min_impurity_decrease. With the default 5-fold
# cross-validation this means 5 x 5 = 25 model fits during the search.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [15]:
# Grid search over params for a decision tree; n_jobs=-1 uses all available CPU cores.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [16]:
# Run the search on the training set to find the best parameter value.
gs.fit(train_input, train_target)

In [17]:
# Take the estimator selected by the grid search and print its accuracy on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [18]:
# Print the parameter combination chosen by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [19]:
# Print the mean cross-validation score of each candidate, in the order of the params list.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [21]:
best_index = np.argmax(gs.cv_results_['mean_test_score'])  # index of the highest mean CV score
# Look up the parameter combination at that index; it should match gs.best_params_.
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [25]:
# Search grid for the decision tree. Both np.arange and range exclude their
# stop value, so the upper bounds written below are not part of the grid.
impurity_grid = np.arange(0.0001, 0.001, 0.0001)  # from 0.0001 up to (not including) 0.001, step 0.0001
depth_grid = range(5, 20, 1)                      # 5, 6, ..., 19
split_grid = range(2, 100, 10)                    # 2, 12, ..., 92

params = {
    'min_impurity_decrease': impurity_grid,
    'max_depth': depth_grid,
    'min_samples_split': split_grid,
}

In [26]:
# Exhaustive grid search over every combination in params, run on all CPU cores.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [27]:
print(gs.best_params_)  # the parameter combination with the best cross-validation score

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [28]:
# Print the highest mean cross-validation score found by the grid search.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [29]:
# Show the mean fit time recorded for each parameter combination during cross-validation.
gs.cv_results_['mean_fit_time']

array([0.00709491, 0.0067327 , 0.00674057, ..., 0.00813155, 0.02401209,
       0.02277398])

### 랜덤 서치

In [30]:
from scipy.stats import uniform, randint

In [35]:
# Discrete uniform sampling: randint(0, 10) yields integers from 0 to 9
# (the upper bound is excluded).
rgen = randint(0, 10)
# Draw ten samples and display them.
rgen.rvs(size=10)

array([0, 2, 5, 7, 8, 1, 0, 4, 9, 9])

In [39]:
# Draw 1000 samples and show each distinct value together with how often it occurred.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 91, 102, 100, 102, 111, 107, 115,  95,  91,  86]))

In [40]:
# Continuous uniform sampling on [0, 1] (arguments are loc=0 and scale=1).
ugen = uniform(0, 1)
# Draw ten samples and display them.
ugen.rvs(size=10)

array([0.26514474, 0.13975072, 0.0165501 , 0.33261147, 0.92177832,
       0.64203099, 0.66976171, 0.12532159, 0.03636383, 0.89453033])

In [42]:
# Sampling distributions for the randomized search. Unlike the grid search,
# no fixed candidate values are listed; each value is drawn at random.
# NOTE: scipy's uniform takes (loc, scale) and samples from [loc, loc + scale],
# so the impurity values below come from [0.0001, 0.0011], not [0.0001, 0.001].
impurity_dist = uniform(0.0001, 0.001)
depth_dist = randint(20, 50)        # integers 20..49 (upper bound excluded)
split_dist = randint(2, 25)         # integers 2..24
leaf_dist = randint(1, 25)          # integers 1..24

params = {
    'min_impurity_decrease': impurity_dist,
    'max_depth': depth_dist,
    'min_samples_split': split_dist,
    'min_samples_leaf': leaf_dist,
}

In [43]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: draw 100 parameter settings (n_iter=100) from the
# distributions in params and cross-validate each on the training set,
# using all CPU cores. random_state fixes the sampling.
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

In [44]:
print(rs.best_params_)  # print the best parameter values found

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [45]:
# Print the highest mean cross-validation score among the sampled candidates.
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [46]:
# Take the best estimator from the randomized search.
dt = rs.best_estimator_

# Print its accuracy on the held-out test set.
print(dt.score(test_input, test_target))

0.86


In [47]:
rs.cv_results_['mean_fit_time']  # list the mean fit time of each sampled candidate

array([0.0071888 , 0.00726275, 0.00765214, 0.00766692, 0.00685763,
       0.00730457, 0.00677657, 0.00697722, 0.00815244, 0.00720463,
       0.00698991, 0.00785398, 0.00694909, 0.00733428, 0.00775118,
       0.00740895, 0.00734472, 0.00776758, 0.00864234, 0.00755925,
       0.00995464, 0.00681486, 0.00701675, 0.00769677, 0.00760508,
       0.00883322, 0.00724254, 0.00732327, 0.01076798, 0.00679164,
       0.00693622, 0.0074584 , 0.0065618 , 0.00755863, 0.00804458,
       0.0070262 , 0.00681224, 0.00849638, 0.00711489, 0.00673804,
       0.00689054, 0.00754848, 0.00683742, 0.00740414, 0.00712667,
       0.00671525, 0.00783067, 0.0069416 , 0.01092114, 0.01180978,
       0.0147892 , 0.0091289 , 0.00657544, 0.00701747, 0.00665359,
       0.00699048, 0.00934606, 0.00805082, 0.00667529, 0.00685863,
       0.00790277, 0.00690541, 0.00805802, 0.00765977, 0.00645671,
       0.00652509, 0.00772285, 0.00680013, 0.01085491, 0.00774937,
       0.00685716, 0.00804772, 0.01317973, 0.00674105, 0.00698

In [48]:
# Print the average of the mean fit times over all sampled candidates.
print(np.mean(rs.cv_results_['mean_fit_time']))

0.01025226068496704


### 결정트리 분할 옵션 변경

In [50]:
# Same randomized search as before, but the tree uses splitter='random'.
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,  # split nodes randomly
                        n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)

In [51]:
# Print the best parameters and the best mean cross-validation score of the second search.
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))

# Rebind dt to the best estimator of the second search and print its test-set accuracy.
dt = rs2.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [52]:
# Show the mean fit time of each sampled candidate in the second search.
rs2.cv_results_['mean_fit_time']

array([0.00384097, 0.00614185, 0.00831838, 0.00627599, 0.00769815,
       0.01094313, 0.00715194, 0.00534053, 0.00797668, 0.00726366,
       0.0042778 , 0.00783563, 0.00371342, 0.00318108, 0.00764399,
       0.00808406, 0.00730934, 0.00557528, 0.00755448, 0.00517468,
       0.00663266, 0.00348377, 0.00732541, 0.00661926, 0.00417156,
       0.00520782, 0.00415144, 0.00712743, 0.00668402, 0.00861449,
       0.00437284, 0.00546908, 0.00639367, 0.00862708, 0.00785031,
       0.00852489, 0.00460243, 0.00817366, 0.00641761, 0.00577965,
       0.00322957, 0.00344954, 0.00551968, 0.00376182, 0.00847297,
       0.00641222, 0.0064527 , 0.00737143, 0.00460849, 0.00724301,
       0.00650496, 0.00361037, 0.00542927, 0.00462265, 0.00311584,
       0.00809779, 0.00399604, 0.0032856 , 0.0033812 , 0.00819821,
       0.00512881, 0.00507574, 0.0052331 , 0.00668049, 0.00298443,
       0.00336895, 0.00278845, 0.0038919 , 0.00561657, 0.00619874,
       0.0052031 , 0.0039547 , 0.00410652, 0.00461268, 0.00609

In [54]:
# Print the average of the mean fit times over all candidates in the second search.
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.00569858121871948


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

- 평균 학습 시간이 단축됨.

- 50번째 셀에서 결정 트리의 분할 옵션을 변경하기 위해 `splitter='random'`을 추가한 것을 확인할 수 있음. 그 결과 평균 학습 시간이 단축됨 (약 0.0103초 → 약 0.0057초).

(테스트 점수는 기존 랜덤 서치 코드보다 낮다: 0.86 → 약 0.787.)