<a href="https://colab.research.google.com/github/Antique-1/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [8]:
import pandas as pd

# Wine dataset (alcohol, sugar, pH, class columns) downloaded from a short link;
# this cell needs network access.
WINE_CSV_URL = 'https://bit.ly/wine-date'
wine = pd.read_csv(WINE_CSV_URL)

### 문제 1 : wine 데이터 확인

In [9]:
# Preview the first five rows of the wine data (head() returns 5 rows by default).
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [10]:
# Report the total number of rows in the wine data.
print(len(wine))

6497


In [11]:
# Summary statistics for every column of the wine data
# (count, mean, standard deviation, min, quartiles, max).
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [12]:
# Count how many rows carry each value of the 'class' column
# (the original note describes these as the white-wine and red-wine counts).
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [13]:
# Features: the three measurement columns; label: the 'class' column. Both as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all samples as the test set; the fixed seed keeps the split repeatable.
outer_split = train_test_split(data, target, test_size=0.2, random_state=42)
train_input, test_input, train_target, test_target = outer_split

In [15]:
# Split the training portion again: 20% of it becomes the validation set.
inner_split = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)
sub_input, val_input, sub_target, val_target = inner_split

In [16]:
# Show the shapes of the sub-training and validation feature arrays.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [17]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the sub-training set only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Print accuracy on the data the tree was fit on, then on the validation set.
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077


## 교차 검증

In [18]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set using cross_validate's default
# splitting, then print the raw result dict (fit times, score times, per-fold scores).
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00724936, 0.00722766, 0.00714803, 0.00667858, 0.00644064]), 'score_time': array([0.00100517, 0.00090694, 0.00081253, 0.00074935, 0.00081372]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [19]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(scores['test_score'].mean())

0.855300214703487


In [20]:
from sklearn.model_selection import StratifiedKFold

# Repeat the evaluation with an explicitly constructed StratifiedKFold splitter
# (default settings) and print the mean test score.
stratified_cv = StratifiedKFold()
scores = cross_validate(dt, train_input, train_target, cv=stratified_cv)
print(np.mean(scores['test_score']))

0.855300214703487


In [21]:
# 10-fold stratified cross-validation with seeded shuffling before splitting.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
fold_scores = scores['test_score']
print(np.mean(fold_scores))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [22]:
from sklearn.model_selection import GridSearchCV

# Candidate min_impurity_decrease values for the grid search: 0.0001 through 0.0005.
params = {'min_impurity_decrease': [step / 10000 for step in range(1, 6)]}

In [23]:
# Grid search over `params` using a seeded decision tree; n_jobs=-1 runs on all CPU cores.
base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(estimator=base_tree, param_grid=params, n_jobs=-1)

In [24]:
# Run the grid search on the full training set (cross-validation happens inside fit).
gs.fit(train_input, train_target)

In [25]:
# Take the best model found by the grid search and report its accuracy on the training set.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [26]:
# Show the parameter combination selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [27]:
# Mean cross-validation score for each candidate in the grid, in grid order.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [28]:
# Locate the highest mean score by hand and print the parameters stored at that position.
mean_scores = gs.cv_results_['mean_test_score']
best_index = mean_scores.argmax()
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [29]:
# Larger grid: impurity thresholds from 0.0001 in steps of 0.0001 (stop 0.001 excluded),
# tree depths 5..19, and minimum split sizes 2, 12, ..., 92.
# NOTE: np.arange with a float step can be sensitive to rounding at the end point.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [30]:
# Rebuild the grid search with the larger parameter grid and fit it on the training set.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [31]:
# Show the best parameter combination from the larger grid.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [32]:
# Highest mean cross-validation score reached by any candidate in the grid.
print(gs.cv_results_['mean_test_score'].max())

0.8683865773302731


In [33]:
# Display the mean fit time recorded for each candidate during cross-validation.
gs.cv_results_['mean_fit_time']

array([0.00546331, 0.00543985, 0.00539107, ..., 0.00559845, 0.00631938,
       0.0091639 ])

### 랜덤 서치

In [34]:
from scipy.stats import uniform, randint

In [35]:
# Discrete uniform distribution over the integers 0..9 (randint's upper bound 10 is exclusive).
rgen = randint(0, 10)
# Draw 10 samples. The fixed seed makes the displayed values repeatable on every run;
# without it the cell output changes each time the notebook is re-executed.
int_samples = rgen.rvs(10, random_state=42)
int_samples

array([6, 8, 7, 9, 3, 1, 1, 7, 0, 8])

In [36]:
# Draw 1000 samples and show each distinct value together with how often it occurred.
# The fixed seed keeps the displayed counts reproducible across runs.
np.unique(rgen.rvs(1000, random_state=42), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([103,  94, 120, 106,  87, 104, 107,  84,  90, 105]))

In [37]:
# Continuous uniform distribution on [0, 1] (scipy's uniform takes loc and scale).
ugen = uniform(0, 1)
# Draw 10 samples with a fixed seed so the displayed values are repeatable across runs.
float_samples = ugen.rvs(10, random_state=42)
float_samples

array([0.97973293, 0.77963758, 0.46139004, 0.91262369, 0.90249142,
       0.26161316, 0.42125247, 0.70097318, 0.12293211, 0.52620795])

In [38]:
# Sampling distributions for the randomized search.
# randint(low, high) draws integers in [low, high - 1].
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so the impurity
# threshold is drawn from [0.0001, 0.0011]; use scale=0.0009 if 0.001 was the intended
# upper bound -- confirm.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [39]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 100 seeded draws from the distributions in `params`,
# evaluated in parallel on all CPU cores.
rs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs.fit(train_input, train_target)

In [40]:
# Show the parameter combination selected by the randomized search.
print(rs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [41]:
# Highest mean cross-validation score among the sampled candidates.
print(rs.cv_results_['mean_test_score'].max())

0.8695428296438884


In [42]:
# Evaluate the best model from the randomized search on the held-out test set.
dt = rs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


In [43]:
# Display the mean fit time recorded for each sampled candidate.
rs.cv_results_['mean_fit_time']

array([0.00579276, 0.00563636, 0.00600638, 0.00616851, 0.0062665 ,
       0.00635934, 0.00620899, 0.00588169, 0.00592031, 0.00623589,
       0.0061131 , 0.00571961, 0.00622144, 0.00625625, 0.00644217,
       0.00798726, 0.00586047, 0.0059411 , 0.00663395, 0.00586138,
       0.00677738, 0.0055347 , 0.00581064, 0.00573859, 0.00651379,
       0.00620871, 0.00570488, 0.00571456, 0.00570664, 0.00542541,
       0.00539002, 0.00559425, 0.00540652, 0.006252  , 0.00637112,
       0.00567341, 0.00545363, 0.00645533, 0.00543966, 0.00585971,
       0.00561161, 0.00595937, 0.00714302, 0.00589352, 0.0057004 ,
       0.00594606, 0.00553946, 0.00556607, 0.00803542, 0.00558152,
       0.00564814, 0.00614443, 0.00541544, 0.00523114, 0.00571175,
       0.00579877, 0.00555019, 0.00615177, 0.00572925, 0.00581231,
       0.00642309, 0.00550489, 0.0066052 , 0.00555401, 0.00554872,
       0.00532212, 0.00530972, 0.00619335, 0.00591397, 0.00562339,
       0.00551496, 0.00643353, 0.00546861, 0.00556192, 0.00592

In [44]:
# Average of the per-candidate mean fit times for the randomized search.
print(rs.cv_results_['mean_fit_time'].mean())

0.005908131122589112


### 결정트리 분할 옵션 변경

In [50]:
# Same randomized search as `rs`, except the tree uses splitter='random'
# instead of the default split strategy.
random_split_tree = DecisionTreeClassifier(splitter='random', random_state=42)
rs2 = RandomizedSearchCV(
    estimator=random_split_tree,
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs2.fit(train_input, train_target)

In [46]:
# Report the selected parameters and the best mean cross-validation score ...
print(rs2.best_params_)
print(rs2.cv_results_['mean_test_score'].max())

# ... then score the best random-splitter tree on the held-out test set.
dt = rs2.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [47]:
# Display the mean fit time recorded for each candidate of the splitter='random' search.
rs2.cv_results_['mean_fit_time']

array([0.00265741, 0.00276523, 0.00265326, 0.00253754, 0.00242929,
       0.00255284, 0.00253596, 0.00281119, 0.00288696, 0.00255208,
       0.00272665, 0.00240579, 0.00258784, 0.00289712, 0.00252023,
       0.00256562, 0.0025579 , 0.00272098, 0.0042768 , 0.00257931,
       0.00275145, 0.00254426, 0.0025948 , 0.00257869, 0.00378013,
       0.00349913, 0.00251284, 0.00269003, 0.00272026, 0.00252914,
       0.00234962, 0.00268555, 0.00232902, 0.0028976 , 0.00402961,
       0.0025682 , 0.00248671, 0.00394349, 0.00314813, 0.00251594,
       0.00596032, 0.00270267, 0.00256205, 0.00392928, 0.004     ,
       0.00258703, 0.00267606, 0.00249081, 0.00255594, 0.00254154,
       0.00401082, 0.00248661, 0.00246511, 0.00230851, 0.00241756,
       0.00263028, 0.00261288, 0.00265579, 0.00254188, 0.00250201,
       0.00271692, 0.00246472, 0.00249224, 0.00254531, 0.00249515,
       0.00236731, 0.00237088, 0.00267763, 0.00249906, 0.00248237,
       0.0025022 , 0.00255547, 0.00252051, 0.00248976, 0.00259

In [48]:
# Average of the per-candidate mean fit times for the splitter='random' search.
print(rs2.cv_results_['mean_fit_time'].mean())

0.0027825565338134768


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

첫째, 기존 코드와 달리 DecisionTreeClassifier에 splitter='random' 매개변수를 추가하였다. 기본값인 splitter='best'는 각 노드에서 가장 좋은 분할을 찾지만, splitter='random'은 무작위로 뽑은 분할 후보 중에서 가장 좋은 것을 선택하여 노드를 분할한다.

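둘째, print(np.mean(rs2.cv_results_['mean_fit_time'])) 코드로 모델의 평균 학습 시간을 계산한 결과, 약 0.0028초로 기존(약 0.0059초)보다 빠르게 학습한다. 최적의 분할을 찾기 위해 모든 후보를 탐색하지 않고 무작위로 뽑은 분할 후보만 평가하기 때문이다. 다만 교차 검증 점수(약 0.846)와 테스트 점수(약 0.787)는 기존(약 0.870, 0.86)보다 낮아졌다.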