## Model Selection 모듈

- Train/Test 데이터를 분리하지 않고 머신러닝 수행

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
dt_clf.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [4]:
pred= dt_clf.predict(iris.data)
accuracy_score(iris.target,pred)

1.0

- cross_validate 메쏘드

In [5]:
from sklearn.model_selection import cross_validate

In [6]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target)

{'fit_time': array([0.00100231, 0.00099897, 0.        , 0.        , 0.        ]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])}

In [9]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target, return_train_score=True)

{'fit_time': array([0.00100017, 0.00099707, 0.00100279, 0.00054717, 0.        ]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

- Train/Test 데이터 셋을 분리

In [10]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=2021
)

In [16]:
import pandas as pd
pd.Series(y_test).value_counts()

0    14
1    10
2     6
dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=2021,
    stratify=iris.target
)
pd.Series(y_test).value_counts()

0    10
1    10
2    10
dtype: int64

- cross_val_score() method

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
# 성능 지표는 정확도(accuracy), 교차검증 세트는 5개
cross_val_score(dtc, iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

In [20]:
import numpy as np
scores = cross_val_score(dtc, iris.data, iris.target, cv=5)
np.mean(scores)

0.9666666666666668

### GridSearchCV 
- 교차 검증과 최적 하이퍼 파라미터 튜닝을 한번에

In [21]:
dtc = DecisionTreeClassifier(random_state=2021)

In [22]:
# parameter 를 Dictionary 형태로 설정
params = {
    'max_depth':[2,3,4,5],
    'min_samples_split':[2,3]
}

In [None]:
from sklearn.model_selection import GridSearchCV
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=3)