# Model Selection 모듈

- Train/Test 데이터를 분리하지 않고 머신러닝 수행

In [7]:
from sklearn.datasets import load_iris 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [17]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
dt_clf.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [18]:
pred = dt_clf.predict(iris.data)
#                        x값
accuracy_score(iris.target, pred)
#정확도         y값        예측값

#정확도 100% = 과적합 = 매우 안좋음

1.0

- cross_validate 메쏘드

In [21]:
from sklearn.model_selection import cross_validate

In [22]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target)
#             estimator    X      y

{'fit_time': array([0.0016799 , 0.00071001, 0.00088882, 0.00064206, 0.00061727]),
 'score_time': array([0.00038314, 0.00043607, 0.00043416, 0.00026894, 0.00041175]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])}

In [23]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target, return_train_score = True)
#https://zngsup.tistory.com/48

{'fit_time': array([0.00160003, 0.0016489 , 0.00103807, 0.00174069, 0.0008049 ]),
 'score_time': array([0.00053406, 0.00044107, 0.00034094, 0.00057697, 0.0005312 ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

- Train/Test 데이터 셋을 분리

In [26]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=1
#데이터(x,독립변수,feature) / 결과값(y,종속변수,target) / 전체데이터에서 학습데이터의 비율
)

In [31]:
import pandas as pd 
pd.Series(y_test).value_counts()
# 갯수의 균형이 맞지 않는다.

2    43
0    40
1    37
dtype: int64

- Stratified 분리

In [36]:
train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=1,
    stratify=iris.target
)

pd.Series(y_test).value_counts()

# 갯수가 맞아짐
# 불균형한 분포를 가진 데이터 집합을 균형하게 나눠줌

2    40
1    40
0    40
dtype: int64

- cross_val_score() method

In [37]:
from sklearn.model_selection import cross_val_score

In [39]:
# 성능 지표는 정확도(accuracy), 교차검증 세트는 5개
cross_val_score(dtc, iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

In [40]:
import numpy as np
scores = cross_val_score(dtc, iris.data, iris.target, cv=5)
np.mean(scores) #cross_val_score 평균

0.9600000000000002

### GridSearchCV - 가장 활용도가 높고, 많이 사용되고, 중요 !! - 암기하기
- 교차 검증과 최적 하이퍼 파라메터 튜닝을 한꺼번에 수행

In [44]:
dtc = DecisionTreeClassifier(random_state=1)

In [48]:
# parameter 를 dictionary 형태로 설정
params = {
    'max_depth' : [2,3,4,5],
    'min_samples_split' : [2,3]
}

In [49]:
from sklearn.model_selection import GridSearchCV
grid_dtc = GridSearchCV(dtc, param_grid= params, cv=3)

In [50]:
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=1),
             param_grid={'max_depth': [2, 3, 4, 5],
                         'min_samples_split': [2, 3]})

In [51]:
grid_dtc.cv_results_

{'mean_fit_time': array([0.00160042, 0.00174864, 0.00137703, 0.0009733 , 0.00108822,
        0.00071565, 0.00068831, 0.00104125]),
 'std_fit_time': array([1.46383355e-04, 2.65704596e-04, 8.15299359e-05, 9.68230606e-05,
        1.16383818e-04, 3.34949585e-05, 2.07875845e-05, 4.17980039e-04]),
 'mean_score_time': array([0.00052134, 0.00069626, 0.00074776, 0.00033426, 0.00039697,
        0.00030271, 0.00026464, 0.00043805]),
 'std_score_time': array([9.55836177e-05, 9.25247048e-05, 2.44101066e-04, 4.40324111e-05,
        6.74360815e-05, 4.94385068e-05, 1.10824041e-05, 2.27742067e-04]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4, 5, 5],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ma

In [53]:
## 그래서 여기서 중요한 것

# 최적 파라미터
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [56]:
# 최고 정확도
grid_dtc.best_score_

0.7666666666666666

In [60]:
# 최고 정확도를 가지는 최적 파라미터로 학습한 estimator
best_estimator = grid_dtc.best_estimator_
best_estimator.score(X_test, y_test)

0.95