<a href="https://colab.research.google.com/github/ChaeYun430/MachineLearningStudy25/blob/master/scikit_learn/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import load_iris

# === Load the iris dataset ===
# Keep the column/class names and the raw arrays under separate names so the
# later cells can index features and labels directly.
iris = load_iris()
iris_feature, iris_label = iris.feature_names, iris.target_names
iris_feature_data, iris_label_data = iris.data, iris.target

In [None]:
# 과적합(Overfitting)
# 모델이 학습 데이터에만 과도하게 최적화되어, 실제 예측을 다른 데이터로 수행할 경우에는 예측 성능이 과도하게 떨어지는 것

# 교차 검증
# 데이터 편중을 막기 위해 별도의 여러 세트로 구성된 학습 데이터 세트와 검증 데이터 세트에서 학습과 평가를 수행하는 것

In [None]:
# ===== K-fold cross-validation =====
# Build K folds and repeat training + validation K times, each iteration
# holding out a different fold as the validation set.
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

kfold = KFold(n_splits=5)
cv_accuracy = []  # per-fold accuracy, averaged after the loop
print(iris_feature_data.shape[0])

# enumerate(start=1) gives the 1-based fold number used in the printout.
for n_iter, (train_index, test_index) in enumerate(kfold.split(iris_feature_data), start=1):
  X_train, X_test = iris_feature_data[train_index], iris_feature_data[test_index]
  y_train, y_test = iris_label_data[train_index], iris_label_data[test_index]

  # A fresh classifier per fold so no fitted state carries over between folds.
  dt_clf = DecisionTreeClassifier()
  dt_clf.fit(X_train, y_train)
  pred = dt_clf.predict(X_test)

  accuracy = np.round(accuracy_score(y_test, pred), 4)
  train_size = X_train.shape[0]
  # Fixed: the validation size must come from X_test, not X_train.
  test_size = X_test.shape[0]
  print(n_iter, accuracy, train_size, test_size, test_index)
  cv_accuracy.append(accuracy)

# Mean accuracy over the K folds.
print(np.mean(cv_accuracy))

In [None]:
# ===== Label (target class) distribution inside each training fold =====
# Prints the training-label counts per fold, first for plain KFold and then
# for StratifiedKFold, so the two splitting strategies can be compared.
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd

iris_df = pd.DataFrame(iris_feature_data, columns=iris_feature)
iris_df['label'] = iris_label_data

kfold, skfold = KFold(n_splits=3), StratifiedKFold(n_splits=3)

# Both generators are consumed in order: every KFold split first, then every
# StratifiedKFold split (which also needs the labels to stratify on).
for fold_splits in (kfold.split(iris_df), skfold.split(iris_df, iris_df['label'])):
  for train_index, test_index in fold_splits:
    y_train = iris_df.iloc[train_index]['label']
    y_test = iris_df.iloc[test_index]['label']
    print(y_train.value_counts())

In [None]:
# ===== Stratified K-fold =====
# Takes the label distribution of the original data into account and splits
# the training / validation sets so that each keeps that same distribution.
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

skf = StratifiedKFold(n_splits=5)
# Fixed: start from an empty list so the mean below covers only these folds
# instead of also including accuracies appended by an earlier cell.
cv_accuracy = []

# split() needs the labels as well, because the folds are stratified on them.
# enumerate(start=1) gives the 1-based fold number used in the printout.
for n_iter, (train_index, test_index) in enumerate(skf.split(iris_feature_data, iris_label_data), start=1):
  X_train, X_test = iris_feature_data[train_index], iris_feature_data[test_index]
  y_train, y_test = iris_label_data[train_index], iris_label_data[test_index]

  # A fresh classifier per fold so no fitted state carries over between folds.
  dt_clf = DecisionTreeClassifier()
  dt_clf.fit(X_train, y_train)
  pred = dt_clf.predict(X_test)

  accuracy = np.round(accuracy_score(y_test, pred), 4)
  train_size = X_train.shape[0]
  # Fixed: the validation size must come from X_test, not X_train.
  test_size = X_test.shape[0]
  print(n_iter, accuracy, train_size, test_size, test_index)

  cv_accuracy.append(accuracy)

# Mean accuracy over the stratified folds.
print(np.mean(cv_accuracy))

In [None]:
# ===== Cross-validation helper API =====
# cross_val_score(), cross_validate()
# scoring : metric used to evaluate predictions
# cv      : number of cross-validation folds
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
import numpy as np

scores = cross_val_score(
    estimator=DecisionTreeClassifier(),
    X=iris_feature_data,
    y=iris_label_data,
    scoring='accuracy',
    cv=3,
)

# First the per-fold accuracies, then their mean, both rounded to 4 decimals.
for value in (scores, np.mean(scores)):
  print(np.round(value, 4))

In [None]:
# ====== Finding the best hyperparameters with cross-validation =====
# GridSearchCV is given a grid of hyperparameter values, evaluates the
# combinations with cross-validation, and exposes the best one found.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Imported here so this cell does not rely on an earlier cell having run.
from sklearn.metrics import accuracy_score
import pandas as pd

# Hold out 20% of the data; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(iris_feature_data, iris_label_data, test_size=0.2)

# 2 values x 3 values = 6 candidate combinations, each scored with 3-fold CV.
hyper_params = {'min_samples_split': [2, 3], 'max_depth': [1, 2, 3]}
# refit=True: after the search, the best combination is retrained so that
# best_estimator_ can be used for prediction directly.
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=hyper_params, cv=3, refit=True)
print(type(grid_search))

grid_search.fit(X_train, y_train)
# cv_results_ is a dict of per-candidate results; a DataFrame makes it readable.
scores_dict = grid_search.cv_results_
scores_df = pd.DataFrame(scores_dict)
display(scores_df)  # `display` is provided by the notebook (IPython) environment
print(grid_search.best_params_, grid_search.best_score_)

# Evaluate the refitted best model on the held-out test split.
estimator = grid_search.best_estimator_
pred = estimator.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(accuracy)