In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
iris = load_iris()
iris_data = iris.data
iris_label = iris.target
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size = 0.4, random_state = 121)
dt_clf = DecisionTreeClassifier(random_state = 11)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
print('{0:.4f}'.format(accuracy_score(y_test, pred)))

0.9667


In [6]:
# 잘못된 예시
iris = load_iris()
dt_clf = DecisionTreeClassifier()
train_data = iris.data  # 독립변수, feature, 
train_label = iris.target # 종속변수, label, class, target
dt_clf.fit(train_data, train_label)
pred = dt_clf.predict(train_data)
print('예측 정확도: ', accuracy_score(train_label, pred) )

예측 정확도:  1.0


In [31]:
from sklearn.model_selection import KFold
import numpy as np

In [32]:
iris = load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier()
# 5개의 폴드 세트로 분할
kfold = KFold(n_splits = 5)
# 평가 결과
cv_accuracy = []

In [58]:
# 폴드별 학습, 검증 row index 반환
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)
print(cv_accuracy)
cv_accuracy = []

[1.0, 1.0, 0.9, 0.9333, 0.8]


<hr>

## Stratified K 폴드

- 불균형한(imbalanced) 분포도를 가진 레이블(결정 class) 데이터 집합을 위한 K 폴드 방식
- 불균형한 분포도란? 레이블 데이터 집합은 특정 레이블 값이 특이하게 많거나 매우 적어서 값의 분포가 한쪽으로 치우치는 것 의미
- K 폴드가 레이블 데이터 집합이 원본 데이터 집합의 레이블 분포를 학습 및 테스트 세트에 제대로 분배하지 못하는 경우의 문제 해결
- 원리 : 원본 데이터의 레이블 분포를 먼저 고려 -> 이 분포와 동일하게 학습과 검증 데이터 세트를 분배

In [59]:
import pandas as pd

In [64]:
iris = load_iris()
iris_df = pd.DataFrame(data = iris.data, columns = iris.feature_names)
iris_df['label'] = iris.target

In [66]:
iris_df['label'].value_counts()

2    50
1    50
0    50
Name: label, dtype: int64

In [70]:
kfold = KFold(n_splits = 3)
for train_index, test_index in kfold.split(iris_df):
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(label_train.value_counts())
    print(label_test.value_counts())

2    50
1    50
Name: label, dtype: int64
0    50
Name: label, dtype: int64
2    50
0    50
Name: label, dtype: int64
1    50
Name: label, dtype: int64
1    50
0    50
Name: label, dtype: int64
2    50
Name: label, dtype: int64


In [92]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 3)
cv_accuracy = []
for train_index, test_index in skf.split(iris_df, iris_df['label']):
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)
print(cv_accuracy)
cv_accuracy = []

[0.7333, 0.8, 0.8333]


In [94]:
from sklearn.model_selection import cross_val_score, cross_validate
dt_clf = DecisionTreeClassifier(random_state = 100)
data = iris.data
label = iris.target
scores = cross_val_score(dt_clf, data, label, scoring = 'accuracy', cv = 5)
print(np.round(scores, 4))
print(np.round(np.mean(scores), 4))

[0.9667 0.9667 0.9    0.9333 1.    ]
0.9533


In [99]:
from sklearn.model_selection import cross_val_score, cross_validate
dt_clf = DecisionTreeClassifier(random_state = 100)
data = iris.data
label = iris.target
scores = cross_val_score(dt_clf, data, label, scoring = 'accuracy', cv = 4)
print(np.round(scores, 4))
print(np.round(np.mean(scores), 4))

[0.9737 0.9474 0.9459 0.973 ]
0.96


In [101]:
from sklearn.model_selection import GridSearchCV
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.2, random_state = 121)
dtree = DecisionTreeClassifier()
parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}
grid_dtree = GridSearchCV(dtree, param_grid = parameters, cv = 3, refit = True)
grid_dtree.fit(X_train, y_train)
score_df = pd.DataFrame(grid_dtree.cv_results_)
score_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95
