In [1]:
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# 연습용
x = np.array([
    [1,2], [3,4], [1,2], [3,4]
])

y = np.array([1, 2, 3, 4])

In [3]:
x

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [4]:
y

array([1, 2, 3, 4])

In [5]:
kf = KFold(n_splits=2)
print(kf.get_n_splits(x))
print(kf)

2
KFold(n_splits=2, random_state=None, shuffle=False)


In [6]:
for train_idx, test_idx in kf.split(x):
    print('--- idx')
    print(train_idx, test_idx)
    print('--- train data')
    print(x[train_idx])
    print('--- validation data')
    print(x[test_idx])

--- idx
[2 3] [0 1]
--- train data
[[1 2]
 [3 4]]
--- validation data
[[1 2]
 [3 4]]
--- idx
[0 1] [2 3]
--- train data
[[1 2]
 [3 4]]
--- validation data
[[1 2]
 [3 4]]


In [7]:
import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1.
white_wine['color'] = 0.

# concat으로 두 데이터를 합쳐줌
wine = pd.concat([red_wine, white_wine])

In [8]:
wine['taste'] = [1. if grade>5 else 0. for grade in wine['quality']]

x = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=13)

# 학습시킴
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(x_train, y_train)

y_pred_tr = wine_tree.predict(x_train)
y_pred_test = wine_tree.predict(x_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))

Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461


In [10]:
from sklearn.model_selection import KFold

kFold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

In [11]:
for train_idx, test_idx in kFold.split(x):
    print(len(train_idx), len(test_idx))

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [12]:
cv_acc = []

for train_idx, test_idx in kFold.split(x):
    x_train = x.iloc[train_idx]
    x_test = x.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    wine_tree_cv.fit(x_train, y_train)
    pred = wine_tree_cv.predict(x_test)
    cv_acc.append(accuracy_score(y_test, pred))

cv_acc

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [13]:
# 평균 확인
np.mean(cv_acc)

0.709578255462782

In [14]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cv_acc = []

for train_idx, test_idx in skfold.split(x, y):
    x_train = x.iloc[train_idx]
    x_test = x.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    wine_tree_cv.fit(x_train, y_train)
    pred = wine_tree_cv.predict(x_test)
    cv_acc.append(accuracy_score(y_test, pred))

cv_acc

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [15]:
np.mean(cv_acc)

0.6888004974240539

In [16]:
# 위 코드 간략화함
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cross_val_score(wine_tree, x, y, cv=skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [17]:
def skfold_dt(depth):
    # 위 코드 간략화함
    from sklearn.model_selection import cross_val_score

    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

    print(cross_val_score(wine_tree, x, y, cv=skfold))

In [18]:
skfold_dt(5)

[0.55230769 0.68846154 0.71439569 0.73210162 0.75673595]


In [19]:
from sklearn.model_selection import cross_validate

cross_validate(wine_tree_cv, x, y, cv=skfold, return_train_score=True)

{'fit_time': array([0.00440311, 0.00420499, 0.00407577, 0.00429893, 0.00430608]),
 'score_time': array([0.00060797, 0.00063896, 0.00064301, 0.00057817, 0.00062299]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}

-------------------------------------------------

# 1. 하이퍼파라미터 튜닝

In [20]:
import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1.
white_wine['color'] = 0.

# concat으로 두 데이터를 합쳐줌
wine = pd.concat([red_wine, white_wine])

wine['taste'] = [1. if grade>5 else 0. for grade in wine['quality']]

x = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

params = {'max_depth': [2, 4, 7, 10]}

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

In [24]:
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
gridsearch.fit(x, y)

In [25]:
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.00603261, 0.00785718, 0.01221943, 0.01671839]),
    'mean_score_time': array([0.00093584, 0.00066357, 0.00064354, 0.00066004]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4], dtype=int32),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.7251732

In [26]:
# 최고의 성능을 가진 모델은? max_depth=2 때
gridsearch.best_estimator_

In [27]:
# 최고의 스코어는?
gridsearch.best_score_

0.6888004974240539

In [28]:
# 최고의 파라미터는?
gridsearch.best_params_

{'max_depth': 2}

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# 파이프 선언(생성) 언더바 2개 주의
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
]

pipe = Pipeline(estimators)

In [30]:
param_grid = [{'clf__max_depth': [2, 4, 7, 10]}]
gridsearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
gridsearch.fit(x, y)

In [31]:
# 최고의 성능을 가진 모델은? max_depth=2 때
gridsearch.best_estimator_

In [32]:
# 최고의 스코어는?
gridsearch.best_score_

0.6888004974240539

In [33]:
import pandas as pd
# 표로 성능 결과를 정리
score_df = pd.DataFrame(gridsearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.656485,0.081111
3,{'clf__max_depth': 10},4,0.643863,0.082879
