In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Apply the ggplot style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')
# Switch matplotlib to the D2coding font so Korean (Hangul) text can be rendered.
mpl.rcParams.update({'font.family': 'D2coding'})

In [2]:
# Download the wine dataset (CSV hosted on GitHub) into a DataFrame.
WINE_CSV_URL = 'https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv'
wine = pd.read_csv(WINE_CSV_URL)

In [4]:
# Feature matrix: the alcohol, sugar and pH columns as a NumPy array.
feature_columns = ['alcohol', 'sugar', 'pH']
wine_input = wine[feature_columns].to_numpy()
# Label vector: the 'class' column as a NumPy array.
wine_target = wine['class'].to_numpy()

## 훈련세트와 테스트세트 분리

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set.
# A fixed random_state keeps the split the same on every run.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    wine_input,
    wine_target,
    test_size=0.2,
    random_state=42,
)

## 훈련세트에서 검증세트 분리

In [11]:
# Split the training set once more:
#   sub_* -> data actually used to fit the model
#   val_* -> validation data (20% of the training set)
(sub_input, val_input,
 sub_target, val_target) = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [12]:
# (rows, columns) of the fitting subset and of the validation set.
tuple(part.shape for part in (sub_input, val_input))

((4157, 3), (1040, 3))

## 모델 구축(DecisionTreeClassifier)

In [13]:
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Score on the data the tree was fitted on, then on the held-out validation data.
train_accuracy = dt.score(sub_input, sub_target)
val_accuracy = dt.score(val_input, val_target)
print(train_accuracy, val_accuracy, sep='\n')

0.9971133028626413
0.864423076923077


In [14]:
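# Overfitting: the tree scores about 0.997 on the data it was fitted on but only about 0.864 on the validation set.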

## 교차 검증(k-fold cross validation)

In [22]:
from sklearn.model_selection import cross_validate

# k-fold cross-validation (typically 5-fold or 10-fold).
# The estimator to evaluate is passed as an argument, together with the
# *whole* training set: no validation set is split off by hand beforehand.
# General form: cross_validate(estimator, X, y, cv=<number of folds or splitter>)
scores = cross_validate(estimator=dt, X=train_input, y=train_target)  # cv not given -> default of 5 folds
scores

# Keys of the returned dict (one array entry per fold):
#   fit_time   : time spent fitting
#   score_time : time spent scoring
#   test_score : score on the held-out fold

{'fit_time': array([0.00698185, 0.00598407, 0.00698018, 0.00498629, 0.00498581]),
 'score_time': array([0.        , 0.        , 0.00099897, 0.00099564, 0.        ]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [26]:
# Average the per-fold scores into a single cross-validation score.
scores['test_score'].mean()

# NOTE(review): the original note here read "overfitting solved / underfitting
# suspected". Cross-validation only averages the held-out score over several
# folds; the mean (about 0.855) is still far below the roughly 0.997 score on
# the fitting data, so the overfitting does not appear to be resolved.
# TODO confirm the intended conclusion.

0.855300214703487

In [27]:
from sklearn.model_selection import StratifiedKFold

# Instead of relying on the default, hand cross_validate an explicit splitter via `cv`.
stratified_folds = StratifiedKFold()  # n_splits not given -> default of 5 folds
scores = cross_validate(dt, train_input, train_target, cv=stratified_folds)
# The mean equals the default-cv run; presumably cross_validate already uses
# stratified folds for classifiers - verify against the scikit-learn docs.
scores['test_score'].mean()

0.855300214703487

In [32]:
# For 10-fold cross-validation, configure a splitter object and pass it via `cv`.
# shuffle=True shuffles the samples before they are divided into folds;
# random_state=42 keeps that shuffle the same on every run.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
# The fold count now comes from the splitter (10), not from the cv default of 5.
scores = cross_validate(dt, train_input, train_target, cv=splitter)
scores['test_score'].mean()

0.8574181117533719