In [1]:
import pandas as pd

# Load the wine dataset directly from the remote CSV file.
wine = pd.read_csv('https://bit.ly/wine-date')

# Quick inspection of the frame. info() prints its summary itself; the
# frames returned by head() and describe() are not printed here.
wine.head()
wine.info()
wine.describe()

# Feature matrix from the three measurement columns; labels from 'class'.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()


# Rule of thumb: training set 60%, validation set (used for hyperparameter
# tuning) 20%, test set 20%.
# In production there is no target; the model can only predict.
#
# The more data available for training, the better.
# Find the best parameters first, then do the final training.
#
# NOTE: the two successive 80/20 splits below actually yield
# 64% sub-training / 16% validation / 20% test.

from sklearn.model_selection import train_test_split

# Hold out 20% of all samples as the test set.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Carve a validation set out of the remaining training portion.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default settings (fixed seed), fitted on
# the sub-training split only.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Score on the data the tree was fitted on, then on the validation split.
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))


import numpy as np
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training portion (sub + validation)
# using the default splitting strategy, and show the raw result dict.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

# Average the per-fold test scores into a single number.
print(np.mean(scores['test_score']))

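# General distinction: deep learning (DL) usually relies on a fixed
# training/validation split, whereas classical machine learning (ML)
# usually relies on cross-validation.

# Tools for loading data are well developed: TensorFlow, Keras
# Tools for loading and validating data are well developed: scikit-learn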

# Cross-validation driven by an explicit splitter object.
from sklearn.model_selection import StratifiedKFold

# First with a StratifiedKFold left at its default settings.
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=StratifiedKFold(),
)
print(np.mean(scores['test_score']))

# Then with 10 shuffled folds and a fixed seed for reproducibility.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=splitter,
)
print(np.mean(scores['test_score']))

# Grid search
from sklearn.model_selection import GridSearchCV

# Candidate values for the minimum impurity decrease.
# (A wider grid that also varies max_depth and min_samples_split is
# searched further below.)
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# n_jobs=-1 uses every CPU core available on the system.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# Take the best estimator found by the search and score it on the
# training portion.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

print(dt)

print(gs.cv_results_['mean_test_score'])

# Recover the winning parameter set by hand: index of the highest mean
# cross-validation score, looked up in the list of tried parameter sets.
best_index = gs.cv_results_['mean_test_score'].argmax()
print(gs.cv_results_['params'][best_index])

# Wider grid: vary impurity decrease, tree depth and split size together.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20),
    'min_samples_split': range(2, 100, 10),
}

gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# Best parameter combination and the highest mean cross-validation score.
print(gs.best_params_)
print(gs.cv_results_['mean_test_score'].max())


# Choosing probability distributions to sample from.
from scipy.stats import randint, uniform

# Discrete distribution created with bounds (0, 10); draw 10 samples.
# The drawn values are not stored or printed.
rgen = randint(0, 10)
rgen.rvs(size=10)

# Draw 1000 samples and tally how often each distinct value occurs.
np.unique(rgen.rvs(size=1000), return_counts=True)

# Continuous uniform distribution created with (0, 1); draw 10 samples.
ugen = uniform(0, 1)
ugen.rvs(size=10)

# Randomized search
from sklearn.model_selection import RandomizedSearchCV

# Distributions each hyperparameter is sampled from.
# NOTE(review): scipy's uniform(loc, scale) covers [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011] here -- confirm this
# is the intended range. randint's upper bound is exclusive.
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

# Try 100 sampled parameter combinations on all CPU cores, with fixed seeds
# for both the tree and the search.
rs = RandomizedSearchCV(
    DecisionTreeClassifier(splitter='random', random_state=42),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs.fit(train_input, train_target)

# Keep the best estimator found by the search as the final model.
dt = rs.best_estimator_
print(dt)

print(rs.cv_results_['mean_test_score'].max())

# Final evaluation on the held-out test set.
print(dt.score(test_input, test_target))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB
0.9971133028626413
0.864423076923077
{'fit_time': array([0.00745893, 0.00708222, 0.00738406, 0.0072403 , 0.00734854]), 'score_time': array([0.00077701, 0.00070429, 0.00067973, 0.00068378, 0.00073385]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
0.855300214703487
0.855300214703487
0.8574181117533719
0.9615162593804117
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0001, min_impurity_split=None,
                       min_sample