In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('day6_data1.csv')
data

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [2]:
X = data[['alcohol', 'sugar', 'pH']].to_numpy()
Y = data['class'].to_numpy()

In [3]:
from sklearn.model_selection import train_test_split

t_x, tt_x, t_y, tt_y = train_test_split(X, Y, random_state= 42, test_size= 0.2)  # 검증 데이터 분류

In [4]:
s_t_x, v_t_x, s_t_y, v_t_y = train_test_split(t_x, t_y, random_state= 42, test_size= 0.2)  # 학습, 테스트 데이터 분류

In [5]:
s_t_x.shape, tt_x.shape, v_t_x.shape

((4157, 3), (1300, 3), (1040, 3))

In [6]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state= 42)
dt.fit(s_t_x, s_t_y)
dt.score(s_t_x, s_t_y), dt.score(v_t_x, v_t_y)

(0.9971133028626413, 0.864423076923077)

In [7]:
# 교차 검증
from sklearn.model_selection import cross_validate

sc = cross_validate(dt, t_x, t_y)
sc

{'fit_time': array([0.00798273, 0.01097369, 0.00698042, 0.00598264, 0.00498724]),
 'score_time': array([0.00198913, 0.00099277, 0.0009985 , 0.        , 0.        ]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [8]:
np.mean(sc['test_score'])

0.855300214703487

In [9]:
from sklearn.model_selection import StratifiedKFold

sc1 = cross_validate(dt, t_x, t_y, cv=StratifiedKFold())
np.mean(sc1['test_score'])

0.855300214703487

In [10]:
# 10폴드 교차 검증
sc_ck = StratifiedKFold(n_splits= 10, shuffle= True, random_state= 42)
sc2 = cross_validate(dt, t_x, t_y, cv=sc_ck)
np.mean(sc2['test_score'])

0.8574181117533719

In [11]:
from sklearn.model_selection import GridSearchCV

params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [12]:
gs = GridSearchCV(DecisionTreeClassifier(random_state= 42), params)
gs.fit(t_x, t_y)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [13]:
dt = gs.best_estimator_
dt.score(t_x, t_y), dt.score(tt_x, tt_y)

(0.9615162593804117, 0.8653846153846154)

In [14]:
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [15]:
gs.cv_results_['mean_test_score']

array([0.86819297, 0.86453617, 0.86492226, 0.86780891, 0.86761605])

In [16]:
i = np.argmax(gs.cv_results_['mean_test_score'])

In [17]:
gs.cv_results_['params'][i]

{'min_impurity_decrease': 0.0001}

In [19]:
params = {'max_depth': range(5, 20, 1),
          'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'min_samples_split': range(2, 100, 10)
}

gs1 = GridSearchCV(DecisionTreeClassifier(random_state= 42), params)
gs1.fit(t_x, t_y)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [20]:
gs1.best_params_

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}

In [22]:
np.max(gs1.cv_results_['mean_test_score'])

0.8683865773302731

### 랜덤서치

In [23]:
from scipy.stats import uniform, randint

In [25]:
d = randint(0, 10)
d.rvs(5)

array([6, 4, 8, 2, 7])

In [27]:
np.unique(d.rvs(1000), return_counts= True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 86, 115,  96, 102, 103, 111,  92,  91, 112,  92], dtype=int64))

In [30]:
d = uniform(0, 1)
d.rvs(5)

array([0.17071465, 0.4984458 , 0.1530186 , 0.39980554, 0.58557872])

In [31]:
params = {'max_depth': randint(20, 50),
          'min_impurity_decrease': uniform(0.0001, 0.001),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25) 
}

In [32]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state= 42), params, n_iter= 100, random_state= 42)
rs.fit(t_x, t_y)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002692A3466D0>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002692A84DC70>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002692A5674C0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002692A56B760>},
                   random_state=42)

In [33]:
rs.best_params_

{'max_depth': 39,
 'min_impurity_decrease': 0.00034102546602601173,
 'min_samples_leaf': 7,
 'min_samples_split': 13}