In [2]:
import pandas as pd

wine = pd.read_csv('https://bit.ly/wine_csv_data')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [14]:
data = wine[['alcohol', 'sugar', 'pH']]
target = wine[['class']]

In [15]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target =\
train_test_split(data, target, random_state=42)

In [16]:
sub_input, val_input, sub_target, val_target =\
train_test_split(train_input, train_target, random_state=42)

In [17]:
print(data.shape, train_input.shape, test_input.shape)
print(sub_input.shape, val_input.shape)

(6497, 3) (4872, 3) (1625, 3)
(3654, 3) (1218, 3)


In [26]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print()
print()

In [27]:
from sklearn.model_selection import cross_validate

scores = cross_validate(dt, train_input, train_target)

import numpy as np
np.mean(scores['test_score'])

np.float64(0.8546818301479492)

In [28]:
from sklearn.model_selection import StratifiedKFold

splitter = StratifiedKFold(n_splits=10, shuffle=True)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
scores

{'fit_time': array([0.01452065, 0.0109539 , 0.00700665, 0.00703955, 0.00798488,
        0.00700879, 0.00806856, 0.01400042, 0.00699973, 0.00700021]),
 'score_time': array([0.00404692, 0.00303006, 0.00296354, 0.00200939, 0.00200605,
        0.00195646, 0.00500393, 0.00200033, 0.00200057, 0.0030005 ]),
 'test_score': array([0.84631148, 0.8545082 , 0.87268994, 0.87268994, 0.86858316,
        0.85420945, 0.84394251, 0.862423  , 0.83778234, 0.862423  ])}

In [29]:
from sklearn.model_selection import GridSearchCV

In [54]:
params = {
    'min_impurity_decrease' : [0.0001, 0.0002, 0.0003, 0.0004, 0.0005] 
}

In [55]:
dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(dt, params)

In [56]:
gs.fit(train_input, train_target)

In [57]:
gs.best_estimator_

In [58]:
gs.best_params_

{'min_impurity_decrease': 0.0003}

In [59]:
gs.cv_results_

{'mean_fit_time': array([0.00801253, 0.00941935, 0.00499554, 0.00459595, 0.00458722]),
 'std_fit_time': array([1.91765045e-03, 1.86253547e-03, 2.30122225e-05, 4.88124423e-04,
        4.83394393e-04]),
 'mean_score_time': array([0.00319557, 0.00378046, 0.00201306, 0.00220327, 0.00200438]),
 'std_score_time': array([1.46398203e-03, 1.47031055e-03, 1.75548621e-05, 3.97935176e-04,
        2.55795470e-05]),
 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'min_impurity_decrease': 0.0001},
  {'min_impurity_decrease': 0.0002},
  {'min_impurity_decrease': 0.0003},
  {'min_impurity_decrease': 0.0004},
  {'min_impurity_decrease': 0.0005}],
 'split0_test_score': array([0.87384615, 0.87076923, 0.87282051, 0.86461538, 0.86051282]),
 'split1_test_score': array([0.86666667, 0.86871795, 0.87794872, 0.88512821, 0.87794872]),
 'split2_test_score': array([0.88603696, 

In [66]:
import numpy as np

params = {
    'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
    'max_depth' : range(5, 20, 1),
    'min_samples_split' : range(2, 100, 10)
}

In [67]:
dt = DecisionTreeClassifier(random_state=42)

gs = GridSearchCV(dt, params, n_jobs=-1)

In [68]:
gs.fit(train_input, train_target)

  _data = np.array(data, dtype=dtype, copy=copy,


In [70]:
gs.best_params_

{'max_depth': 15,
 'min_impurity_decrease': np.float64(0.0001),
 'min_samples_split': 22}

In [71]:
gs.cv_results_['mean_test_score']

array([0.85837161, 0.85837161, 0.85837161, ..., 0.86309693, 0.86309693,
       0.86309693])

In [72]:
from scipy.stats import uniform, randint

In [73]:
randint(0,10).rvs(10)

array([7, 3, 8, 5, 0, 9, 5, 3, 2, 8])

In [75]:
rgen = randint(0,10)
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 85, 116,  80, 114, 102, 101,  97, 104, 100, 101]))

In [76]:
params = {
    'min_impurity_decrease' : uniform(0.0001, 0.001),
    'max_depth' : randint(20,50),
    'min_samples_split' : randint(2, 25),
    'min_samples_leaf' : randint(1,25)
} 

In [77]:
from sklearn.model_selection import RandomizedSearchCV

In [79]:
gs = RandomizedSearchCV(dt, params, n_iter=100, n_jobs=-1)
gs.fit(train_input, train_target)

  _data = np.array(data, dtype=dtype, copy=copy,


In [80]:
gs.best_estimator_

In [81]:
gs.best_params_

{'max_depth': 35,
 'min_impurity_decrease': np.float64(0.00021097087096670533),
 'min_samples_leaf': 2,
 'min_samples_split': 22}