### Simple GridSearchCV with DecisionTreeClassifier

End-of-chapter exercise (Chapter 6) from Aurélien Géron's famous *Hands-On Machine Learning with Scikit-Learn & TensorFlow*.

In [1]:
from sklearn.datasets import make_moons

# Synthetic two-class "moons" data set: 10,000 samples generated with noise=0.4.
# The fixed seed keeps the generated samples reproducible across runs.
X, y = make_moons(
    n_samples=10_000,
    noise=0.4,
    random_state=42,
)

# Show the feature matrix together with the label vector.
X, y

(array([[ 0.9402914 ,  0.12230559],
        [ 0.12454026, -0.42477546],
        [ 0.26198823,  0.50841438],
        ...,
        [-0.24177973,  0.20957199],
        [ 0.90679645,  0.54958215],
        [ 2.08837082, -0.05050728]]),
 array([1, 0, 0, ..., 1, 0, 1], dtype=int64))

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
import numpy as np

# Rough yardstick for max depth: log2 of the number of training instances,
# rounded up to the next whole level.
approx_depth = np.ceil(np.log2(len(X_train)))
print(f'Approx. tree depth without restrictions: {approx_depth:0.0f}')

Approx. tree depth without restrictions: 13


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Search space: every leaf-node budget from 2 to 199 crossed with five
# minimum-split sizes.
params = dict(
    max_leaf_nodes=list(range(2, 200)),
    min_samples_split=[2, 3, 4, 5, 6],
)

# 3-fold cross-validated grid search over a seeded decision tree,
# run on 6 parallel workers with progress logging enabled.
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    n_jobs=6,
    verbose=1,
)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 990 candidates, totalling 2970 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 tasks      | elapsed:    1.9s
[Parallel(n_jobs=6)]: Done 2060 tasks      | elapsed:   10.6s
[Parallel(n_jobs=6)]: Done 2970 out of 2970 | elapsed:   14.7s finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42), n_jobs=6,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...],
                         'min_samples_split': [2, 3, 4, 5, 6]},
             verbose=1)

In [28]:
# Display the winning estimator selected by the grid search.
grid_search_cv.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)

In [29]:
# List, in alphabetical order, the keys available in the cross-validation results.
sorted(grid_search_cv.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_leaf_nodes',
 'param_min_samples_split',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

**A. Geron**: By default, `GridSearchCV` trains the best model found on the whole training set (you can change this by setting `refit=False`), so we don't need to do it again. We can simply evaluate the model's accuracy:

In [30]:
from sklearn.metrics import accuracy_score

# Predict the held-out test set with the fitted search object, then score it.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

In [31]:
# Report every parameter combination that ties for the best mean CV accuracy.
mean_scores = grid_search_cv.cv_results_['mean_test_score']

# Compute the maximum once; the original recomputed it on every iteration.
best_mean_score = max(mean_scores)

for i, v in enumerate(mean_scores):
    # Exact comparison is intentional: the maximum is an element of the same array.
    if v == best_mean_score:
        print('Max mean test accuracy:', round(v, 4),
              '\nParams:', grid_search_cv.cv_results_['params'][i])

Max mean test accuracy: 0.8555 
Params: {'max_leaf_nodes': 17, 'min_samples_split': 2}
Max mean test accuracy: 0.8555 
Params: {'max_leaf_nodes': 17, 'min_samples_split': 3}
Max mean test accuracy: 0.8555 
Params: {'max_leaf_nodes': 17, 'min_samples_split': 4}
Max mean test accuracy: 0.8555 
Params: {'max_leaf_nodes': 17, 'min_samples_split': 5}
Max mean test accuracy: 0.8555 
Params: {'max_leaf_nodes': 17, 'min_samples_split': 6}


In [32]:
# Retrain on the entire training set with the parameters the grid search selected.
# They are read from `best_params_` rather than copied by hand, so this cell cannot
# drift out of sync with the search above. Several `min_samples_split` values tie
# for the best score; `best_params_` holds the one the search ranked first.
dtree = DecisionTreeClassifier(random_state=42, **grid_search_cv.best_params_)
dtree.fit(X_train, y_train)

# predict on test set
y_preds = dtree.predict(X_test)

# accuracy on test target (displayed as the cell output)
accuracy_score(y_test, y_preds)

0.8695

---

### Grow a Random Forest

With 1,000 trees, each trained on a random subset of 100 training instances.

In [38]:
from sklearn.model_selection import ShuffleSplit

# 1,000 random splits of the training set. Sending all but 100 rows to the
# "test" side leaves exactly 100 rows on the "train" side of each split.
rs = ShuffleSplit(n_splits=1000, test_size=len(X_train) - 100, random_state=42)

# Keep only the small "train" side of each split as an (X, y) pair.
subsets = [
    (X_train[train_sub_ix], y_train[train_sub_ix])
    for train_sub_ix, _ in rs.split(X_train)
]

In [39]:
# Peek at the first ten feature rows and labels of the first subset.
subsets[0][0][:10], subsets[0][1][:10]

(array([[-0.31532549,  0.49432266],
        [ 1.07395888, -0.38300687],
        [ 1.2336808 , -0.20272754],
        [ 1.45327595, -0.49765049],
        [ 0.62940312, -0.45805718],
        [ 1.31621613, -0.49634063],
        [ 0.66160502, -0.52512066],
        [ 1.17772151,  0.21289673],
        [ 1.27074026,  0.83761848],
        [ 0.24077774, -0.40528032]]),
 array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64))

In [46]:
from sklearn.base import clone

# Build 1,000 separate copies of the best tree found by the grid search,
# one for each training subset.
best_tree = grid_search_cv.best_estimator_
forest = [clone(best_tree) for _ in range(1000)]

In [54]:
# Show the first five trees of the forest.
forest[:5]

[DecisionTreeClassifier(max_leaf_nodes=17, random_state=42),
 DecisionTreeClassifier(max_leaf_nodes=17, random_state=42),
 DecisionTreeClassifier(max_leaf_nodes=17, random_state=42),
 DecisionTreeClassifier(max_leaf_nodes=17, random_state=42),
 DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)]

In [51]:
# Fit each tree of the forest on its own subset and record its test accuracy.
# The trees share the same hyperparameters before fitting; they are kept as
# separate objects so that each one retains the fit from its own subset.
accuracy_scores = []

for tree, (X_sub_train, y_sub_train) in zip(forest, subsets):
    tree.fit(X_sub_train, y_sub_train)
    accuracy_scores.append(accuracy_score(y_test, tree.predict(X_test)))

# Average accuracy of the individual trees.
np.mean(accuracy_scores)

0.8054499999999999

In [82]:
# my old code
# Collect the "train" side of every split as parallel lists of features and labels.
X_rf, y_rf = [], []
for train_ix, _ in rs.split(X_train, y_train):
    X_rf.append(X_train[train_ix])
    y_rf.append(y_train[train_ix])

clf = DecisionTreeClassifier(random_state=42, max_leaf_nodes=17)

# Refit the same classifier on each subset in turn and score it on the test set.
accs = []
for X_sub, y_sub in zip(X_rf, y_rf):
    clf.fit(X_sub, y_sub)
    accs.append(accuracy_score(y_test, clf.predict(X_test)))

In [83]:
# Average test accuracy over all the per-subset fits of the old code.
np.mean(accs)

0.8054499999999999

**Magic:** 

- for each test set instance, generate the predictions of the 1000 trees 
- keep only the most frequent prediction (the *mode*)

This procedure gives you the majority-vote predictions over the test set.

In [84]:
# Prediction matrix: one row per tree in the forest, one column per test instance.
# The row count is taken from the forest itself so the two cannot get out of sync.
# np.empty leaves the contents uninitialized; every row is overwritten later.
Y_pred = np.empty([len(forest), len(X_test)], dtype=np.uint8)

In [85]:
Y_pred.shape # (trees, test instances); the matrix is allocated but not yet filled

(1000, 2000)

In [87]:
# Write each tree's test-set predictions into its own row of the matrix.
for row, tree in zip(Y_pred, forest):
    row[:] = tree.predict(X_test)

In [88]:
# Test-set predictions of the first tree alone.
forest[0].predict(X_test)

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [89]:
# Test-set predictions of the second tree alone, for comparison with the first.
forest[1].predict(X_test)

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [90]:
from scipy.stats import mode

# Mode down each column: the most frequent prediction per test instance,
# together with how many trees voted for it.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes, n_votes = vote_result.mode, vote_result.count

In [91]:
# Flatten the vote result to one label per test instance before scoring.
majority_labels = y_pred_majority_votes.reshape(-1)
accuracy_score(y_test, majority_labels)

0.872

In [92]:
# my old code
from scipy import stats

# Refit the single classifier on every subset and keep its test-set predictions.
y_preds = []
for X_sub, y_sub in zip(X_rf, y_rf):
    clf.fit(X_sub, y_sub)
    y_preds.append(clf.predict(X_test))

In [93]:
# Stack the per-fit prediction vectors into one 2-D array, one row per fit.
y_preds_array = np.stack(y_preds, axis=0)

In [94]:
# Most frequent prediction down each column (axis 0 is also the default).
majority_vote = stats.mode(y_preds_array, axis=0)

In [95]:
# Majority-vote label for each test instance. Depending on the SciPy version,
# `mode` returns the result with or without the reduced axis, so flatten it
# instead of indexing with [0][0] (which yields a single scalar when the axis
# has been dropped).
majority_vote.mode.reshape(-1)

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [96]:
# Score the majority vote. The mode array is flattened rather than indexed with
# [0][0], because `mode` returns it with or without the reduced axis depending
# on the SciPy version, and [0][0] would then pick out a single scalar.
accuracy_score(y_test, majority_vote.mode.reshape(-1))

0.872

---