In [8]:
from sklearn.datasets import make_moons

# Generate the noisy "moons" toy dataset; the fixed seed keeps it reproducible.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set (seeded so the split is repeatable).
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

In [10]:
from sklearn.model_selection import ShuffleSplit

n_subsets = 1000  # number of training subsets to draw
n_samples = 100   # rows of X_train kept in each subset

# Only the "train" side of each split is kept: sizing the "test" side to
# everything except n_samples rows leaves exactly n_samples training indices.
rs = ShuffleSplit(
    n_splits=n_subsets,
    test_size=len(X_train) - n_samples,
    random_state=42,
)

# One (features, labels) pair per split; the held-out indices are not needed.
subsets = [
    (X_train[kept_idx], y_train[kept_idx])
    for kept_idx, _held_out_idx in rs.split(X_train)
]

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 98 leaf-count limits x 3 split thresholds = 294 candidates.
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
}
base_tree = DecisionTreeClassifier(random_state=42)
grid_search_cv = GridSearchCV(base_tree, params, verbose=1, cv=3)

# Run the 3-fold cross-validated search over the whole training split.
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


In [5]:
# Display the estimator (with its hyperparameters) selected by the grid search.
grid_search_cv.best_estimator_

In [11]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
import numpy as np


# One unfitted copy of the best tree per training subset.
template_tree = grid_search_cv.best_estimator_
forest = [clone(template_tree) for _ in range(n_subsets)]

# Fit every tree on its own small subset and record its accuracy on the
# shared test set. fit() returns the estimator itself, so the calls chain.
accuracy_scores = [
    accuracy_score(y_test, tree.fit(X_sub, y_sub).predict(X_test))
    for tree, (X_sub, y_sub) in zip(forest, subsets)
]

# Average accuracy of the individual trees.
np.mean(accuracy_scores)

0.805471

In [13]:
# Prediction matrix: row i holds tree i's predicted labels for every test
# sample, stored as uint8 exactly like the preallocated-array version.
Y_pred = np.stack([tree.predict(X_test) for tree in forest]).astype(np.uint8)

In [24]:
from scipy.stats import mode

# Majority vote: the most common prediction for each test sample, taken down
# the tree axis (axis 0), together with how many trees cast that vote.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes = vote_result.mode
n_votes = vote_result.count

In [15]:
# Accuracy of the ensemble's majority vote on the held-out test set; the vote
# array is flattened to 1-D so it lines up with y_test.
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.872