In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy

from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.base import clone
from sklearn.metrics import accuracy_score

In [10]:
# Synthetic two-class "moons" dataset: 10,000 samples with Gaussian noise (std 0.4).
# A fixed random_state makes the generated data identical on every
# Restart & Run All; previously only the train/test split was seeded.
data = make_moons(n_samples=10000, noise=.4, random_state=42)
X, y = data

# Hold out 20% of the samples as a test set (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2)

In [89]:
# Base estimator for the search; seeded so tie-breaking between splits is repeatable.
tree_clf = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid explored exhaustively by the grid search below.
param_grid = {
    'max_depth': np.arange(1, 7),            # depths 1..6
    'max_leaf_nodes': np.arange(2, 100, 2),  # even values 2..98
    'criterion': ['gini'],
    'min_samples_split': [2, 3, 4],
}

# 3-fold cross-validated grid search scored by accuracy, using all CPU cores.
tree_grid_clf = GridSearchCV(
    tree_clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
tree_grid_clf.fit(X_train, y_train)

In [90]:
# Take the best estimator found by the grid search and measure its
# accuracy on the held-out test set.
best_model = tree_grid_clf.best_estimator_
test_predictions = best_model.predict(X_test)
test_score = accuracy_score(y_test, test_predictions)
print('Test set accuracy: ', test_score)

Test set accuracy:  0.855


In [91]:
# Display the winning estimator (and its selected hyper-parameters) as the cell output.
tree_grid_clf.best_estimator_

In [92]:
# Build `n_trees` random subsets of the training set, each containing
# `n_instances` samples.

n_trees = 1000
n_instances = 100

# ShuffleSplit is told to put everything except `n_instances` samples in the
# "test" side, so the "train" side of every split is the small subset we keep.
rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)

# One (features, labels) pair per split; the large "test" side is discarded.
mini_set = [
    (X_train[subset_index], y_train[subset_index])
    for subset_index, _ in rs.split(X_train)
]

In [93]:
# One unfitted copy of the tuned tree per mini training subset.
forrest = [clone(best_model) for _ in range(n_trees)]

# Row i holds tree i's predictions for every test instance.
# uint8 is used for compactness; this assumes the class labels are small
# non-negative integers (the labels come from make_moons).
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

accuracy_scores = []
for tree_index, (tree, (mini_x_train, mini_y_train)) in enumerate(zip(forrest, mini_set)):
    tree.fit(mini_x_train, mini_y_train)

    # Predict once per tree and reuse the result for both the individual
    # accuracy and the vote matrix (previously every tree predicted twice).
    y_pred = tree.predict(X_test)
    Y_pred[tree_index] = y_pred
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Mean test accuracy of the individual small trees.
print(np.mean(accuracy_scores))

0.793693


In [94]:
# Majority vote: along axis 0 (across trees) take the most frequent predicted
# label for each test instance, together with how many trees voted for it.
vote_result = scipy.stats.mode(Y_pred, axis=0)
y_pred_majority_votes = vote_result.mode
n_votes = vote_result.count

In [95]:
# Accuracy of the ensemble's majority-vote predictions on the test set.
# The votes are flattened to 1-D first so the shape matches y_test whether or
# not the mode result carries an extra leading axis.
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.8635