In [1]:
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.1, random_state=0)

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())
])

# Create a parameter grid
param_grid = [
    {'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
    {'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
]

# Create a grid search object
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best estimator
grid_search.best_estimator_

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [6]:
from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

0.976

In [12]:
from sklearn.model_selection import ShuffleSplit
n_trees=1000
n_instances=100
mini_sets=[]
rs=ShuffleSplit(n_splits=n_trees,test_size=len(X_train)-n_instances,random_state=42)

for mini_train_index,mini_test_index in rs.split(X_train):
    X_mini_train=X_train[mini_train_index]
    y_mini_train=y_train[mini_train_index]
    mini_sets.append((X_mini_train,y_mini_train))
mini_sets

[(array([[-0.65365678,  0.76324506],
         [ 0.00321173,  1.01155499],
         [-0.00279227,  1.16486419],
         [-0.46287804,  0.92104748],
         [ 0.59872025, -0.2936223 ],
         [ 0.58561657,  0.66328337],
         [-0.37313942,  1.10379892],
         [ 1.61998447, -0.27578745],
         [ 1.90766447, -0.13152623],
         [-0.13912966,  1.0275853 ],
         [ 2.01800474,  0.12600348],
         [ 0.79859172,  0.53601201],
         [ 0.00531817,  0.27651236],
         [ 0.59859956,  0.82551311],
         [ 0.84696608, -0.48976481],
         [ 0.76363784,  0.49502268],
         [ 0.09312473,  0.92142142],
         [ 0.48253117,  1.05755909],
         [ 0.86465091, -0.4571322 ],
         [-0.80544238,  0.8441547 ],
         [ 0.02359981,  0.97438665],
         [ 0.14376621,  0.20395777],
         [-0.09803521,  0.18008157],
         [ 1.85998578,  0.00728369],
         [ 0.36129139,  0.79273618],
         [ 1.93589908,  0.62660412],
         [ 0.75688802, -0.28673002],
 

In [15]:
from sklearn.base import clone
import numpy as np
forest = [clone(grid_search.best_estimator_) for _ in range(n_trees)]
accuracy_scores = []

for tree,( x_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(x_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

np.mean(accuracy_scores)

0.928412

In [16]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8) # Empty array to store predictions
for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

# Majority vote
from scipy.stats import mode
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

# Accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_majority_votes.reshape([-1])))

0.968


  y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)
