In [25]:
import numpy as np
from scipy.stats import mode
from sklearn.base import clone
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier

# Generate the make_moons dataset: 10,000 samples, noise 0.4, fixed seed.
X, Y = make_moons(n_samples=10000, noise=0.4, random_state=42)

# Hold out 20% of the samples for testing; the split is seeded as well.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of the training features and labels.
print(f'X: {X_train.shape}')
print(f'Y: {Y_train.shape}')


X: (8000, 2)
Y: (8000,)


In [27]:
# Seeded decision tree that the grid search tunes.
dt = DecisionTreeClassifier(random_state=42)

# Candidate hyperparameter values: 5 * 4 * 3 = 60 combinations.
param_grid = {
    'max_depth': [3, 5, 6, 8, 10],
    'max_leaf_nodes': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search over the grid, scored by accuracy with cv=10 and
# parallelised with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

best_params = grid_search.best_params_
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# parameter combination, not accuracy measured on the full training set,
# even though the label printed below calls it "Training Accuracy".
best_accuracy = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Training Accuracy:", best_accuracy)

Best Parameters: {'max_depth': 10, 'max_leaf_nodes': 30, 'min_samples_split': 2}
Best Training Accuracy: 0.85975


In [28]:
# Evaluate the tuned model (grid_search predicts with its best estimator)
# on the held-out test set.
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Test Accuracy with Best Parameters:", test_accuracy)

Test Accuracy with Best Parameters: 0.8735


In [31]:
n_trees = 100       # number of trees (and of training subsets)
n_instances = 100   # training samples per subset

# ShuffleSplit yields n_trees random splits; making the "test" side
# everything except n_instances leaves exactly n_instances indices on the
# "train" side of each split.
rs = ShuffleSplit(
    n_splits=n_trees,
    test_size=len(X_train) - n_instances,
    random_state=42,
)

# One (features, labels) pair per split, built from the train-side indices.
mini_sets = [
    (X_train[subset_index], Y_train[subset_index])
    for subset_index, _ in rs.split(X_train)
]

# Fit a fresh copy of the tuned tree on each subset and score it on the
# full test set.
forest = []
accuracy_scores = []
for X_mini_train, y_mini_train in mini_sets:
    tree = clone(grid_search.best_estimator_)
    tree.fit(X_mini_train, y_mini_train)
    forest.append(tree)

    # NOTE: this rebinds the notebook-level y_pred on every iteration.
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(Y_test, y_pred))

average_single_tree_accuracy = np.mean(accuracy_scores)
print("Average Single Tree Accuracy:", average_single_tree_accuracy)

Average Single Tree Accuracy: 0.797075


In [33]:
# Prediction matrix: one row per tree, one column per test sample.
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)
for prediction_row, tree in zip(Y_pred, forest):
    # Each row is a view into Y_pred, so slice assignment fills it in place.
    prediction_row[:] = tree.predict(X_test)

# Majority vote: the most frequent label down each column.
vote_result = mode(Y_pred, axis=0)
# Flatten so the votes are 1-D whether or not mode() keeps the reduced axis.
y_pred_majority_votes = vote_result.mode.reshape(-1)
ensemble_accuracy = accuracy_score(Y_test, y_pred_majority_votes)

print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.872
