In [136]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# 7) Train and fine-tune a decision tree for the moons dataset.
X, y = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y)

dt = DecisionTreeClassifier()
parameters = {
    'max_depth': [2, 4, 8, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [0.05, 0.1, 0.2, 0.4]
}

clf = GridSearchCV(
    estimator=dt,
    param_grid=parameters,
    cv=3,
    n_jobs=-1
)

clf.fit(X_train , y_train)

dt_tuned = DecisionTreeClassifier(**clf.best_params_)
dt_tuned.fit(X_train, y_train)

y_pred = dt_tuned.predict(X_test)
accuracy_score(y_test, y_pred)
# Nice, we got a 0.86 accuracy

0.856

In [134]:
from sklearn.model_selection import ShuffleSplit
import numpy as np
from scipy import stats

def generate_tree(X, y, Classifier, params, splits):
    trees = []
    for idx, _ in indexes.split(X):
        small_tree = Classifier(**params)
        small_tree.fit(X[idx, :], y[idx])
        trees.append(small_tree)
    return trees

indexes = ShuffleSplit(n_splits=1000, train_size=0.01)
trees = generate_tree(X, y, DecisionTreeClassifier, clf.best_params_, indexes)

scores = [accuracy_score(y_test, tree.predict(X_test)) for tree in trees]
print(np.mean(scores))

0.8138204


In [135]:
import pandas as pd

predicts = np.zeros((len(X_test), len(trees)))

for i in range(len(trees)):
    predicts[:, i] = trees[i].predict(X_test)
    
trees_predicts_df = pd.DataFrame(predicts, columns=[f'Tree {i}' for i in range(0, len(trees))])
y_trees_mode = trees_predicts_df.apply(lambda row: stats.mode(row).mode[0], axis=1)

accuracy_score(y_test, y_trees_mode)

0.862