## Train and fine-tune a Decision Tree for the moons dataset

In [3]:
from sklearn.datasets import make_moons

moon = make_moons(n_samples=10000, noise=0.4)
X = moon[0]
y = moon[1]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from numpy.random import randint

tree_clf = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(),
    param_distributions=dict(max_depth=list(range(4,9)),
                             max_leaf_nodes=list(range(6, 100)),
                             criterion=('gini', 'entropy', 'log_loss'),
                             splitter=('best', 'random')),
    n_iter=1000,
    cv=3)

In [6]:
tree_clf.fit(X_train, y_train)

In [7]:
tree_clf.best_params_

{'splitter': 'random',
 'max_leaf_nodes': 34,
 'max_depth': 8,
 'criterion': 'entropy'}

In [8]:
tree_clf.best_score_

0.857498589120471

## Make a Random Forest out of a 1000 Decision Trees

In [107]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=7, max_leaf_nodes=18)
clf.fit(X_train, y_train)
tree_pred = clf.predict(X_val)

In [108]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, tree_pred)

0.854

In [109]:
from sklearn.model_selection import ShuffleSplit

subsets = ShuffleSplit(n_splits=1000, train_size=0.01, test_size=0.002)

In [19]:
subsets.get_n_splits(X)

1000

In [105]:
from scipy.stats import mode
import numpy as np

instance = 0
subset_pred = np.zeros(shape=(subsets.n_splits, len(y_val)))
y_val_predictions = np.zeros(len(y_val))
for train_index, test_index in subsets.split(X):
    clf.fit(X[train_index], y[train_index])
    clf_pred = clf.predict(X_val)
    subset_pred[instance] = clf_pred
    instance += 1
for set_pred in range(len(subset_pred[0])):
    majority_voting = mode(subset_pred[:, set_pred])
    y_val_predictions[set_pred] = majority_voting.mode[0]

In [110]:
np.unique(y_val_predictions, return_counts=True)

(array([0., 1.]), array([ 969, 1031]))

In [111]:
accuracy_score(y_val, y_val_predictions)

0.859

In [72]:
y_val_predictions

array([1., 1., 1., ..., 0., 0., 0.])