# Load Data

In [1]:
from sklearn.datasets import make_moons

data = make_moons(n_samples=10000, noise= 0.4)
X = data[0]
y = data[1]

# Explore Data

In [2]:
import pandas as pd
import numpy as np

y.mean() #Values are either 0 or 1
X.mean() # values range from -2.17 to 3.23

0.3772409218897265

# Split Data

In [3]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

# Train Decision Tree using Grid Search
Typically, you would train several models. 

Gini impurity is the default but I have assigned it here to remind us that is still there and that we have an option of using Entropy impurity instead. 

max_leaf_nodes = maximum number of leaf nodes
min_samples_split = minimum number of instances per node before it can be split. 

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

tree_clf = DecisionTreeClassifier(random_state= 42, criterion= 'gini') #Use Gini not entropy
parameters = {'max_leaf_nodes' : list(range(2, 100)), 'min_samples_split' : list(range(2, 10))}
grid__tree = GridSearchCV(estimator= tree_clf, cv= 3, error_score= 'accuracy_score',
                         param_grid=parameters, verbose= 1)

grid__tree.fit(X_train, y_train)

Fitting 3 folds for each of 784 candidates, totalling 2352 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2352 out of 2352 | elapsed:   33.3s finished


GridSearchCV(cv=3, error_score='accuracy_score',
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9,

In [5]:
best_grid = grid__tree.best_estimator_

In [6]:
from sklearn.metrics import accuracy_score

y_pred = best_grid.predict(X_test)
accuracy_score(y_pred, y_test)

0.868

# 85.95% accuracy is not bad. 

#### Let's try use ShuffledSplit to create a Random Forest Classifer (without actually using SKLearn's RandomForest classifier).

Hopefully, this will improve our results!! 

#### First, Shuffle the Data set into seperate test sets

In [7]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(random_state= 42, n_splits=1000, train_size= 100) #split the set by 1000

mini_tests = []

for train_index, test_index in ss.split(X_train):
    X_split_train = X_train[train_index]
    y_split_train = y_train[train_index]
    mini_tests.append((X_split_train, y_split_train))


#### now that we have 1000 subsets of 100 instances, we can now train our forest using the best_estimator value produced by the Grid Search CV. 

### We need to reproduce the same tree from before, <u> do not fit the same tree with the same model across the small samples </u> . If you train the same model over and over again, you will see accuracy jump up to 90%. This is because you are overfitting the data. I have kept the incorrect code hashed out, explore the difference. The sklearn's 'clone' function clones the model again and again. 


### Correctly implementing the forest will lead to a reduction in the mean accuracy score because there is a smaller data set to train the model to. 

In [8]:
# wrong_accuracy_scores= []

# for sets in mini_tests:
#     X_mini = sets[0]
#     y_mini = sets[1]
#     best_grid.fit(X_mini, y_mini)
#     y_mini_pred = best_grid.predict(X_mini)
#     wrong_accuracy_scores.append(accuracy_score(y_mini_pred, y_mini))
# print(np.mean(wrong_accuracy_scores))


import numpy as np
from sklearn.base import clone

forests = [clone(best_grid) for _ in range(1000)]

accuracy_scores= []

for forest, sets in zip(forests, mini_tests):
    X_mini = sets[0]
    y_mini = sets[1]
    forest.fit(X_mini, y_mini)
    y_mini_pred = best_grid.predict(X_mini)
    accuracy_scores.append(accuracy_score(y_mini_pred, y_mini))

np.mean(accuracy_scores)

0.86616

### Find the Mode across each prediction, across the 1000's of subsets, using SciPy's mode() function. This gives the majority vote prediction over the test set. 

In [9]:
import numpy as np 

#create an empty array where you will hold all the predictions of y for each tree
#Splitting the subset where the rows represent a tree's prediction
#and each Column represents  prediction for that number of X_test
y_pred = np.empty([1000, len(X_test)], dtype=np.uint8)  #convert all numbers into integers


for tree_index, tree in enumerate(forests):
    y_pred[tree_index] = tree.predict(X_test)
# best_grid.predict(X_test) #reminder that .predict gives you a predictions as a 1 x m matrix.

####  y_pred are the predictions for X_test where the rows represent a trained tree (i.e. 1000 trees) and the columns represents the predictions of X_test for each instance
for example: 
                

###### trees|            X_test_predictions    
######                      0    |  0     1     0     1    0    1

######                      1    |  1     0     1     0    1    1

######                      2    |  0     1     0     1    0    0  

######                      3    |  0     1     0     1    0    1 

In [10]:
#find the most 'voted' prediction across all the trees and use that to calculate the accuracy 

from scipy.stats import mode

y_pred_forest_mode, votes = mode(y_pred, axis=0)
y_pred_forest_mode#.reshape(-1,1) #data so that each instance's prediction is a row not column

array([[1, 1, 1, ..., 0, 0, 0]], dtype=uint8)

In [11]:
accuracy_score(y_pred_forest_mode.reshape(-1,1), y_test)

0.8715

# 86.25% accuracy is a marginal increase compared to the 85.95% accuracy previously. 

# A 0.30% accuracy increase for manual implementation of a RandomForestClassifier.
Now you know how it works! Glad we have the Class to do all this for us!

