In [1]:
import numpy as np 
import pandas as pd
from sklearn.datasets import make_moons

In [2]:
df = make_moons(n_samples=10000,noise=0.4)

In [6]:
df

(array([[-1.33333649, -0.21608747],
        [ 1.41054618, -0.43996166],
        [-0.19127534,  0.87283261],
        ...,
        [-0.08691336, -0.05472676],
        [ 1.1436592 , -0.18702979],
        [-0.50733604,  1.02818761]]),
 array([0, 1, 0, ..., 1, 1, 0]))

In [7]:
X = df[0]

In [8]:
X

array([[-1.33333649, -0.21608747],
       [ 1.41054618, -0.43996166],
       [-0.19127534,  0.87283261],
       ...,
       [-0.08691336, -0.05472676],
       [ 1.1436592 , -0.18702979],
       [-0.50733604,  1.02818761]])

In [9]:
y = df[1]

In [10]:
y

array([0, 1, 0, ..., 1, 1, 0])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [16]:
from sklearn.model_selection import GridSearchCV
tree_clf = DecisionTreeClassifier()

params_grid = {
    'max_depth':[None,5,10,20],
    'max_leaf_nodes':[5,10,15],
    'min_samples_leaf': [1, 2, 4]

}
grid_search = GridSearchCV(tree_clf,param_grid=params_grid,scoring="accuracy")
grid_search.fit(X_train,y_train)

In [17]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': None, 'max_leaf_nodes': 15, 'min_samples_leaf': 1}


In [18]:
best_model = grid_search.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score, classification_report


y_test_pred = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

print("Classification Report:\n", classification_report(y_test, y_test_pred))


Test Accuracy: 0.8605
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86       994
           1       0.88      0.84      0.86      1006

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [22]:
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=1000,train_size=100,random_state=42)

In [24]:
subsets = []
for train_indicies, _ in shuffle_split.split(X_train):
    X_subset = X_train[train_indicies]
    y_subset = y_train[train_indicies]
    subsets.append((X_subset,y_subset))

In [25]:
accuracies = []
for X_subset,y_subset in subsets:
    tree = DecisionTreeClassifier(**grid_search.best_params_)
    tree.fit(X_subset,y_subset)
    y_test_pred = tree.predict(X_test)
    accuracy = accuracy_score(y_test,y_test_pred)
    accuracies.append(accuracy)
accuracies = np.array(accuracies)
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()
print(f"Average accuracy of the 1,000 trees: {mean_accuracy:.2f}")
print(f"Standard deviation of accuracy: {std_accuracy:.2f}")

Average accuracy of the 1,000 trees: 0.80
Standard deviation of accuracy: 0.03


In [26]:
from scipy import stats
all_predictions = []

for X_subset, y_subset in subsets:
    tree = DecisionTreeClassifier(**grid_search.best_params_)
    tree.fit(X_subset, y_subset)
    y_pred = tree.predict(X_test)
    all_predictions.append(y_pred)
all_predictions = np.array(all_predictions)
majority_vote_predictions, _ = stats.mode(all_predictions, axis=0)
majority_vote_predictions = majority_vote_predictions.flatten()
majority_vote_accuracy = accuracy_score(y_test, majority_vote_predictions)
print(f"Majority-vote accuracy: {majority_vote_accuracy:.2f}")


Majority-vote accuracy: 0.87


In [None]:

original_tree_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print(f"Original Decision Tree accuracy: {original_tree_accuracy:.2f}")
print(f"Majority-vote (Random Forest) accuracy: {majority_vote_accuracy:.2f}")
print(f"Improvement: {majority_vote_accuracy - original_tree_accuracy:.2f} %")


Original Decision Tree accuracy: 0.86
Majority-vote (Random Forest) accuracy: 0.87
Improvement: 0.01 %
