### This exercise is related to exercise 7

In [2]:
from sklearn.datasets import make_moons
from sklearn.model_selection import  train_test_split

# Generate a noisy two-class "moons" dataset.
# 125,000 samples with a 20% test split leave 100,000 training instances,
# i.e. enough for 1,000 subsets of 100 instances each.
X, y = make_moons(
    n_samples=125000,
    noise=0.4,
    random_state=42,
)

# Hold out 20% of the data as the test set (25,000 instances).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### We create shuffled subsets of the data

In [6]:
from sklearn.model_selection import ShuffleSplit

# ShuffleSplit yields n_splits independent random (train, test) index pairs;
# only the 100 train indices of each split are used here.
# NOTE(review): the splits are drawn independently, so subsets may share
# instances with each other.
shuffle_split = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)

# One (features, labels) pair per split, selected with the train indices.
subsets = [
    (X_train[train_index], y_train[train_index])
    for train_index, _ in shuffle_split.split(X_train, y_train)
]

# Sanity checks: number of subsets and the shape of the first one.
print("Number of Subsets:", len(subsets))

print("Size of One Subset (X):", subsets[0][0].shape)
print("Size of One Subset (y):", subsets[0][1].shape)

Number of Subsets: 1000
Size of One Subset (X): (100, 2)
Size of One Subset (y): (100,)


### Now we train one Decision Tree on each subset

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Fit one independent Decision Tree per subset, all with the same
# hyperparameters. fit() returns the fitted estimator itself, so the trees
# can be collected directly in a comprehension.
trained_trees = [
    DecisionTreeClassifier(max_leaf_nodes=20, random_state=42).fit(subset_X, subset_y)
    for subset_X, subset_y in subsets
]

print("Number of Trained Decision Trees:", len(trained_trees))


Number of Trained Decision Trees: 1000


### Now we evaluate them on the test set

In [8]:
from sklearn.metrics import accuracy_score

# Accuracy of every individual tree on the full test set, in the same order
# as trained_trees.
test_set_accuracies = [
    accuracy_score(y_test, tree_classifier.predict(X_test))
    for tree_classifier in trained_trees
]

# Arithmetic mean of the per-tree accuracies.
average_accuracy = sum(test_set_accuracies) / len(test_set_accuracies)
print("Average Test Set Accuracy:", average_accuracy)


Average Test Set Accuracy: 0.7914484400000008


### Now for each test set instance, we generate the predictions of the 1,000 Decision Trees, and we keep only the most frequent prediction

In [None]:
import numpy as np
from scipy.stats import mode

# Majority vote of the ensemble for every test instance.
#
# Each tree predicts the whole test set in a single call, which gives a
# matrix with one row per tree and one column per test instance. This
# replaces one predict() call per (instance, tree) pair, which is what made
# the original cell impractically slow.
all_tree_predictions = np.array([tree.predict(X_test) for tree in trained_trees])

# The most frequent label in each column is the ensemble's prediction.
# np.ravel() makes the result 1-D whether mode() keeps the reduced axis
# (older SciPy) or drops it (newer SciPy, where indexing `.mode[0]` on a
# 1-D input fails because the mode is a scalar).
majority_vote_predictions = np.ravel(mode(all_tree_predictions, axis=0).mode)

print("Shape of Majority-Vote Predictions:", majority_vote_predictions.shape)
