In [106]:
#Importing the required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split,GridSearchCV,ShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score

In [76]:
#Loading the dataset
moons = make_moons(n_samples=1000,noise=0.4)

In [78]:
X = moons[0]

In [80]:
y = moons[1]

In [82]:
#splitting the moons dataset into training set and a test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [84]:
tree_clf = DecisionTreeClassifier(random_state=42)

In [92]:
#Defining a hyperparameter grid so that we can find the best hyperparameter for the DecisionTreeClassifier
#max_leaf_nodes and max_depth are our original hyperparameter we're trying to tune
#I later added the rest to try and improve our model's performance
param_grid = {
    'max_leaf_nodes':[2,3,4,5,6,7,8,9,10,15,20,30,40,50],
    'max_depth':[2,3,4,5,6,7,8,9,10],
    'min_samples_split':[2,3,4,5],
    'min_samples_leaf':[1,2,3,4,5],
    'criterion':['gini','entropy']
}


In [94]:
#Creating a GridSearchCV object
grid_search = GridSearchCV(estimator=tree_clf,param_grid=param_grid,cv=5,scoring='accuracy')


In [96]:
#Fit the grid search to the train data
grid_search.fit(X_train,y_train)

In [100]:
#Getting the best model
best_tree_clf = grid_search.best_estimator_
best_tree_clf

In [102]:
#Making predictions on the test set
y_pred = best_tree_clf.predict(X_test)
y_pred

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1], dtype=int64)

In [104]:
#Evaluating the model's performance(accuracy)
accuracy = accuracy_score(y_test,y_pred)
print(f"Decision Tree Accuracy with Best Hyperparameters:{accuracy}")

Decision Tree Accuracy with Best Hyperparameters:0.845


In [110]:
#Creating a ShuffleSplit object
shuf_split = ShuffleSplit(n_splits=1000,train_size=100,random_state=42)


In [163]:
#Generate the subsets
tree_classifiers = []
for train_index, _ in shuf_split.split(X_train,y_train):
    X_subset = X_train[train_index]
    y_subset = y_train[train_index]
    subsets.append((X_subset,y_subset))



In [165]:
#Create and train a decision tree classifiers on the subsets
dt_classifier = DecisionTreeClassifier(max_depth=7, max_leaf_nodes=10, min_samples_leaf=4,
                       random_state=42)
dt_classifier.fit(X_subset,y_subset)
tree_classifiers.append(dt_classifier)

In [167]:
#Evaluating the decision trees on the test set
accuracies = []
for tree in tree_classifiers:
    y_pred = tree.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    accuracies.append(accuracy)
    

In [169]:
#Calculating and printing the average accuracies
average_accuracy = np.mean(accuracies)
print(f"Average accuracy of 1000 Decision Trees:{average_accuracy}")

Average accuracy of 1000 Decision Trees:0.825


In [171]:
#Generating predictions for each tree
predictions = []
for tree in tree_classifiers:
    y_pred = tree.predict(X_test)
    predictions.append(y_pred)


In [179]:
final_predictions = []
for i in range(len(X_test)):
    # Gather all predictions for the i-th instance
    instance_pred = [prediction[i] for prediction in predictions]
    
    # Find the most frequent prediction
    most_frequent_prediction = max(set(instance_pred), key=instance_pred.count)
    
    # Append the most frequent prediction to the final list
    final_predictions.append(most_frequent_prediction)


In [181]:
accuracy = accuracy_score(y_test,final_predictions)
print(f"Accuracy of the ensemble of 1000 decision trees:"{accuracy})

SyntaxError: invalid syntax. Perhaps you forgot a comma? (751706422.py, line 2)