In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np

# Fetch the 'mnist_784' dataset (version 1) from OpenML; as_frame=False asks
# for array output instead of pandas objects
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)
# Features are taken as delivered; the targets are converted to integers
X, y = mnist.data, mnist.target.astype(int)

# Partition the data into contiguous subsets so that every neighbour search
# is restricted to the samples of a single subset
num_partitions = 5
X_partitions = np.array_split(X, num_partitions)
y_partitions = np.array_split(y, num_partitions)

# Number of nearest neighbours averaged into each centroid. It is the same
# for every partition, so it is defined once rather than on every iteration.
n_neighbors = 5

# Initialize list to store the centroid array of each partition
centroids = []

# Loop through each partition
for X_partition, y_partition in zip(X_partitions, y_partitions):
    # The classifier is used purely as a neighbour index here: the labels are
    # required by fit() but play no part in the neighbour lookup below
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_partition, y_partition)

    # Only the neighbour indices are needed, so the distance matrix is not
    # requested. NOTE(review): the query set is the fitted data itself, so a
    # sample is generally returned among its own neighbours - confirm that
    # including the sample in its own centroid is intended.
    indices = knn_model.kneighbors(X_partition, return_distance=False)

    # X_partition[indices] has shape (n_samples, n_neighbors, n_features);
    # averaging over axis 1 yields one centroid row per sample
    centroid = np.mean(X_partition[indices], axis=1)

    # Append the centroids of this partition to the list
    centroids.append(centroid)

# Stack the centroids to form dataset R. Partitions are stacked in their
# original order with one row per sample, so R stays row-aligned with y.
R = np.vstack(centroids)

# Single seed shared by the split and every randomised estimator below, so
# that the selected hyperparameters and reported accuracies are reproducible
# from run to run (previously only the split was seeded)
RANDOM_STATE = 42

# Split the data into train and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    R, y, test_size=0.2, random_state=RANDOM_STATE
)

# Initialize classifiers
# NOTE(review): knn_model and gb_model are not used in the code visible here;
# they are kept in case a later part of the file relies on them.
knn_model = KNeighborsClassifier()
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Define hyperparameters to tune for Decision Tree classifier
dt_param_grid = {
    'max_depth': [None, 50, 10, 30],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [5, 10, 15]
}

# Perform Grid Search with 5-fold Cross-Validation for Decision Tree,
# selecting by accuracy and using all available cores
dt_grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
dt_grid_search.fit(X_train, y_train)
best_dt_model = dt_grid_search.best_estimator_

# Define hyperparameters to tune for Random Forest classifier
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with 5-fold Cross-Validation for Random Forest.
# This grid has 3 * 3 * 3 * 3 = 81 combinations, i.e. 405 fits over 5 folds.
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# Score both tuned models on the held-out test split (decision tree first,
# then random forest)
accuracy_dt, accuracy_rf = (
    model.score(X_test, y_test) for model in (best_dt_model, best_rf_model)
)

# Report the test-set score of each model
for label, accuracy in (
    ("Decision Tree Model Accuracy:", accuracy_dt),
    ("Random Forest Model Accuracy:", accuracy_rf),
):
    print(label, accuracy)


  warn(
