In [None]:
import numpy as np
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from joblib import parallel_backend

# Transformation pipeline: convert images to PyTorch tensors, then normalize
# them with mean 0.1307 and standard deviation 0.3081.
# NOTE: torchvision applies `transform` only when samples are fetched through
# the dataset's __getitem__ (e.g. by indexing or a DataLoader). The `.data`
# tensors read below bypass it, so the extracted features are the raw,
# unnormalized pixel values.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download QMNIST dataset (cached under ./data after the first run)
train_dataset = datasets.QMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.QMNIST(root='./data', train=False, download=True, transform=transform)


def _class_labels(dataset):
    """Return the digit class labels of *dataset* as a 1-D NumPy array.

    QMNIST keeps an extended label record per sample in ``targets`` (a 2-D
    tensor whose first column is the digit class), whereas MNIST-style
    datasets keep a 1-D tensor. Passing the 2-D form on as ``y`` would make
    scikit-learn treat the task as multi-output, so only column 0 is kept.
    """
    targets = dataset.targets
    if targets.ndim > 1:
        targets = targets[:, 0]
    return targets.numpy()


# Extract features and labels from the dataset
X_train = train_dataset.data.numpy()
y_train = _class_labels(train_dataset)
X_test = test_dataset.data.numpy()
y_test = _class_labels(test_dataset)

# Cut the training set into order-preserving chunks so that each
# nearest-neighbour search only runs within one chunk.
num_partitions = 5
X_partitions = np.array_split(X_train, num_partitions)
y_partitions = np.array_split(y_train, num_partitions)

# One array of per-sample neighbourhood centroids is collected per chunk
centroids = []

for X_partition, y_partition in zip(X_partitions, y_partitions):
    # Flatten every image to a vector once; the same view is used for both
    # fitting the model and querying it.
    flat_partition = X_partition.reshape(len(X_partition), -1)

    # The classifier is used only as a neighbour index over this chunk
    knn_model = KNeighborsClassifier(n_neighbors=5).fit(flat_partition, y_partition)
    distances, indices = knn_model.kneighbors(flat_partition)

    # `indices` holds 5 neighbour positions per sample: gather those images
    # and average across the neighbour axis to get one centroid per sample.
    centroid = X_partition[indices].mean(axis=1)
    centroids.append(centroid)

# Concatenate the per-chunk centroids, in chunk order, to form dataset R
R = np.vstack(centroids)

# Hold out 20% of the centroid dataset R for evaluation. Rows of R are in the
# same order as y_train, so the original labels are reused for the centroids.
# NOTE(review): each centroid averages neighbours chosen before this split, so
# train and test rows can be built from the same underlying images; the
# reported accuracy may be optimistic -- confirm this is acceptable.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(R, y_train, test_size=0.2, random_state=42)

# Flatten each sample to a feature vector once instead of at every call site
X_train_r_flat = X_train_r.reshape(len(X_train_r), -1)
X_test_r_flat = X_test_r.reshape(len(X_test_r), -1)

# Initialize Decision Tree classifier for dataset R. The tree is seeded so the
# whole experiment is reproducible, matching the seeded split above (an
# unseeded tree permutes features randomly when searching for splits).
dt_model_r = DecisionTreeClassifier(random_state=42)

# Define hyperparameters to tune for Decision Tree classifier
param_grid_r = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with 5-fold Cross-Validation for dataset R, running the
# candidate fits on all cores through joblib's threading backend
with parallel_backend('threading'):
    grid_search_r = GridSearchCV(estimator=dt_model_r, param_grid=param_grid_r, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search_r.fit(X_train_r_flat, y_train_r)

# Get the best model (refit on the full training split by GridSearchCV)
best_dt_model_r = grid_search_r.best_estimator_

# Evaluate the best model on the held-out split of dataset R
accuracy_r = best_dt_model_r.score(X_test_r_flat, y_test_r)
print("Model Accuracy (Decision Tree with centroids):", accuracy_r)
