# Supervised Learning with MNIST Data Set

In [7]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

# Cluster MNIST digits with k-means, then use the clusters as a classifier by
# labelling each cluster with the digit that occurs most often inside it.

# Number of digit classes; also used as the number of k-means clusters.
N_DIGITS = 10

# Load MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist['data'], mnist['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform k-means clustering on the training set.
# n_init is pinned so the result does not depend on the library default,
# which differs between scikit-learn releases.
kmeans = KMeans(n_clusters=N_DIGITS, n_init=10, random_state=42)
X_train_clusters = kmeans.fit_predict(X_train)

# Assign cluster labels to each data point in the test set
X_test_clusters = kmeans.predict(X_test)

# Convert the training labels to a plain integer array once. When the dataset
# is loaded as pandas objects, y_train keeps the shuffled original index after
# the split, so indexing it with the positional indices returned by np.where
# would perform a label lookup (KeyError or wrong rows). A NumPy array is
# always indexed by position.
y_train_int = np.asarray(y_train).astype(int)

# Find the most frequent true label for each cluster in the training set.
# minlength keeps the count vector non-empty even if a cluster has no
# training members, so argmax cannot fail on an empty array.
cluster_labels = [
    np.argmax(np.bincount(y_train_int[X_train_clusters == cluster], minlength=N_DIGITS))
    for cluster in range(kmeans.n_clusters)
]

# Predict the labels for the test set using the most frequent label for each cluster
y_pred = [cluster_labels[cluster] for cluster in X_test_clusters]

# Calculate accuracy
accuracy = accuracy_score(np.asarray(y_test).astype(int), y_pred)
print("Accuracy with k-means stratification:", accuracy)


  warn(
