In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [None]:
# Download the MNIST handwritten-digit dataset from OpenML and unpack it
# into a feature array X and a label array y (arrays, not a DataFrame).
X, y = fetch_openml(
    name="mnist_784",
    version=1,
    as_frame=False,
    return_X_y=True,
)

In [None]:
# Baseline: partition the raw pixel vectors into 10 clusters with KMeans.
# Ten restarts are run with a fixed seed; verbose=1 logs the progress.
kmeans = KMeans(
    n_clusters=10,
    n_init=10,
    random_state=42,
    verbose=1,
)
kmeans.fit(X)
# Cluster index assigned to each training sample by the fitted model.
clusters = kmeans.labels_

In [None]:
# Compare the KMeans partition with the true digit labels using the
# adjusted Rand index and report it.
score = adjusted_rand_score(labels_true=y, labels_pred=clusters)
print("KMeans Adjusted Rand Score:", score)


Improved: Gaussian Mixture Model (GMM)

In [None]:
from sklearn.mixture import GaussianMixture
# Fit a 10-component Gaussian mixture (one full covariance matrix per
# component, fixed seed) and assign every sample to a component.
# NOTE(review): full covariances on the raw pixel features are likely to be
# slow to estimate — consider reducing dimensionality first if this cell
# becomes a bottleneck.
gmm = GaussianMixture(
    n_components=10,
    covariance_type='full',
    random_state=42,
)
gmm_labels = gmm.fit_predict(X)

In [None]:
# Compare the mixture-model assignments with the true digit labels using
# the adjusted Rand index and report it.
score_gmm = adjusted_rand_score(labels_true=y, labels_pred=gmm_labels)
print("Gaussian Mixture Model Adjusted Rand Score:", score_gmm)

Next: Spectral Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering

In [None]:
# Standardize every feature column of X (fit the scaler on X, then apply it
# to the same data) ahead of spectral clustering.
X_scaled = StandardScaler().fit(X).transform(X)

In [None]:
# Spectral clustering is too slow on the whole dataset, so keep only the
# first `subset` scaled samples together with their labels.
subset = 2000
X_small = X_scaled[:subset]
y_small = y[:subset]


In [None]:
# Spectral clustering into 10 groups on a 10-nearest-neighbour affinity
# graph, with a fixed seed.
sc = SpectralClustering(
    n_clusters=10,
    affinity='nearest_neighbors',
    n_neighbors=10,
    random_state=42,
)
sc.fit(X_small)
# Cluster index assigned to each sample of the subset by the fitted model.
sc_labels = sc.labels_

In [None]:
# Compare the spectral-clustering assignments with the labels of the same
# subset using the adjusted Rand index and report it.
score_sc = adjusted_rand_score(labels_true=y_small, labels_pred=sc_labels)
print("Spectral Clustering Adjusted Rand Score:", score_sc)

Advanced: Deep Learning with Autoencoder + Clustering

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Keep only the first 10,000 images to avoid memory issues and divide the
# pixel values by 255 (presumably mapping 0-255 intensities into [0, 1]).
X_train = np.true_divide(X[:10000], 255.0)

In [None]:
# Sizes for the deep autoencoder: the width of the bottleneck layer and the
# number of features per training sample.
encoding_dim = 64
input_dim = X_train.shape[-1]

In [None]:
# Seed Python, NumPy and TensorFlow before any layer is created so that the
# weight initialisation is repeatable across "Restart & Run All" (up to
# hardware-level nondeterminism). This matches the random_state=42 used by
# the other models in this notebook.
tf.keras.utils.set_random_seed(42)

# Symmetric dense autoencoder:
#   input_dim -> 512 -> 256 -> encoding_dim -> 256 -> 512 -> input_dim
# The first three Dense layers form the encoder and the last three the
# decoder. The sigmoid output keeps reconstructions in (0, 1).
autoencoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(encoding_dim, activation='relu'),  # bottleneck / code layer
    layers.Dense(256, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

In [None]:
# Train the network to reproduce its own input: Adam optimizer with a
# mean-squared-error reconstruction loss, 10 epochs in batches of 256.
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=256,
    epochs=10,
    verbose=2,
)


In [None]:

# Build a stand-alone encoder from the first three entries of
# `autoencoder.layers` and use it to project the training images.
# NOTE(review): this presumably selects the Dense layers up to and including
# the bottleneck, which relies on `autoencoder.layers` not listing the Input
# placeholder — confirm for the installed Keras version.
encoder = models.Sequential(layers=autoencoder.layers[:3])
X_encoded = encoder.predict(x=X_train)

In [None]:
# Cluster the encoded representation into 10 groups with KMeans
# (ten restarts, fixed seed).
kmeans_ae = KMeans(
    n_clusters=10,
    n_init=10,
    random_state=42,
)
kmeans_ae.fit(X_encoded)
# Cluster index assigned to each encoded sample by the fitted model.
clusters_ae = kmeans_ae.labels_

In [None]:
# Evaluate the autoencoder + KMeans clustering against the true labels.
# The labels are sliced by len(clusters_ae) rather than a repeated literal,
# so they stay aligned with the leading subset that was actually clustered
# even if the training subset size is changed upstream.
score_ae = adjusted_rand_score(y[:len(clusters_ae)], clusters_ae)
print("Autoencoder + KMeans Adjusted Rand Score:", score_ae)