In [None]:
from tensorflow.keras.datasets import mnist
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the MNIST training split; the test split is not used in this notebook.
(x_train, y_train), (_, _) = mnist.load_data()

# Work on a subset only, to keep the clustering cells fast.
N_SAMPLES = 10000
x = x_train[:N_SAMPLES]
y = y_train[:N_SAMPLES]

# Flatten each image to a 1-D feature vector, then standardize every feature.
# The cast to float is made explicit instead of relying on the scaler's
# implicit conversion of the integer pixel array.
x = x.reshape((x.shape[0], -1)).astype(np.float64)
x = StandardScaler().fit_transform(x)

# Reduce to 50 dimensions before clustering.
# random_state is fixed so the projection is reproducible when scikit-learn
# selects a randomized SVD solver for this data shape; it matches the seed
# used by the K-Means cell.
pca = PCA(n_components=50, random_state=0)
x_pca = pca.fit_transform(x)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Cluster the PCA-reduced data into 10 groups (one per digit class).
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (and triggers a FutureWarning on some), so leaving it implicit
# makes the reported score depend on the installed version.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
clusters = kmeans.fit_predict(x_pca)

# Evaluation: Adjusted Rand Index against the true digit labels.
# Values closer to 1 mean better agreement; values near 0 mean chance level.
ari_kmeans = adjusted_rand_score(y, clusters)
print(f"K-Means ARI Score: {ari_kmeans:.4f}")


In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the same PCA-reduced features.
dbscan = DBSCAN(eps=7, min_samples=5)
db_clusters = dbscan.fit_predict(x_pca)

# DBSCAN labels noise points as -1; exclude them from the ARI computation.
mask = db_clusters != -1

# Report how much of the data the score actually covers. An ARI computed on
# a small non-noise subset is not comparable to the K-Means score, which is
# computed on every sample.
n_db_clusters = len(set(db_clusters[mask].tolist()))
noise_ratio = 1.0 - mask.mean()
print(f"DBSCAN clusters: {n_db_clusters}, noise ratio: {noise_ratio:.2%}")

# Guard the degenerate case where every point is noise: scoring two empty
# label arrays is meaningless (and may yield a misleading perfect score or
# an error depending on the scikit-learn version), so report NaN instead.
if mask.any():
    ari_dbscan = adjusted_rand_score(y[mask], db_clusters[mask])
else:
    ari_dbscan = float("nan")
print(f"DBSCAN ARI Score: {ari_dbscan:.4f}")


In [None]:
import matplotlib.pyplot as plt

# 2-D view for plotting. PCA components are ordered by explained variance, so
# the first two columns of the existing 50-component projection are the same
# axes a fresh 2-component fit would find (up to solver tolerance). Reusing
# them avoids a second, unseeded fit on the full-dimensional data and shows
# the clusters in the space they were actually computed in.
pca_2d = x_pca[:, :2]

fig, (ax_km, ax_db) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: K-Means assignments.
ax_km.scatter(pca_2d[:, 0], pca_2d[:, 1], c=clusters, cmap='tab10', s=2)
ax_km.set_title('KMeans Clustering')

# Right panel: DBSCAN assignments. Noise points (label -1) are drawn in grey
# so they are not mistaken for a regular cluster colour.
noise = db_clusters == -1
if noise.any():
    ax_db.scatter(pca_2d[noise, 0], pca_2d[noise, 1],
                  c='lightgray', s=2, label='noise')
    ax_db.legend(markerscale=5)
if (~noise).any():
    ax_db.scatter(pca_2d[~noise, 0], pca_2d[~noise, 1],
                  c=db_clusters[~noise], cmap='tab10', s=2)
ax_db.set_title('DBSCAN Clustering')

# Label the axes so each panel can be read on its own.
for ax in (ax_km, ax_db):
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')

fig.tight_layout()
plt.show()
