In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
# Fetch the MNIST (`mnist_784`) dataset from OpenML as plain NumPy arrays
# (as_frame=False) instead of a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
# Cast the labels to uint8 and store them back on the bunch so that every
# later access sees the numeric version.
mnist["target"] = mnist["target"].astype(np.uint8)
# X: feature matrix, y: numeric labels.
X, y = mnist.data, mnist.target

In [2]:
from sklearn.cluster import KMeans

# One KMeans estimator per candidate cluster count (8 through 12); the fixed
# random_state keeps the initialisation reproducible between runs.
kmeans = [KMeans(n_clusters=n_clusters, random_state=42) for n_clusters in range(8, 13)]
# Fit every estimator on X and keep the per-sample cluster assignments,
# ordered like `kmeans`.
predictions = [model.fit_predict(X) for model in kmeans]



In [3]:
from sklearn.metrics import silhouette_score

# Silhouette score of each fitted clustering, in the same order as `kmeans`
# (cluster counts 8, 9, 10, 11, 12).
silhouettes = [silhouette_score(X, model.labels_) for model in kmeans]

In [4]:
import pickle

# Persist the silhouette scores to disk.
with open('kmeans_sil.pkl', 'wb') as out_file:
    pickle.dump(silhouettes, out_file)

In [5]:
from sklearn.metrics import confusion_matrix

# Index 2 of both lists corresponds to the 10-cluster model, because the
# cluster counts start at 8 (8, 9, 10, ...).
kmeans_10, y_pred = kmeans[2], predictions[2]

# Cross-tabulate the true labels (first argument) against the assigned
# cluster ids (second argument).
matrix = confusion_matrix(y_true=y, y_pred=y_pred)
matrix

array([[ 290,    2, 1265,   39,    9,    7,  162,    4,   72, 5053],
       [   8, 4293,    7,    7,   10,   11,    7, 3526,    8,    0],
       [ 323,  423,  246,  216, 4863,   78,  147,  436,  201,   57],
       [4581,  449,  461,  193,  216,   45,   31,   58, 1083,   24],
       [   0,  178,  288, 3728,   29, 2173,  168,  234,   17,    9],
       [2129,  155, 1812,  432,    7,  215,   67,  280, 1156,   60],
       [  38,  190, 2068,   67,   53,    4, 4326,   45,   14,   71],
       [   6,  372,   12, 2094,   53, 4399,    4,  314,   18,   21],
       [1212,  335,  292,  208,   53,  193,   51,  330, 4115,   36],
       [  87,  261,   31, 3462,   19, 2849,   16,   95,   87,   51]])

In [6]:
# For every row of the confusion matrix pick the column (cluster id) holding
# the largest count, then keep each cluster id once.
# np.unique returns the distinct values already sorted, so the result no
# longer depends on set iteration order (which is unspecified), and .tolist()
# yields plain Python ints so the pickled list can be read without NumPy.
max_index = np.unique(np.argmax(matrix, axis=1)).tolist()

In [7]:
# Persist the list of dominant cluster indices to disk.
with open('kmeans_argmax.pkl', 'wb') as out_file:
    pickle.dump(max_index, out_file)

In [8]:
# Quick look at the feature matrix, followed by its number of rows (samples).
print(X)
len(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


70000

In [13]:
# Euclidean distance from each of the first 300 samples to every sample in X.
# Each reference row is handled in one vectorised pass instead of calling
# np.linalg.norm once per (i, j) pair in a nested Python loop.
distances = np.empty((300, X.shape[0]))
for i in range(300):
    # Subtract in float64 so that an unsigned-integer X could never wrap
    # around on negative differences.
    diff = X - X[i].astype(np.float64)
    row = np.linalg.norm(diff, axis=1)
    # Zero distances (the sample itself, or an exact duplicate of it) are
    # replaced with inf so they never rank among the smallest distances.
    distances[i] = np.where(row == 0, np.inf, row)

In [19]:
# Ten smallest distances in ascending order.
# np.sort with axis=None sorts the flattened array natively, instead of
# boxing every element into a Python object for the built-in sorted();
# .tolist() converts the result to plain Python floats so the list that gets
# pickled later does not depend on NumPy scalar types.
smallest_distances = np.sort(distances, axis=None)[:10].tolist()
print(smallest_distances)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [20]:
# Persist the ten smallest distances to disk.
with open('dist.pkl', 'wb') as out_file:
    pickle.dump(smallest_distances, out_file)

In [26]:
# Average of the three smallest distances; used below as the starting
# eps value for DBSCAN.
smallest_3 = smallest_distances[:3]
s = np.asarray(smallest_3).mean()

In [31]:
from sklearn.cluster import DBSCAN
# Sweep DBSCAN's eps from s upwards in increments of 4% of s, stopping before
# s + 10% of s, and record how many distinct labels each run produces.
dbscan_len = []
eps_limit = s + 0.10 * s
eps_increment = 0.04 * s
step = s
while step < eps_limit:
    dbscan = DBSCAN(eps=step).fit(X)
    # Count every distinct value in labels_ (whatever label DBSCAN assigns
    # to noise points is counted as well).
    dbscan_len.append(len(np.unique(dbscan.labels_)))
    step += eps_increment

iteration
iteration
iteration


In [33]:
# Persist the per-eps label counts to disk.
with open('dbscan_len.pkl', 'wb') as out_file:
    pickle.dump(dbscan_len, out_file)