In [2]:
import os
import sys
# Make the project's local sources (../src, resolved against the current
# working directory) importable. The membership check keeps sys.path free of
# duplicate entries when this cell is re-run.
module_path = os.path.abspath('../src')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
# Fetch version 1 of the `mnist_784` dataset from OpenML; cache=True lets
# scikit-learn reuse a previously downloaded copy.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    cache=True,
)

In [4]:
# Split the fetched bunch into a plain array of pixel features (X) and the
# label column (y). `.values` assumes `mnist.data` is a pandas DataFrame.
X, y = mnist.data.values, mnist.target

In [5]:
# Keep 70% of the samples for training and hold out the rest; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2021,
)

In [None]:
# Preview the first ten samples of X in a 2x5 grid.
fig, axes = plt.subplots(2, 5)
# The loop uses its own name for each panel so the grid returned by
# plt.subplots is not rebound to a single Axes while iterating over it.
for i, digit_ax in enumerate(axes.flatten()):
    # Each row of X is a flattened image; restore the 28x28 layout for display.
    plottable_image = np.reshape(X[i], (28, 28))
    digit_ax.imshow(plottable_image, cmap='gray_r')

# K-Means (scikit-learn)

In [63]:
from sklearn.cluster import KMeans

In [64]:
from sklearn.cluster import KMeans
# Ten clusters, one per digit class. The fixed seed (same value as the
# train/test split) makes centroid initialisation, and therefore the fitted
# clusters, repeatable across kernel restarts.
kmeans = KMeans(n_clusters=10, random_state=2021)

In [65]:
# Cluster the raw 784-pixel training vectors. The call is left as the cell's
# last expression so the fitted estimator is displayed.
kmeans.fit(X_train)

KMeans()

In [66]:
# Inspect the learned centroids (expected layout: one row per cluster, one
# column per pixel feature).
kmeans.cluster_centers_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
from sklearn.decomposition import PCA

In [69]:
# Project every image onto its first two principal components so the clusters
# can be drawn in a plane. PCA is seeded too: for an input this large
# scikit-learn may select the randomized SVD solver, which is stochastic.
reduced_data = PCA(n_components=2, random_state=2021).fit_transform(X)
# Cluster in the 2-D projection. random_state makes the k-means++ seeding
# (and therefore the cluster numbering) repeatable between runs.
kmeans2 = KMeans(init="k-means++", n_clusters=10, n_init=4, random_state=2021)
kmeans2.fit(reduced_data)

KMeans(n_clusters=10, n_init=4)

In [None]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1

# Step size of the mesh. Decrease to increase the quality of the VQ.
# The step is derived from the data extent rather than hard-coded: the
# projected coordinates are not rescaled, so a fixed 0.02 step can produce a
# mesh far too large to hold in memory. The longer axis gets about
# `mesh_points` samples.
mesh_points = 500
h = max(x_max - x_min, y_max - y_min) / mesh_points
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. This has to be the model fitted on the
# 2-D projection (kmeans2); `kmeans` was fitted on 784-feature vectors and
# cannot score two-column input.
Z = kmeans2.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect="auto", origin="lower")

# Overlay every projected sample as a small black dot.
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X (again taken from the 2-D model).
centroids = kmeans2.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
            color="w", zorder=10)
plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
          "Centroids are marked with white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
# The principal-component axes carry no natural units, so hide the ticks.
plt.xticks(())
plt.yticks(())
plt.show()

# K-Means (custom implementation from the local `kmeans` module)

In [4]:
from kmeans import KMeans

In [5]:
# Instantiate the project's own KMeans (imported from the local `kmeans`
# module). Going by the keyword names: ten clusters, at most 100 iterations.
# NOTE: this rebinds `kmeans`, replacing the scikit-learn model created earlier.
kmeans = KMeans(
    k=10,
    max_iterations=100,
)

In [None]:
# Fit the custom model on the full feature matrix (not just the training split).
kmeans.fit(X)

In [177]:
# Ask the custom model for the cluster of sample 20000. The double brackets
# keep the input two-dimensional (a batch containing a single row).
kmeans.predict(X[[20000]])

array([8.])

In [182]:
# Re-import explicitly: `from kmeans import KMeans` earlier in this section
# rebound the name to the project's class, so scikit-learn's estimator has to
# be brought back into scope before it is used here.
from sklearn.cluster import KMeans
# Reference model with the same cluster count and iteration cap as the custom
# one, using purely random initial centroids. Seeding it keeps the fit, and
# the cluster numbering reported below, stable between runs.
kmeans_sk = KMeans(n_clusters=10, max_iter=100, init='random', random_state=2021)
kmeans_sk.fit(X)

KMeans(init='random', max_iter=100, n_clusters=10)

In [183]:
# Author's note "0, 3": presumably the cluster ids observed for this sample on
# different runs — TODO confirm.
# Cluster ids are arbitrary labels, so this value need not equal the id the
# custom model assigns to the same sample.
kmeans_sk.predict(X[[20000]])

array([0], dtype=int32)