## K-Means clustering on the handwritten digits data

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import kmeans_plusplus, KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.datasets import load_digits

from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import silhouette_score

from sklearn.inspection import DecisionBoundaryDisplay


### Digits data

In [2]:
# Load the digits samples together with their ground-truth class labels.
data, labels = load_digits(return_X_y=True)

# The two axes of `data` are unpacked as (samples, features).
n_samples, n_features = data.shape
# One cluster per distinct label value is used throughout the notebook.
n_digits = np.unique(labels).size

print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

# digits: 10; # samples: 1797; # features 64


### Pipeline


#### Quantifying the quality of clustering

Clustering algorithms are fundamentally unsupervised learning methods. However, since we happen to have class labels for this specific dataset, it is possible to use evaluation metrics that leverage this “supervised” ground truth information to quantify the quality of the resulting clusters. Examples of such metrics are the following:

* `homogeneity` --> which quantifies how much clusters contain only members of a single class;

* `completeness` --> which quantifies how much members of a given class are assigned to the same clusters;

* `V-measure` --> the harmonic mean of completeness and homogeneity;

* `Rand-Index` --> which measures how frequently pairs of data points are grouped consistently according to the result of the clustering algorithm and the ground truth class assignment;

* `Adjusted Rand-Index` --> a chance-adjusted Rand-Index such that random cluster assignments have an ARI of 0.0 in expectation.

* If the ground truth labels are not known, evaluation can only be performed using the model's own results. In that case, the `Silhouette Coefficient` comes in handy.

In [3]:
def clustering_pipe(kmeans, data, labels):
    """Build a StandardScaler -> kmeans pipeline and fit it.

    Parameters
    ----------
    kmeans : estimator placed as the final pipeline step.
    data : samples the pipeline is fitted on.
    labels : forwarded to ``Pipeline.fit`` as the ``y`` argument.

    Returns
    -------
    The fitted pipeline.
    """
    pipeline = make_pipeline(StandardScaler(), kmeans)
    pipeline.fit(data, labels)
    return pipeline

def evaluate_pipe(pipe, data, labels, random_state=42):
    """Score the cluster assignments produced by a fitted pipeline.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict``.
    data : samples passed to ``pipe.predict`` and to ``silhouette_score``.
    labels : ground-truth class labels used by the supervised metrics.
    random_state : int or None, default=42
        Seed for the 300-sample subsample drawn by ``silhouette_score``.
        Without a fixed seed the subsample (and therefore the reported
        silhouette) changes from one run to the next.

    Returns
    -------
    dict
        Metric name -> score, in the order: homogeneity, completeness,
        V-measure, adjusted Rand, adjusted mutual information, silhouette.
    """
    y_preds = pipe.predict(data)
    return {
        "homogeneity_score": homogeneity_score(labels, y_preds),
        "completeness_score": completeness_score(labels, y_preds),
        "v_measure_score": v_measure_score(labels, y_preds),
        "adjusted_rand_score": adjusted_rand_score(labels, y_preds),
        "adjusted_mutual_info_score": adjusted_mutual_info_score(labels, y_preds),
        # NOTE(review): the silhouette is computed on `data` exactly as passed
        # in, not on any transformed version produced inside `pipe`.
        "silhouette_score": silhouette_score(
            data, y_preds, sample_size=300, random_state=random_state
        ),
    }

def get_metrics(kmeans, data, labels):
    """Fit a clustering pipeline around ``kmeans`` and return its metrics.

    Chains ``clustering_pipe`` (fit) and ``evaluate_pipe`` (score) on the
    same ``data`` and ``labels``.
    """
    fitted_pipe = clustering_pipe(kmeans, data, labels)
    return evaluate_pipe(fitted_pipe, data, labels)


### Initialization using kmeans++

In [22]:
# Baseline run: centroids seeded with the 'k-means++' strategy.
kmeans = KMeans(
    n_clusters=n_digits,
    init="k-means++",
    n_init="auto",
    random_state=42,
)
score = get_metrics(kmeans, data, labels)
score

{'homogeneity_score': 0.6721166793340757,
 'completeness_score': 0.7121728331760259,
 'v_measure_score': 0.6915652187211354,
 'adjusted_rand_score': 0.5605454087153037,
 'adjusted_mutual_info_score': 0.6883549507488198,
 'silhouette_score': 0.17881389351435184}

### Initialization using random

In [23]:
# Same setup as the k-means++ run, but with init='random'.
kmeans = KMeans(
    n_clusters=n_digits,
    init="random",
    n_init="auto",
    random_state=42,
)
score = get_metrics(kmeans, data, labels)
score

{'homogeneity_score': 0.6051268981426549,
 'completeness_score': 0.6522560665144466,
 'v_measure_score': 0.6278082357068572,
 'adjusted_rand_score': 0.47514610892960074,
 'adjusted_mutual_info_score': 0.6238985039446038,
 'silhouette_score': 0.16625854318074065}

### Initialization based on a PCA projection
* We will use the components of the PCA to initialize KMeans.
* This method is deterministic, so a single initialization suffices.


In [28]:
# Fit a PCA with one component per digit class; its component vectors are
# handed to KMeans as the explicit initial centroids, so n_init is 1.
pca = PCA(n_components=n_digits).fit(data)
# NOTE(review): the PCA is fitted on the raw `data`, while clustering_pipe
# standardizes the data before it reaches KMeans - confirm this is intended.
kmeans = KMeans(
    n_clusters=n_digits,
    init=pca.components_,
    n_init=1,
    random_state=42,
)
score = get_metrics(kmeans, data, labels)
score

{'homogeneity_score': 0.6362365471308777,
 'completeness_score': 0.6581481429829975,
 'v_measure_score': 0.6470068831782342,
 'adjusted_rand_score': 0.520613805758503,
 'adjusted_mutual_info_score': 0.6434066473153481,
 'silhouette_score': 0.14568312706080327}

### Visualize the results on PCA-reduced data
* PCA allows us to project the data from the original 64-dimensional space into a lower-dimensional space.
* We can use PCA to project into a 2-dimensional space and plot the data and the clusters in this new space.

In [5]:
# Project the digits onto their first two principal components.
pca = PCA(n_components=2).fit(data)
reduced_data = pca.transform(data)

# Plot every projected sample as an unconnected dot.
plt.plot(reduced_data[:, 0], reduced_data[:, 1], marker='.', linestyle='')
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
# The trailing semicolon keeps the matplotlib object repr out of the cell output.
plt.title("Digits data projected onto two principal components");

<img src='./plots/dimensionality-reduction.png'>

* Use `kmeans` to cluster the data

In [6]:
# Cluster the 2-D PCA projection instead of the original feature space.
kmeans = KMeans(
    n_clusters=n_digits,
    n_init="auto",
    random_state=42,
)
# One cluster index per row of `reduced_data`.
y_preds = kmeans.fit_predict(reduced_data)

In [116]:
# Scale the cluster indices by the number of clusters so that the colormap
# receives float positions rather than raw integer indices.
cluster_fraction = y_preds.astype('float') / n_digits
colors = plt.cm.nipy_spectral(cluster_fraction)
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=colors);

<img src='./plots/kmeans-clustering-and-pca.png'>

### Plot the decision boundary

In [7]:
# Bounding box of the projected data, padded by one unit on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1

# Regular grid over that box; `mesh_step` is the spacing between grid points.
mesh_step = 0.02
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, mesh_step),
    np.arange(y_min, y_max, mesh_step),
)

# Assign every grid point to a cluster, then restore the grid's 2-D shape.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
z = kmeans.predict(grid_points).reshape(xx.shape)

In [37]:
# Paint the per-grid-point cluster assignments as a background image that
# spans the same coordinate range as the mesh.
mesh_extent = (xx.min(), xx.max(), yy.min(), yy.max())
plt.imshow(
    z,
    extent=mesh_extent,
    origin="lower",
    aspect="auto",
    interpolation="nearest",
    cmap=plt.cm.Paired,
)

# Overlay the projected samples as small, semi-transparent black dots.
plt.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    c='k',
    edgecolors='k',
    alpha=0.5,
    s=4,
);

# Mark each cluster centre with a white cross drawn above everything else.
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    color="w",
    s=169,
    linewidths=3,
    zorder=10,
);

<img src='./plots/plot-decision-boundary-for-kmeans-cluster.png'>

In [115]:
# Same picture as the imshow version, drawn through DecisionBoundaryDisplay.
# The object is deliberately not called `display`: that name would shadow
# IPython's built-in display() helper for the rest of the notebook session.
boundary_display = DecisionBoundaryDisplay(xx0=xx, xx1=yy, response=z)
boundary_display.plot()

# Overlay the projected samples as semi-transparent black dots.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], marker='.', c='k', alpha=0.5);

# Mark each cluster centre with a white cross drawn above everything else.
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
);


<img src='./plots/decision_boundary.png'>