# kMeans Clustering

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.simplefilter("ignore")
import data_gen

ModuleNotFoundError: No module named 'sklearn'

## Visualize the data

In [None]:
# Build the six toy datasets from the project's data_gen helpers. Each one is
# indexed below as a 2-D array whose first two columns are the x/y coordinates.
# NOTE(review): the four positional arguments to four_blobs are presumably
# per-blob sample counts -- confirm against data_gen.
blob_data = data_gen.four_blobs()
uneven_blobs = data_gen.four_blobs(100, 100, 400, 400)
mouse_data = data_gen.mouse_shape()
moons_data = data_gen.two_moons()
circle_data = data_gen.circle()
noise_data = data_gen.noise()

# Order matters: later cells pair this list position-by-position with
# per-dataset settings.
plot_data = [
    blob_data,
    uneven_blobs,
    mouse_data,
    moons_data,
    circle_data,
    noise_data,
]

# One panel per dataset on a 2x3 grid with shared axes, so the datasets can be
# compared at the same scale.
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(16, 9))
for ax, data in zip(axes.flatten(), plot_data):
    xs = data[:, 0]
    ys = data[:, 1]
    ax.scatter(xs, ys, color="#266662", marker=".")
fig.savefig("cluster_plots.png", dpi=200, bbox_inches="tight")

## kMeans

To visualize how kMeans works, each iteration is plotted individually.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# NOTE(review): init_clusters is defined but never handed to KMeans (init is
# "k-means++" below) -- confirm whether init=init_clusters was intended.
init_clusters = np.array([[0, 0], [1, 1], [0.5, 0.5], [0, 1]])

# Run Lloyd's algorithm for at most three iterations from a single seeded
# initialisation, so the plot shows an intermediate, reproducible state.
kmeans = KMeans(
    n_clusters=4,
    init="k-means++",
    n_init=1,
    max_iter=3,
    algorithm="lloyd",
    random_state=1,
)
assignment = kmeans.fit_predict(blob_data)
centers = kmeans.cluster_centers_

# One colour and one marker per cluster; a later cell reuses both lists.
colors = ["#266662", "#9E5E9B", "#ED5654", "#B68E15"]
markers = ["o", "x", "<", "+"]

plt.figure(figsize=(8, 6))
for cluster, color, marker in zip(range(kmeans.n_clusters), colors, markers):
    # Points assigned to this cluster, drawn semi-transparently.
    members = blob_data[assignment == cluster]
    plt.scatter(members[:, 0], members[:, 1], c=color, marker=marker, alpha=0.5)

    # The cluster centre: same marker, but larger and dark so it stands out.
    plt.scatter(centers[cluster, 0], centers[cluster, 1],
                c="#0D2121", marker=marker, s=160)

# NOTE(review): the file name says "4_it" while max_iter is 3 -- confirm which
# one is intended.
plt.savefig("k_means_4_it.png", dpi=200)
plt.show()


## Evaluate Number of Clusters with Silhouette Analysis

In [None]:
import matplotlib.cm as cm
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.datasets import make_blobs

# Silhouette analysis for a single choice of n_clusters: the left panel shows
# the per-sample silhouette values grouped by cluster, the right panel shows
# the clustered data itself.

# Seed NumPy's global RNG so the drawn number of blob centers is reproducible.
np.random.seed(seed=1)
# Number of blobs to generate. Stored under its own name so that the `centers`
# array produced by the k-means plot is not replaced by a scalar.
n_blob_centers = np.random.randint(low=3, high=10)

data, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=n_blob_centers,
    cluster_std=1.5,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=1,
)

n_clusters = 2
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.set_xlim([-0.1, 1])
# Room for every sample plus a 10-row gap below, between and above the
# per-cluster bands.
ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

kmeans = KMeans(n_clusters=n_clusters, random_state=10)
assignment = kmeans.fit_predict(data)

# Mean silhouette over all samples; drawn as the dashed red line below.
silhouette_avg = silhouette_score(data, assignment)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

# One silhouette value per sample.
sample_silhouette_values = silhouette_samples(data, assignment)
print(sample_silhouette_values.shape)

y_lower = 10
for i in range(n_clusters):

    # Boolean indexing returns a copy, so the in-place sort does not disturb
    # sample_silhouette_values.
    ith_cluster_silhouette_values = sample_silhouette_values[assignment == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Start the next band 10 rows above this one to leave a visible gap.
    y_lower = y_upper + 10

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Per-sample colours, taken from the same colormap as the bands above. They
# are deliberately NOT stored in `colors`: that name holds the four-entry hex
# palette that the final cell indexes by cluster number.
point_colors = cm.nipy_spectral(assignment.astype(float) / n_clusters)
ax2.scatter(
    data[:, 0], data[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=point_colors, edgecolor="k"
)
plt.show()

### kMeans Clustering Characteristics

In [None]:
warnings.simplefilter("ignore")

# Cluster every toy dataset with k-means and draw the result on a shared 2x3
# grid, one panel per dataset.

# The palette and markers are defined here rather than read from the global
# `colors`/`markers`: the silhouette cell rebinds `colors` to a per-sample RGBA
# array, which would silently give this figure the wrong colours. The values
# match the ones used for the single-dataset k-means plot.
cluster_colors = ["#266662", "#9E5E9B", "#ED5654", "#B68E15"]
cluster_markers = ["o", "x", "<", "+"]

fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(16, 9))
# Number of clusters to fit per dataset, in the same order as plot_data.
n_clusters = [4, 4, 3, 2, 2, 2]
for ax, data, k in zip(axes.flatten(), plot_data, n_clusters):
    kmeans = KMeans(n_clusters=k)
    assignment = kmeans.fit_predict(data)
    centers = kmeans.cluster_centers_
    for cluster in range(kmeans.n_clusters):
        color = cluster_colors[cluster]
        marker = cluster_markers[cluster]

        # Members of this cluster, drawn semi-transparently.
        cluster_indices = assignment == cluster
        ax.scatter(data[cluster_indices, 0], data[cluster_indices, 1],
                   c=color, marker=marker, alpha=0.5)

        # The cluster centre: same marker, larger and dark so it stands out.
        ax.scatter(centers[cluster, 0], centers[cluster, 1],
                   c="#0D2121", marker=marker, s=160)

fig.savefig("kmeans_all_datasets.png", dpi=200, bbox_inches="tight")