# Exercise in Clustering (Solutions)

In the code you will see:

> &nbsp;  
> \# YOUR CODE HERE  
> ...
> <br><br>

Insert your code to complete the exercise.

<br><br>

### Exercise: Find number of iris types in iris dataset

The `Iris` dataset contains 3 different subspecies of flowers:

- Iris setosa
- Iris versicolor
- Iris virginica

Does the dataset clearly reflect this or would an unbiased algorithm find a different number of species?

Let's try cluster analysis to answer this.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

# YOUR CODE HERE

# import KMeans, AffinityPropagation and DBSCAN

# ---

from sklearn.metrics import silhouette_score


In [None]:
# Load the Iris dataset and keep its `data` field as X, the matrix that
# every clustering cell below works on.

iris = load_iris()
X = iris["data"]

In [None]:
# k-Means
#
# Fit one model per candidate k (2 .. max_k) and record two quality
# measures for each: the model's inertia and the silhouette score of its
# labels. The k with the highest silhouette score is reported at the end.
max_k = 5
scores = {}      # k -> silhouette score
inertias = {}    # k -> inertia of the fitted model

for k in range(2, max_k+1):

    # YOUR CODE HERE

    # define and train a classifier km_cl
    
    # ---

    inertias[k] = km_cl.inertia_
    scores[k] = silhouette_score(X, km_cl.labels_, metric='euclidean')

# k with the highest silhouette score; on ties the smallest k wins because
# the dict is filled in ascending order of k
best = max(scores, key=scores.get)
print(f"Silhouette for k-Means suggests {best:0d} clusters")

In [None]:
# visualize scores
#
# Left panel: silhouette score per k, right panel: inertia per k.
# Both panels put the metric on the x-axis and k on the y-axis and mark
# the selected k with a red ring.

# silhouette: columns are (k, score); select the row with the highest score
ss = np.array([list(scores.keys()), list(scores.values())]).T
max_ss = ss[:, 1].max()
best = int(np.argmax(ss[:, 1]))      # first row holding the maximum
best_k, best_ss = ss[best, 0], ss[best, 1]

# inertia: columns are (k, inertia); select the elbow, i.e. the k at which
# the curve bends most (largest second difference), instead of a fixed row
inert = np.array([list(inertias.keys()), list(inertias.values())]).T
if len(inert) >= 3:
    # np.diff(..., n=2)[i] is centred on row i + 1
    elbow = int(np.argmax(np.diff(inert[:, 1], n=2))) + 1
else:
    # too few points for a second difference: fall back to the last row
    elbow = len(inert) - 1
best_k_inert, best_inert = inert[elbow, 0], inert[elbow, 1]

fig, ax = plt.subplots(1, 2, figsize=(10, 4))

ax[0].plot(ss[:, 1], ss[:, 0])
ax[0].scatter(best_ss, best_k, c="white", s=80, edgecolors="red")
ax[0].set_title(f"Best silhouette score: {best_ss:.3f} for k={best_k:.0f}")

ax[1].plot(inert[:, 1], inert[:, 0])
# draw on ax[1] explicitly rather than on pyplot's implicit current axes
ax[1].scatter(best_inert, best_k_inert, c="white", s=80, edgecolors="red")
ax[1].set_title(f"Inertia: total variance changes most at k={best_k_inert:.0f}")

# one tick per tested k on both panels (k is on the y-axis)
for panel in ax:
    panel.set_yticks(list(scores.keys()))
plt.show()


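The `Silhouette score` compares, for each sample, the mean distance to the samples of the nearest neighbouring cluster ($b$) with the mean intra-cluster distance ($a$): $ss = \dfrac{b - a}{\max(a, b)}$. It's a density score: values close to 1 indicate dense, well-separated clusters.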

`Inertia` is the weighted sum of squared distances of samples to their closest cluster center. It measures the variance of the distances within the clusters.

In [None]:
# Report whether the silhouette score and the inertia elbow point to the
# same number of clusters. The middle line is shared by both outcomes;
# only the headline and the conclusion differ.
metrics_agree = best_k == best_k_inert

if metrics_agree:
    headline = "Silhouette score and inertia agree on the cluster size"
    conclusion = f"-> Number of clusters found: {best_k:0.0f}"
else:
    headline = "Silhouette score and inertia do not agree on the cluster size"
    conclusion = f"-> Choose number of clusters between {best_k:0.0f} or {best_k_inert:0.0f}"

print(headline)
print(f"   Silhouette score: {best_k:0.0f}, inertia suggests: {best_k_inert:0.0f} clusters")
print(conclusion)

In [None]:
# Affinity
#
# Exercise cell: fit an affinity-propagation model on X and report how
# many clusters it finds. The visualization cell that follows reads
# af.labels_, af.cluster_centers_ and k_af, so `af` and `k_af` must be
# defined here before the print below can run.

# YOUR CODE HERE

# define an af classifier and return the number of clusters found in k_af

# af =      (the fitted model)
# k_af =    (the number of clusters found)

# cluster_centers_indices = 
# labels = 

# ---

print(f"Affinity propagation finds {k_af} clusters")


In [None]:
# visualize
#
# Plot the first two columns of X, one colour per affinity-propagation
# cluster: the cluster center as a large outlined marker, its members as
# translucent dots, and a thin line from every member to its center.

from itertools import cycle

import seaborn as sns
colors = sns.color_palette("tab10")

# cycle() repeats the palette, so clusters beyond the palette length are
# still drawn instead of being silently dropped by zip()
for c, col in zip(range(k_af), cycle(colors)):
    members = af.labels_ == c
    x = X[:, 0][members]
    y = X[:, 1][members]
    center_x = af.cluster_centers_[c][0]
    center_y = af.cluster_centers_[c][1]
    plt.scatter(center_x, center_y,
                s=80, color=col, edgecolors="#000000")
    plt.scatter(x, y, alpha=0.5, color=col)
    # connect every member to its cluster center
    for member_x, member_y in zip(x, y):
        plt.plot([member_x, center_x],
                 [member_y, center_y],
                 alpha=0.5, color=col, linewidth=1.0)

plt.title(f"Estimated number of clusters: {k_af}")
plt.show()

In [None]:
# DBSCAN
#
# Exercise cell: replace the four placeholder values below. The
# visualization cell that follows reads `labels`, `k_dbs` and `n_noise_`;
# it plots labels 0 .. k_dbs-1 as clusters and label -1 as noise.

# YOUR CODE HERE

dbs = None              # define and train
labels = None           # get labels
                        # (compared against cluster ids and -1 in the plot below)

k_dbs = 0               # calculate clusters
n_noise_ = 0            # return noisy data

# ---

print(f"Estimated number of clusters: {k_dbs}")
print(f"Estimated number of noise points: {n_noise_}")


In [None]:
# visualize
#
# Plot the first two columns of X, one colour per DBSCAN cluster
# (labels 0 .. k_dbs-1); samples labelled -1 are drawn in translucent grey.
from itertools import cycle

import seaborn as sns
colors = sns.color_palette("tab10")

# cycle() repeats the palette, so clusters beyond the palette length are
# still drawn instead of being silently dropped by zip()
for c, col in zip(range(k_dbs), cycle(colors)):
    x = X[:, 0][labels == c]
    y = X[:, 1][labels == c]
    # pass the palette colour explicitly; it was previously unused
    plt.scatter(x, y, s=20, color=col)
plt.scatter(X[:, 0][labels == -1], X[:, 1][labels == -1], s=20, color="#666666", alpha=0.5)
# report DBSCAN's own cluster count (k_dbs), not the leftover loop
# variable `k` from the k-Means cell
plt.title(f"DBSCAN finds k={k_dbs} clusters and {n_noise_} noise points")
plt.show()


In [None]:
# Summary table: number of clusters suggested by each method, assembled
# as a list of rows and written out in a single print call.
rule = "------------+------------+---------+-------------+-------"
table = [
    "\nNumber of clusters found in IRIS dataset\n",
    "            |        k-Means       | Affinity    | DBSCAN",
    "            | Silhouette | Inertia | Propagation |       ",
    rule,
    f"# clusters  |      {best_k:0.0f}     |    {best_k_inert:0.0f}    |      {k_af}      |   {k_dbs}",
    rule,
]
print("\n".join(table))
