# Clustering

Clustering is an unsupervised learning technique that groups data points so that points in the same group are more similar to each other than to points in other groups.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load and plot data

In [None]:
# Read the iris measurements into a DataFrame. The path is relative, so it
# resolves against the directory the notebook is run from.
df = pd.read_csv('../data/iris.csv')

In [None]:
# Preview the first few rows to check that the columns loaded as expected.
df.head()

In [None]:
# Unlabelled view of the data: sepal length against petal length for every
# row, with all text drawn at the same large font size.
text_style = {'size': 20}

plt.figure(figsize=(15, 10))
plt.scatter(df['sepal_length_cm'], df['petal_length_cm'])

plt.title('Iris Flowers', **text_style)
plt.xlabel('Sepal Length', **text_style)
plt.ylabel('Petal Length', **text_style)

## Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used below to turn the class labels in df.target into integer codes.
le = LabelEncoder()

In [None]:
# Fit the encoder on the target column and encode it in one step; target_n is
# later passed as the colour argument of the scatter plots.
target_n = le.fit_transform(df.target)

In [None]:
# Display the encoded labels.
target_n

In [None]:
# Display the class labels the encoder learned; per the scikit-learn docs,
# the integer code of a label is its index in this array.
le.classes_

In [None]:
# Sepal length against petal length, coloured by the integer-encoded class.
plt.figure(figsize=(15, 10))

points = plt.scatter(df.sepal_length_cm, df.petal_length_cm, c=target_n)
plt.title('Iris Flowers')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')

# A single scatter() call creates a single artist, so passing only the class
# names to legend() would produce one entry (the first class) rather than one
# per class. legend_elements() builds one handle per distinct colour value,
# ordered by value, which matches the order of le.classes_.
handles, _ = points.legend_elements()
plt.legend(handles, list(le.classes_), loc='best')

## Kmeans clustering

In [None]:
from sklearn.cluster import KMeans

# Cluster on all four measurements. The column order chosen here is also the
# column order of the fitted cluster centers.
feature_columns = [
    'sepal_length_cm',
    'sepal_width_cm',
    'petal_length_cm',
    'petal_width_cm',
]
X = df[feature_columns]

km = KMeans(n_clusters=3)
km.fit(X)

In [None]:
# Fitted centroids as a 2-D array (indexed as centers[:, j] below); per the
# scikit-learn docs there is one row per cluster and one column per feature of X.
centers = km.cluster_centers_
centers

In [None]:
# Side-by-side comparison on the same two features: the known classes on the
# left, the K-Means cluster assignments (plus centroids) on the right.
fig, (ax_true, ax_kmeans) = plt.subplots(1, 2, figsize=(14, 6))

sepal_len = df.sepal_length_cm
petal_len = df.petal_length_cm

ax_true.scatter(sepal_len, petal_len, c=target_n)
ax_true.set_title('True Labels')

ax_kmeans.scatter(sepal_len, petal_len, c=km.labels_)
# Columns 0 and 2 of the centers are sepal length and petal length, following
# the column order used to build X.
ax_kmeans.scatter(centers[:, 0], centers[:, 2], marker='o', c='r', s=100)
ax_kmeans.set_title('K-Means Clusters')

for ax in (ax_true, ax_kmeans):
    ax.set_xlabel('Sepal Length (cm)')
    ax.set_ylabel('Petal length (cm)')

plt.draw()
plt.show()


### Exercise 1
- discuss with your pair:
    - why do cluster centers have 4 coordinates?
    - do the colors in the two plots coincide? Why?
- change the number of clusters using the n_clusters parameter. What happens?
- change the `init` parameter of KMeans to 'random'. What happens?
- run the clustering multiple times, do the centroid positions change?

### Exercise 2
- calculate silhouette_score for different values of k. You will have to extract the labels from km at different values of k
- plot the silhouette score as a function of k for k between 2 and 10
- discuss with your pair:
    - what value of k gives the highest silhouette score?
    - did you expect that result?

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of a K-Means fit for every k from 2 to 10 inclusive.
# range() excludes its stop value, so the upper bound must be 11 to reach k=10.
ks = range(2, 11)

# Each model is created and fitted inside the expression, so the 3-cluster
# `km` fitted earlier (and the `centers` taken from it) is left untouched.
scores = [
    silhouette_score(X, KMeans(n_clusters=k).fit(X).labels_)
    for k in ks
]

plt.plot(ks, scores)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')

### Exercise 3

Try clustering with one of the other methods listed here: http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html


In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# DBSCAN with the library's default settings (no arguments are passed),
# fitted on the same four-feature matrix used for K-Means.
model = DBSCAN()
model.fit(X)

In [None]:
# Side-by-side comparison on the same two features: the known classes on the
# left, the DBSCAN cluster assignments on the right.
fig, (ax_true, ax_dbscan) = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (ax_true, target_n, 'True Labels'),
    (ax_dbscan, model.labels_, 'DBSCAN Clusters'),
)
for ax, colours, heading in panels:
    ax.scatter(df.sepal_length_cm, df.petal_length_cm, c=colours)
    ax.set_xlabel('Sepal Length (cm)')
    ax.set_ylabel('Petal length (cm)')
    ax.set_title(heading)

plt.draw()
plt.show()


*Copyright &copy; 2017 CATALIT LLC.  All rights reserved.*