## Import libraries and set up plotting defaults:

In [None]:
%matplotlib inline
import numpy as np
from numpy.random import multivariate_normal as gauss
import matplotlib.pyplot as plt
from sklearn import cluster

# Shared plotting configuration used by every figure in this notebook.
figsize = (10, 5)  # figure (width, height) in inches

# Colour lookup table indexed by cluster ID: 8 base colours repeated 10 times
# (80 entries). The final entry is 'black', so indexing with -1 yields 'black'.
colourset = np.tile(
    ['red', 'orange', 'green', 'blue', 'magenta', 'gray', 'brown', 'black'],
    10,
)

## Generate easy and hard 2D data sets, each an array of nsamples rows (one per sample) x 2 columns (x and y):

In [None]:
# Seed the global NumPy RNG so the samples below are the same on every fresh run.
np.random.seed(0)

# Centres of the three Gaussian blobs; both data sets share them.
mean = [[0, 0], [3, 3], [-2.5, 0]]

# One 2x2 covariance matrix per blob for the easy data set.
cov1 = [
    [[0.5, 0.2], [0.2, 0.2]],
    [[0.5, 0], [0, 0.5]],
    [[0.1, 0], [0, 0.1]],
]

# One 2x2 covariance matrix per blob for the hard data set (larger variances).
cov2 = [
    [[1, 0.2], [0.2, 0.2]],
    [[0.5, 0], [0, 2]],
    [[0.1, 0], [0, 2]],
]

# Easy data set: 300 samples from each blob, stacked and then shuffled row-wise
# in place. The generator is consumed left to right, so the blobs are drawn in
# order (g1, then g2, then g3).
g1, g2, g3 = (gauss(m, c, 300) for m, c in zip(mean, cov1))
set1 = np.vstack((g1, g2, g3))
np.random.shuffle(set1)

# Hard data set: same centres, but unequal blob sizes (300, 600 and 200 samples).
g4, g5, g6 = (gauss(m, c, n) for m, c, n in zip(mean, cov2, (300, 600, 200)))
set2 = np.vstack((g4, g5, g6))
np.random.shuffle(set2)

# Show both raw data sets side by side (1x2 panels sharing the y axis);
# column 0 is plotted as x and column 1 as y.
f, ax = plt.subplots(1, 2, sharey=True, figsize=figsize)
ax[0].set_title('set1')
ax[1].set_title('set2')
ax[0].scatter(set1[:, 0], set1[:, 1], s=2)
ax[1].scatter(set2[:, 0], set2[:, 1], s=2)

## Cluster with K-Means:

In [None]:
# One KMeans estimator, asked for 3 clusters, is fitted to each data set in turn.
# No random_state is passed, so re-running this cell can give a different result.
kmeans = cluster.KMeans(n_clusters=3)

# fit() returns the estimator itself, so the per-sample cluster IDs can be read
# straight off the call; they are then mapped to colour names via colourset.
cids = kmeans.fit(set1).labels_
colours = colourset[cids]

# Refit the same estimator on the second data set; a separate set of names
# holds its labels and colours.
cids2 = kmeans.fit(set2).labels_
colours2 = colourset[cids2]

# Plot both clusterings side by side (1x2 panels sharing the y axis), each
# sample coloured by its assigned cluster.
f, ax = plt.subplots(1, 2, sharey=True, figsize=figsize)
ax[0].set_title('set1')
ax[1].set_title('set2')
ax[0].scatter(set1[:, 0], set1[:, 1], s=2, color=colours)
ax[1].scatter(set2[:, 0], set2[:, 1], s=2, color=colours2)

### Exercise 1:

What happens sometimes when you re-run k-means multiple times on the same data?

### Exercise 2:
Re-run k-means by telling it that you want 2 clusters instead of 3. Try it again with 4 or 5 or 6 clusters. What happens when you underestimate or overestimate the true number of clusters?

## Cluster with DBSCAN:

In [None]:
# Cluster both data sets with DBSCAN and plot each sample coloured by cluster.
# DBSCAN labels noise samples (outliers) with -1; that label is not a cluster,
# so it is excluded from the cluster counts shown in the panel titles.
dbscan = cluster.DBSCAN(eps=0.5, min_samples=30) # create a DBSCAN object, set its two parameters

dbscan.fit(set1) # fit it to (run it on) the data
cids = dbscan.labels_ # get resulting cluster IDs from the dbscan object, one for each sample (-1 = outlier)
nclust = len(np.unique(cids[cids != -1])) # number of real clusters: drop the -1 noise label before counting
colours = colourset[cids] # convert cluster IDs to colours; -1 picks the last colourset entry ('black')

dbscan.fit(set2) # fit it to (run it on) the data
cids2 = dbscan.labels_ # get resulting cluster IDs from the dbscan object, one for each sample (-1 = outlier)
nclust2 = len(np.unique(cids2[cids2 != -1])) # number of real clusters: drop the -1 noise label before counting
colours2 = colourset[cids2] # convert cluster IDs to colours; -1 picks the last colourset entry ('black')

# plot results:
f, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=figsize) # create 1x2 panel figure
ax[0].set_title('set1: %d clusters' % nclust)
ax[0].scatter(set1[:, 0], set1[:, 1], s=2, color=colours) # plot x vs. y for all samples
ax[1].set_title('set2: %d clusters' % nclust2)
ax[1].scatter(set2[:, 0], set2[:, 1], s=2, color=colours2) # plot x vs. y for all samples

Note that outliers are labelled with `-1`, and show up as black, because index `-1` selects the last entry of `colourset`, which is `'black'`.

### Exercise 3:

The `eps` parameter tells DBSCAN the maximum distance between two points for them to count as neighbours within a cluster. The `min_samples` parameter tells DBSCAN how many points must lie within `eps` of a point for it to count as part of a dense region, which is roughly the minimum number of points in a cluster. Try changing both parameters. What happens when one of them is too high or too low? Can you change them both while still getting a good clustering result?