# CME538 - Introduction to Data Science
## Lecture 10.2 - Clustering

### Lecture Structure
1. [Create Dummy Dataset](#section1)
2. [Interactive k-means Visualizer](#section2)
3. [Import Iris Dataset](#section3)
4. [k-means in Scikit-Learn](#section4)
5. [Feature Scaling](#section5)
6. [Minimizing Inertia](#section6)
7. [Hierarchical Clustering in Scikit-Learn](#section7)
8. [Picking the Number of Clusters](#section8)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from copy import deepcopy
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from ipywidgets import interact, fixed, IntSlider
from sklearn.metrics import pairwise_distances_argmin

# Configure Notebook
# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Global plot appearance: matplotlib 'fivethirtyeight' style sheet plus
# seaborn's "notebook" context.
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below.
import warnings
warnings.filterwarnings('ignore')

<a id='section1'></a>
# Create Dummy Dataset
### Create Blobs

In [None]:
def create_dataset(centers, cluster_std, n_samples):
    """Generate a two-feature blob dataset and return it as a DataFrame.

    Thin wrapper around ``make_blobs`` that always uses ``random_state=42``,
    so identical arguments produce identical data.

    Parameters
    ----------
    centers, cluster_std, n_samples
        Forwarded unchanged to ``make_blobs``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature 1' and 'Feature 2' hold the generated features and
        'Cluster ID' holds the label returned by ``make_blobs``, cast to str.
    """
    blob_kwargs = dict(n_samples=n_samples,
                       centers=centers,
                       cluster_std=cluster_std,
                       random_state=42)
    points, blob_ids = make_blobs(**blob_kwargs)

    frame = pd.DataFrame(points, columns=['Feature 1', 'Feature 2'])
    # The generating blob of every row is kept as a string-valued column.
    frame['Cluster ID'] = pd.Series(blob_ids, index=frame.index).astype('str')
    return frame
    
# Build the demo dataset used by the plot and the widget below:
# 200 samples drawn around 3 centers with cluster_std=2.75.
data = create_dataset(centers=3, cluster_std=2.75, n_samples=200)
# Preview the first rows.
data.head()

### Plot Blobs

In [None]:
# Scatter the blobs, coloured by the 'Cluster ID' column, inside a fixed
# [-17, 17] x [-17, 17] window.
fig, ax = plt.subplots(figsize=(7, 7))
fig.subplots_adjust(wspace=0.15)
sns.scatterplot(x='Feature 1', y='Feature 2', hue='Cluster ID', data=data)
ax.set_xlabel('Feature 1', fontsize=16)
ax.set_ylabel('Feature 2', fontsize=16)
ax.set_xlim(-17, 17)
ax.set_ylim(-17, 17);

<a id='section2'></a>
# Interactive k-means Visualizer

In [None]:
def initiate_centers(num_clusters):
    """Draw random integer-valued starting centers for k-means.

    Each coordinate comes from ``np.random.randint(-10, 10)``, i.e. an integer
    in [-10, 9], using NumPy's global random state (call ``np.random.seed``
    first for reproducible centers).

    Parameters
    ----------
    num_clusters : int
        Number of centers to generate.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(num_clusters, 2)``. The shape is also
        ``(0, 2)`` when ``num_clusters`` is 0, so ``centers[:, 0]`` style
        indexing stays valid.
    """
    # All x coordinates are drawn before all y coordinates; the draw order
    # determines which centers a given seed produces.
    x = np.random.randint(-10, 10, size=num_clusters)
    y = np.random.randint(-10, 10, size=num_clusters)
    return np.column_stack((x, y)).astype(np.float32)

def draw_centers(num_clusters, ax, centers, factor=1, alpha=1.0):
    """Draw cluster centers on ``ax`` as coloured discs with a black core.

    Parameters
    ----------
    num_clusters : int
        Number of palette colours to use (the first ``num_clusters`` entries).
    ax : object with a ``scatter`` method (a matplotlib Axes in this notebook)
        Target of the two scatter calls.
    centers : array indexable as ``centers[:, 0]`` / ``centers[:, 1]``
        Center coordinates; only the first two columns are plotted.
    factor : number, optional
        Multiplier applied to both marker sizes.
    alpha : float, optional
        Opacity passed to both scatter calls.
    """
    palette = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    xs, ys = centers[:, 0], centers[:, 1]

    # Large disc in the cluster colour ...
    ax.scatter(xs, ys, c=palette[:num_clusters], s=200 * factor, alpha=alpha)
    # ... with a smaller black dot drawn on top of it.
    ax.scatter(xs, ys, c='black', s=50 * factor, alpha=alpha)
    
def distance(centers, center_old):
    """Return the Euclidean norm along axis 1 of ``centers - center_old``.

    The two arguments only need to broadcast against each other to a 2-D
    result, so a single point can be compared against an array of centers.
    """
    offsets = centers - center_old
    lengths = np.linalg.norm(offsets, axis=1)
    return lengths

def ssd(center, data):
    """Return the sum of squared differences between ``center`` and ``data``.

    ``center - data`` is evaluated with NumPy broadcasting, every element is
    squared, and all squares are added into a single scalar.
    """
    residuals = center - data
    squared = np.power(residuals, 2)
    return squared.sum()

def _style_axis(ax, xlabel, ylabel, xlim, ylim):
    """Apply the label, limit and tick formatting shared by all three panels."""
    ax.set_xlabel(xlabel, fontsize=20)
    ax.set_ylabel(ylabel, fontsize=20)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.xaxis.set_tick_params(labelsize=16)
    ax.yaxis.set_tick_params(labelsize=16)


def kmeans_step_plot(steps, operations, num_clusters, random_seed, data):
    """Replay the first ``steps`` k-means operations and plot the resulting state.

    Parameters
    ----------
    steps : int
        How many leading entries of ``operations`` to execute.
    operations : sequence of int
        One flag per step. Every step re-assigns each point to its nearest
        center; a flag of 1 additionally moves the centers to the mean of
        their assigned points.
    num_clusters : int
        Number of centers (at most 6, the size of the colour palette).
    random_seed : int
        Seed for ``np.random.seed`` so the initial centers are reproducible.
    data : numpy.ndarray
        2-D array of points; the first two columns are plotted.

    The left panel shows the points and centers. Points are coloured by
    cluster after an assignment-only step and drawn grey after an update step
    or when no step has run. The right panels show, per update step, the total
    distance moved by the centers and the sum of squared differences between
    the points and their centers.
    """
    np.random.seed(random_seed)
    colors = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
    centers = initiate_centers(num_clusters)
    cluster_labels = np.zeros(data.shape[0], dtype=int)
    diffs = []
    losses = []
    last_operation = None  # stays None when no step is executed

    for _, operation in zip(range(steps), operations):
        last_operation = operation

        # Assigning each value to its closest cluster
        for i in range(data.shape[0]):
            cluster_labels[i] = np.argmin(distance(data[i], centers))

        if operation == 1:

            # Store the old centers
            centers_old = centers.copy()

            # Find the new centers and compute loss
            loss = 0
            for i in range(num_clusters):
                members = data[cluster_labels == i]
                # A cluster without points keeps its center and adds nothing
                # to the loss (it has no squared differences to sum). This
                # also avoids indexing an empty `losses` list when a cluster
                # is empty on the very first update.
                if len(members) > 0:
                    centers[i] = members.mean(axis=0)
                    loss += ssd(centers[i], members)
            losses.append(loss)

            # Update the error
            diffs.append(np.sum(distance(centers, centers_old)))

    # Setup figure
    fig = plt.figure(figsize=(15, 7))
    fig.subplots_adjust(wspace=0.22, hspace=0.3)
    ax1 = plt.subplot2grid((2, 2), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((2, 2), (0, 1))
    ax3 = plt.subplot2grid((2, 2), (1, 1))

    # Scatter Plot
    if last_operation is None or last_operation == 1:
        sns.scatterplot(x=data[:, 0], y=data[:, 1], color='#8b8b8b', ax=ax1)
    else:
        # Map label -> colour explicitly so each cluster keeps the colour of
        # its center even when some cluster currently has no points.
        palette = dict(enumerate(colors[:num_clusters]))
        sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=cluster_labels, palette=palette, ax=ax1)
    draw_centers(num_clusters, ax1, centers, factor=1, alpha=1.0)
    _style_axis(ax1, 'Feature 1', 'Feature 2', [-17, 17], [-17, 17])

    # One x position per recorded update, so x and y always have equal length.
    update_steps = np.arange(len(diffs)) + 1

    # Update difference
    ax2.plot(update_steps, diffs, '-o')
    _style_axis(ax2, 'Step', 'Difference', [0.8, 10.2], [-2, 25])

    # Update loss
    ax3.plot(update_steps, losses, '-o')
    _style_axis(ax3, 'Step', 'Sum of Squared Differences', [0.8, 10.2], [0, 15000])

In [None]:
# Launch widget
# `operations` alternates assignment-only (0) and center-update (1) steps and
# has 21 entries, matching the maximum of the `steps` slider.
# The `random_seed` slider starts at 1: its initial value has to lie inside
# the slider's [min, max] range, and the previous value=0 was below min=1.
interact(kmeans_step_plot,
         steps=IntSlider(value=0, min=0, max=21, step=1),
         operations=fixed([0, 1] * 10 + [0]),
         num_clusters=IntSlider(value=3, min=1, max=4, step=1),
         random_seed=IntSlider(value=1, min=1, max=25, step=1),
         data=fixed(data[['Feature 1', 'Feature 2']].to_numpy()));

<a id='section3'></a>
# Import Iris Dataset
This dataset consists of the petal and sepal lengths and widths of 3 different types of irises (`Setosa`, `Versicolour`, and `Virginica`), stored in a 150x4 numpy.ndarray.

Let's import the dataset.

In [None]:
# Load the Iris data and keep only the measurements, as a DataFrame whose
# columns are named after the dataset's feature names.
iris = datasets.load_iris()
iris = pd.DataFrame(iris.data, columns=iris.feature_names)
iris.head()

Now, let's plot the `'Petal Length'` and `'Petal Width'`.

In [None]:
# Petal width against petal length for every iris sample (no colouring yet).
fig, ax = plt.subplots(figsize=(6, 6))
fig.subplots_adjust(wspace=0.15)
sns.scatterplot(x='petal length (cm)', y='petal width (cm)', data=iris)
ax.set_xlabel('Petal Length (cm)', fontsize=16)
ax.set_ylabel('Petal Width (cm)', fontsize=16);

<a id='section4'></a>
# k-means in Scikit-Learn
Let's work with the `Iris` dataset.

First, we'll need to scale our input features. We've done this previously when training linear and logistic regression models. We'll explain in the next section why this is important specifically for `k-means`. You'll notice that there are four features in `iris`. We'll just work with two features `'Petal Length'` and `'Petal Width'`.

In [None]:
# Standardise the two petal features; `iris_scaled` is what the clustering
# models below are fitted on.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(
    iris.loc[:, ['petal length (cm)', 'petal width (cm)']]
)

Next, let's fit the model using Scikit-Learn's `KMeans`.

In [None]:
# Two-cluster k-means on the scaled petal features. No random_state is set,
# so the result is not pinned to a particular seed.
kmeans = KMeans(n_clusters=2)
kmeans.fit(iris_scaled)

Next, let's look at some of the attributes/outputs of the model.

`.inertia_`: Sum of squared distances of samples to their closest cluster center.

In [None]:
# Display the fitted model's inertia_ value.
kmeans.inertia_

`cluster_centers_`: Coordinates of cluster centers.

In [None]:
# Display the fitted cluster centers (expressed in the scaled feature space,
# since the model was fitted on iris_scaled).
kmeans.cluster_centers_

`labels_`: Labels of each point.

In [None]:
# Display the cluster label assigned to every sample.
kmeans.labels_

`n_iter_`: Number of iterations run.

In [None]:
# Display the fitted model's n_iter_ value.
kmeans.n_iter_

Lastly, let's plot the data and the cluster labels assigned by `KMeans`.

In [None]:
# Original (unscaled) petal measurements, coloured by the k-means labels.
fig, ax = plt.subplots(figsize=(6, 6))
fig.subplots_adjust(wspace=0.15)
sns.scatterplot(x='petal length (cm)', y='petal width (cm)',
                hue=kmeans.labels_, data=iris)
ax.legend(loc='upper left', fontsize=14)
ax.set_xlabel('Petal Length (cm)', fontsize=16)
ax.set_ylabel('Petal Width (cm)', fontsize=16);

<a id='section5'></a>
# Feature Scaling
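As you noticed in the previous section, we used the `StandardScaler` to scale our features before fitting `k-means`. Because `k-means` relies on distances between points, a feature with a much larger numeric range dominates the clustering unless the features are brought to a comparable scale. The example below illustrates this.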

In [None]:
# Create dummy data: x is standard-normal noise stretched by a factor of 100,
# y is made of two standard-normal groups shifted by 0 and by 5.
x = 100 * np.random.randn(1000)
y = np.concatenate([np.random.randn(500), 5 + np.random.randn(500)])

# Plot data: three stacked panels sharing the same raw (x, y) coordinates
fig = plt.figure(figsize=(6, 10))
fig.subplots_adjust(hspace=0.4)
ax1, ax2, ax3 = [plt.subplot2grid((3, 1), (row, 0)) for row in range(3)]

# Panel 1: the data itself
ax1.set_title('Dataset', fontsize=16)
sns.scatterplot(x=x, y=y, ax=ax1)
ax1.set(xlabel='x', ylabel='y')

# Panel 2: k-means fitted on the raw features
clusters = KMeans(2).fit_predict(np.column_stack([x, y]))
ax2.set_title('Non-normalised k-means', fontsize=16)
sns.scatterplot(x=x, y=y, hue=clusters, ax=ax2)
ax2.set(xlabel='x', ylabel='y')

# Panel 3: k-means fitted after dividing x by 100 (undoing the stretch)
clusters = KMeans(2).fit_predict(np.column_stack([x / 100, y]))
ax3.set_title('Normalised k-means', fontsize=16)
sns.scatterplot(x=x, y=y, hue=clusters, ax=ax3)
ax3.set(xlabel='x', ylabel='y');

<a id='section6'></a>
# Minimizing Inertia

In [None]:
n_samples=200
blobs1 = 4
blobs_std1 = 1.5
clusters1 = 6
blobs2 = 4
blobs_std2 = 5
clusters2 = 2
colors = ['#008fd5', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b', '#810f7c']
runs = 200


def _plot_inertia_runs(blobs, blobs_std, clusters, n_samples, runs, hist_ax, scatter_ax):
    """Fit KMeans ``runs`` times on one blob dataset and plot the outcomes.

    The dataset is drawn as grey points on ``scatter_ax`` with the centers of
    every run overlaid, and the inertias of all runs are shown as a
    density-normalised histogram on ``hist_ax``.

    Returns the dataset, the list of inertias and the last fitted model
    (``None`` when ``runs`` is 0).
    """
    dataset = create_dataset(centers=blobs, cluster_std=blobs_std, n_samples=n_samples)
    # Cluster on the two feature columns only. `create_dataset` also returns
    # the true 'Cluster ID' column, which must not be handed to KMeans:
    # it is the answer the clustering is supposed to discover.
    features = dataset[['Feature 1', 'Feature 2']]

    hist_ax.set_title('Clusters: {}\nBlobs: {}'.format(clusters, blobs), fontsize=14, loc='right')
    sns.scatterplot(data=dataset, x='Feature 1', y='Feature 2', color='#8b8b8b', ax=scatter_ax)

    inertias = []
    model = None
    for _ in range(runs):
        model = KMeans(n_clusters=clusters).fit(features)
        inertias.append(model.inertia_)
        draw_centers(clusters, scatter_ax, model.cluster_centers_, factor=1, alpha=1.0)
    scatter_ax.set_xlabel('Feature 1', fontsize=14)
    scatter_ax.set_ylabel('Feature 2', fontsize=14)

    # norm_hist=True makes the bar heights a density, matching the y label
    # (with kde=False alone the bars are raw counts).
    # NOTE(review): seaborn has deprecated distplot; histplot(stat='density')
    # is its replacement once the minimum seaborn version allows it.
    sns.distplot(inertias, ax=hist_ax, kde=False, norm_hist=True)
    hist_ax.set_xlabel('Sum of Squared Differences', fontsize=14)
    hist_ax.set_ylabel('Probability Density', fontsize=14)

    return dataset, inertias, model


# Setup figure
fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(wspace=0.22, hspace=0.3)
ax1 = plt.subplot2grid((3, 2), (0, 0))
ax2 = plt.subplot2grid((3, 2), (1, 0), rowspan=2)
ax3 = plt.subplot2grid((3, 2), (0, 1))
ax4 = plt.subplot2grid((3, 2), (1, 1), rowspan=2)

# Model 1 (left column): more clusters than blobs
data1, inertias1, model1 = _plot_inertia_runs(
    blobs1, blobs_std1, clusters1, n_samples, runs, hist_ax=ax1, scatter_ax=ax2)

# Model 2 (right column): fewer clusters than blobs
data2, inertias2, model2 = _plot_inertia_runs(
    blobs2, blobs_std2, clusters2, n_samples, runs, hist_ax=ax3, scatter_ax=ax4)

Luckily, the good folks at `Scikit-Learn` have thought about this. The `KMeans` object has a parameter called `n_init`, which is the number of times the `k-means` algorithm will be run with different centroid seeds. The final result will be the best output of `n_init` consecutive runs in terms of inertia. The default is 10, but as we saw above, you may want to increase this to ensure you get the optimal output.

Here we are running 100 `k-means` cluster runs with different random initializations.

In [None]:
# Same two-cluster model as before, but with n_init=100 so that 100
# differently initialised runs are performed on the scaled petal features.
kmeans = KMeans(n_clusters=2,
                n_init=100)
kmeans.fit(iris_scaled)

<a id='section7'></a>
# Hierarchical Clustering in Scikit-Learn
Unlike k-means, hierarchical clustering doesn’t require the user to specify the number of clusters beforehand. Instead it returns an output (typically as a dendrogram), from which the user can decide the appropriate number of clusters (either manually or algorithmically).

Let's work with the `Iris` dataset.

First, we'll need to scale our input features. We've done this previously when training linear and logistic regression models. As shown in the Feature Scaling section above, this matters for distance-based methods such as `k-means` and hierarchical clustering. You'll notice that there are four features in `iris`. We'll just work with two features `'Petal Length'` and `'Petal Width'`.

In [None]:
# Re-create the standardised petal features so this section can be run on
# its own (same computation as in the k-means section).
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(
    iris.loc[:, ['petal length (cm)', 'petal width (cm)']]
)

Next, let's fit the model using Scikit-Learn's `AgglomerativeClustering`.

In [None]:
# Average-linkage agglomerative clustering into two clusters.
# The distance metric is left at its default (Euclidean). The keyword that
# selects it was renamed from `affinity` to `metric` in scikit-learn 1.2 and
# `affinity` was later removed, so omitting it keeps this cell working on
# both older and newer scikit-learn versions with the same result.
agglomerative = AgglomerativeClustering(n_clusters=2,
                                        linkage='average')
agglomerative.fit(iris_scaled)

Next, let's look at some of the attributes/outputs of the model.

`labels_`: Labels of each point.

In [None]:
# Display the cluster label assigned to every sample.
agglomerative.labels_

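Note: unlike `KMeans`, `AgglomerativeClustering` has no `n_iter_` (number of iterations run) attribute, so there is no iteration count to inspect here.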

Lastly, let's plot the data and the cluster labels assigned by `AgglomerativeClustering`.

In [None]:
# Original (unscaled) petal measurements, coloured by the agglomerative labels.
fig, ax = plt.subplots(figsize=(6, 6))
fig.subplots_adjust(wspace=0.15)
sns.scatterplot(x='petal length (cm)', y='petal width (cm)',
                hue=agglomerative.labels_, data=iris)
ax.legend(loc='upper left', fontsize=14)
ax.set_xlabel('Petal Length (cm)', fontsize=16)
ax.set_ylabel('Petal Width (cm)', fontsize=16);

<a id='section8'></a>
# Picking the Number of Clusters
## Elbow Method
Let's work with the Iris data again.

In [None]:
# Standardised petal features used for the elbow analysis below.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(
    iris.loc[:, ['petal length (cm)', 'petal width (cm)']]
)

Now, let's loop through clusters from 1 to 10 and run k-means.

In [None]:
# Fit one k-means model per cluster count from 1 to 10 (100 initialisations
# each) and collect the fitted models in order.
models = []
for clusters in range(1, 11):
    kmeans = KMeans(n_init=100, n_clusters=clusters).fit(iris_scaled)
    models.append(kmeans)

In [None]:
# Elbow curve: inertia of each fitted model against its number of clusters.
plt.plot(np.arange(10) + 1, [fitted.inertia_ for fitted in models], 'o-')
plt.xlabel('Number of Clusters', fontsize=16)
plt.ylabel('Inertia', fontsize=16);