# Week 9 - Clustering
## Exercises

In [2]:
import pandas as pd

# The six observations (rows) on two features (columns) clustered in this question.
data = pd.DataFrame(
    {"x_1": [1, 1, 0, 5, 6, 4], "x_2": [4, 3, 4, 1, 2, 0]}
).rename_axis("Obs.")
# Uncomment to print the frame as an HTML table (e.g. for pasting into a markdown cell).
#print(data.to_html())

## Question X.

This question asks you to perform K-means clustering manually, with $K=2$, on a small sample of $n=6$ observations and $p=2$ features.

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>x_1</th>
      <th>x_2</th>
    </tr>
    <tr>
      <th>Obs.</th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>4</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>0</td>
      <td>4</td>
    </tr>
    <tr>
      <th>3</th>
      <td>5</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>6</td>
      <td>2</td>
    </tr>
    <tr>
      <th>5</th>
      <td>4</td>
      <td>0</td>
    </tr>
  </tbody>
</table>

a. Plot the observations

In [5]:
import matplotlib.pyplot as plt

# Feature values of the six observations: (x1[i], x2[i]) is observation i.
x1 = [1,1,0,5,6,4]
x2 = [4,3,4,1,2,0]

# Scatter plot of the unlabelled observations, written to disk so the
# markdown answer cell can embed the image.
fig = plt.figure(figsize=(15,8))
plt.grid(linestyle='--', alpha=0.3)
plt.scatter(x1, x2, c='blue', s=150)
# Raw strings: '\m' is not a valid escape sequence in a normal string literal,
# so the LaTeX backslash must be kept literal explicitly.
plt.xlabel(r'$\mathbf{x}_1$', size=20)
plt.ylabel(r'$\mathbf{x}_2$', size=20, rotation=0)
plt.savefig("Images/Exercises/a_plot.png")
# Close this figure explicitly so it is not rendered inline or kept in memory.
plt.close(fig)

<details><summary>Click here for answer</summary>
<img src="Images/Exercises/a_plot.png">
</details>

b. Randomly assign a cluster label to each observation. In Python you can use <code>np.random.randint</code>. Report the cluster labels for each observation.

In [7]:
import numpy as np

# Seed NumPy's global generator so the "random" starting labels are reproducible.
np.random.seed(42)
# One label per observation, drawn from {0, 1} (the upper bound 2 is exclusive).
cluster_labels = np.random.randint(low=0, high=2, size=6)
data["labels"] = cluster_labels
# Uncomment to print the labelled frame as an HTML table.
#print(data.to_html())

<details><summary>Click here for answer</summary>
    <table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>x_1</th>
      <th>x_2</th>
      <th>labels</th>
    </tr>
    <tr>
      <th>Obs.</th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>4</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>3</td>
      <td>1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>0</td>
      <td>4</td>
      <td>0</td>
    </tr>
    <tr>
      <th>3</th>
      <td>5</td>
      <td>1</td>
      <td>0</td>
    </tr>
    <tr>
      <th>4</th>
      <td>6</td>
      <td>2</td>
      <td>0</td>
    </tr>
    <tr>
      <th>5</th>
      <td>4</td>
      <td>0</td>
      <td>1</td>
    </tr>
  </tbody>
</table>
    
These labels, and subsequent answers, will depend on the random seed you set (I used <code>np.random.seed(42)</code>)
</details>

In [8]:
# Colour each observation by its current cluster label: 0 -> red, anything else -> green.
color = ['red' if label == 0 else 'green' for label in cluster_labels]

fig = plt.figure(figsize=(15,8))
plt.grid(linestyle='--', alpha=0.3)
plt.scatter(x1, x2, c=color, s=150)
# Raw strings: '\m' is not a valid escape sequence in a normal string literal,
# so the LaTeX backslash must be kept literal explicitly.
plt.xlabel(r'$\mathbf{x}_1$', size=20)
plt.ylabel(r'$\mathbf{x}_2$', size=20, rotation=0)
plt.savefig("Images/Exercises/b_plot.png")
# Close this figure explicitly so it is not rendered inline or kept in memory.
plt.close(fig)

<details><summary>Click here for plot</summary>
    <img src="Images/Exercises/b_plot.png">
</details>

c. Compute the centroid for each cluster.

<details><summary>Click here for working out</summary>
    We can compute the centroid for the red cluster with
    $$
    \begin{align}
    \bar x_{11} & = \frac{1}{4}(0+1+5+6) \\ 
                & = 3 \\
    \\
    \bar x_{12} & = \frac{1}{4}(4+4+2+1) \\ 
                & = 2.75 \\
    \\
    \end{align}
    $$
    and the green cluster with
    $$
    \begin{align}
    \bar x_{21} & = \frac{1}{2}(1+4) \\
                & = 2.5   \\
    \\
    \bar x_{22} & = \frac{1}{2}(3+0) \\ 
                & = 1.5 \\
    \\
    \end{align}
    $$
</details>

In [9]:
# Centroid of each cluster = per-label mean of every feature column.
mean_data = data.groupby('labels').mean()

fig = plt.figure(figsize=(15,8))
plt.grid(linestyle='--', alpha=0.3)
plt.scatter(x1, x2, c=color, s=150)
# Mark each centroid with a cross in the same colour as its cluster's points.
for centroid_label, centroid_color in ((0, 'red'), (1, 'green')):
    plt.scatter(mean_data.loc[centroid_label, "x_1"], mean_data.loc[centroid_label, "x_2"],
                c=centroid_color, s=150, marker='x', linewidths=4)
# Raw strings: '\m' is not a valid escape sequence in a normal string literal,
# so the LaTeX backslash must be kept literal explicitly.
plt.xlabel(r'$\mathbf{x}_1$', size=20)
plt.ylabel(r'$\mathbf{x}_2$', size=20, rotation=0)
plt.savefig("Images/Exercises/c_plot.png")
# Close this figure explicitly so it is not rendered inline or kept in memory.
plt.close(fig)

<details><summary>Click here for plot</summary>
    <img src="Images/Exercises/c_plot.png">
</details>

d. Assign each observation to the centroid to which it is closest, in terms of Euclidean distance. Report the cluster labels for each observation.

In [11]:
# Re-assign every observation to its *closest* centroid.
# Squared Euclidean distance is enough here: the square root is monotonic,
# so it does not change which centroid is nearest.
# (The previous version assigned label 0 when dist_0 > dist_1, i.e. it sent
# each observation to the centroid that was farther away.)
centroids = mean_data.loc[[0, 1], ["x_1", "x_2"]].to_numpy()   # row k = centroid of label k
points = np.column_stack((x1, x2))                              # one row per observation
# sq_dist[i, k] = squared distance from observation i to centroid k
sq_dist = ((points[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
# argmin over the centroid axis gives the label of the nearest centroid
# (a tie resolves to the lower label).
cluster_labels = sq_dist.argmin(axis=1)

data['labels'] = cluster_labels
# Uncomment to print the re-labelled frame as an HTML table.
#print(data.to_html())

<details><summary>Click here for answer</summary>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>x_1</th>
      <th>x_2</th>
      <th>labels</th>
    </tr>
    <tr>
      <th>Obs.</th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>4</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>3</td>
      <td>0</td>
    </tr>
    <tr>
      <th>2</th>
      <td>0</td>
      <td>4</td>
      <td>0</td>
    </tr>
    <tr>
      <th>3</th>
      <td>5</td>
      <td>1</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>6</td>
      <td>2</td>
      <td>0</td>
    </tr>
    <tr>
      <th>5</th>
      <td>4</td>
      <td>0</td>
      <td>1</td>
    </tr>
</table>
</details>

In [151]:
# Colour each observation by its updated cluster label: 0 -> red, anything else -> green.
color = ['red' if label == 0 else 'green' for label in cluster_labels]
# Recompute the centroids from the updated labels. The frame only has a
# 'labels' column; grouping by 'labels_2' raised a KeyError.
mean_data = data.groupby('labels').mean()

fig = plt.figure(figsize=(15,8))
plt.grid(linestyle='--', alpha=0.3)
plt.scatter(x1, x2, c=color, s=150)
# Mark each centroid with a cross in the same colour as its cluster's points.
plt.scatter(mean_data.loc[0]["x_1"], mean_data.loc[0]["x_2"], c='red', s=150, marker = 'x', linewidths=4)
plt.scatter(mean_data.loc[1]["x_1"], mean_data.loc[1]["x_2"], c='green', s=150, marker = 'x', linewidths=4)
# Raw strings: '\m' is not a valid escape sequence in a normal string literal,
# so the LaTeX backslash must be kept literal explicitly.
plt.xlabel(r'$\mathbf{x}_1$', size=20)
plt.ylabel(r'$\mathbf{x}_2$', size=20, rotation=0)
plt.savefig("Images/Exercises/d_plot.png")
# Close this figure explicitly so it is not rendered inline or kept in memory.
plt.close(fig)

<details><summary>Click here for plot</summary>
    <img src="Images/Exercises/d_plot.png">
</details>

e. Find where the k-means cluster centers converge (i.e. repeat steps (c) and (d) until the cluster assignments stop changing), then color your plot according to these cluster labels.

In [20]:
from sklearn.cluster import KMeans

# Stack the two feature lists column-wise: one row per observation, one column per feature.
M = np.column_stack((x1,x2))
# n_init is pinned explicitly because its default value differs between
# scikit-learn releases; random_state fixes the initialisation.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(M)
cluster_labels = kmeans.labels_

# Colour each observation by its fitted cluster label: 0 -> red, anything else -> green.
color = ['red' if label == 0 else 'green' for label in cluster_labels]
fig = plt.figure(figsize=(15,8))
plt.scatter(x1, x2, c=color, s=150)
# Row k of cluster_centers_ is the centre of cluster k; draw it in the cluster's colour.
plt.scatter(kmeans.cluster_centers_[0,0], kmeans.cluster_centers_[0,1], c='red', s=150, marker = 'x', linewidths=4)
plt.scatter(kmeans.cluster_centers_[1,0], kmeans.cluster_centers_[1,1], c='green', s=150, marker = 'x', linewidths=4)
# Raw strings: '\m' is not a valid escape sequence in a normal string literal,
# so the LaTeX backslash must be kept literal explicitly.
plt.xlabel(r'$\mathbf{x}_1$', size=20)
plt.ylabel(r'$\mathbf{x}_2$', size=20, rotation=0)
plt.grid(linestyle='--', alpha=0.3)
plt.savefig("Images/Exercises/e_plot.png")
# Close this figure explicitly so it is not rendered inline or kept in memory.
plt.close(fig)

<details><summary>Click here for plot</summary>
    <img src="Images/Exercises/e_plot.png">
</details>

## Question X.

Suppose we have a dissimilarity matrix as follows:

$$\begin{bmatrix} 
& 0.3 & 0.4 & 0.7 \\
0.3 & & 0.5 & 0.8 \\
0.4 & 0.5 & & 0.45 \\
0.7 & 0.8 & 0.45 & \\
\end{bmatrix}$$

This means the dissimilarity between the first and second observation is 0.3, between the second and fourth is 0.8, etc.

a. Sketch or code a diagram that results from hierarchically clustering these four observations using __complete__ linkage.

<details><summary>Click here for working out</summary>

Beginning with 
$$\begin{bmatrix} 
& 0.3 & 0.4 & 0.7 \\
0.3 & & 0.5 & 0.8 \\
0.4 & 0.5 & & 0.45 \\
0.7 & 0.8 & 0.45 & \\
\end{bmatrix}$$

we see that 0.3 is the minimum dissimilarity, so we fuse observations 1 and 2 to form the cluster (1,2) at height 0.3. This leaves our dissimilarity matrix as:
$$\begin{bmatrix} 
& 0.5 & 0.8 \\
0.5 & & 0.45 \\
0.8 & 0.45 & \\
\end{bmatrix}$$
because we are recording the _largest_ dissimilarities. For example, looking at the dissimilarities between observations 1, 2, and 3 in our original matrix, we see that the dissimilarity between 1 and 3 is 0.4, and between 2 and 3 is 0.5. We pick 0.5 because it is the larger of the two, and this now represents the dissimilarity between our cluster (1,2) and 3. If you think in dataframes like me, it may help to see this new matrix as:
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>(1,2)</th>
      <th>3</th>
      <th>4</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>(1,2)</th>
      <td>0.00</td>
      <td>0.50</td>
      <td>0.80</td>
    </tr>
    <tr>
      <th>3</th>
      <td>0.50</td>
      <td>0.00</td>
      <td>0.45</td>
    </tr>
    <tr>
      <th>4</th>
      <td>0.80</td>
      <td>0.45</td>
      <td>0.00</td>
    </tr>
  </tbody>
</table>
Now we continue to do this. We see that the minimum dissimilarity is 0.45, so we fuse observations 3 and 4 to form cluster (3,4) at height 0.45. We now have the new dissimilarity matrix:
$$\begin{bmatrix} 
& 0.8 \\
0.8 & \\
\end{bmatrix}$$
This means all that is left to fuse is clusters (1,2) and (3,4) to form cluster ((1,2),(3,4)) at height 0.8.
    
</details>

In [60]:
import pandas as pd

# Using the answer above we could do this manually...
# --------
# Manually
# --------

# Each row describes one merge: the two row labels that were fused, the height
# (dissimilarity) of the fusion, and how many observations the new cluster holds.
columns = ['row label 1', 'row label 2', 'distance', 'no. of items in clust.']
cluster_1 = [0., 1., 0.3, 2.]
cluster_2 = [2., 3., 0.45, 2.]
# note that newly formed clusters are represented as a new row label (hence 4 and 5)
cluster_3 = [4., 5., 0.8, 4.]

merges = [cluster_1, cluster_2, cluster_3]
data = pd.DataFrame(
    merges,
    columns=columns,
    index=[f'cluster {step}' for step in range(1, len(merges) + 1)],
)
display(data.head())

Unnamed: 0,row label 1,row label 2,distance,no. of items in clust.
cluster 1,0.0,1.0,0.3,2.0
cluster 2,2.0,3.0,0.45,2.0
cluster 3,4.0,5.0,0.8,4.0


In [63]:
# ... or more simply
# ------
# Simple
# ------
import numpy as np

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

import matplotlib.pyplot as plt

# Pairwise dissimilarities between the four observations (symmetric, zero diagonal).
dis_mat = np.array([
    [0.0, 0.3, 0.4,  0.7],
    [0.3, 0.0, 0.5,  0.8],
    [0.4, 0.5, 0.0,  0.45],
    [0.7, 0.8, 0.45, 0.0],
])
# Convert the square matrix to the condensed vector form that is passed to linkage().
dists = squareform(dis_mat)
linkage_matrix = linkage(dists, method="complete")

observation_names = [str(number) for number in range(1, 5)]
fig = plt.figure(figsize=(15,8))
dendrogram(linkage_matrix, labels=observation_names)
plt.title("Complete Linkage")
plt.savefig("Images/Exercises/complete_linkage.png")
plt.close()

<details><summary>Click here for diagram</summary>
<img src="Images/Exercises/complete_linkage.png">
</details>

b. Suppose we cut the dendrogram from question (a) such that there are two clusters. Which observations are in which cluster?

<details><summary>Click here for answer</summary>
In this case, we have clusters (1,2) and (3,4).
</details>

c. Sketch or code a diagram that results from hierarchically clustering these four observations using __single__ linkage.

<details><summary>Click here for working out</summary>

Beginning with 
$$\begin{bmatrix} 
& 0.3 & 0.4 & 0.7 \\
0.3 & & 0.5 & 0.8 \\
0.4 & 0.5 & & 0.45 \\
0.7 & 0.8 & 0.45 & \\
\end{bmatrix}$$

we see that 0.3 is the minimum dissimilarity, so we fuse observations 1 and 2 to form the cluster (1,2) at height 0.3. This leaves our dissimilarity matrix as:
$$\begin{bmatrix} 
& 0.4 & 0.7 \\
0.4 & & 0.45 \\
0.7 & 0.45 & \\
\end{bmatrix}$$
because we are recording the _smallest_ dissimilarities. 
    
We now see that the minimum dissimilarity is 0.4, so we fuse cluster (1,2) and observation 3 to form cluster ((1,2),3) at height 0.4. We now have the new dissimilarity matrix:
$$\begin{bmatrix} 
& 0.45 \\
0.45 & \\
\end{bmatrix}$$
It remains to fuse cluster ((1,2),3) and observation 4 to form cluster (((1,2),3),4) at height 0.45.
    
</details>

In [69]:
# Same condensed dissimilarities as before, now merged with single linkage.
linkage_matrix = linkage(dists, method="single")

observation_names = [str(number) for number in range(1, 5)]
fig = plt.figure(figsize=(15,8))
dendrogram(linkage_matrix, labels=observation_names)
plt.title("Single Linkage")
plt.savefig("Images/Exercises/single_linkage.png")
plt.close()

<details><summary>Click here for diagram</summary>
<img src="Images/Exercises/single_linkage.png">
</details>

d. Suppose we cut the dendrogram from question (c) such that there are two clusters. Which observations are in which cluster?

<details><summary>Click here for answer</summary>
In this case, we have clusters ((1,2),3) and (4).
</details>

In [9]:
# Export this notebook to HTML with nbconvert (run via IPython's `!` shell escape).
# PDF selects the target: True writes into ./PDF_Prep, False writes into the
# current directory. Both branches pass --TemplateExporter.exclude_input=True,
# which asks nbconvert to leave the code-cell inputs out of the export.
# NOTE(review): the input is Clustering_Exercises.ipynb but both --output names
# start with "Trees_" -- looks like a leftover from another week's notebook;
# confirm the intended output file names.
PDF = False

if PDF:
    # For pdf conversion
    !jupyter nbconvert Clustering_Exercises.ipynb \
        --to html \
        --output-dir ./PDF_Prep \
        --output Trees_Exercises_Answers \
        --template classic \
        --TemplateExporter.exclude_input=True
else:
    # Create HTML document - need to run it a few times
    # as its a little unpredictable
    !jupyter nbconvert Clustering_Exercises.ipynb \
        --to html \
        --output-dir . \
        --output Trees_Exercises \
        --template classic \
        --TemplateExporter.exclude_input=True

[NbConvertApp] Converting notebook Trees_Exercises.ipynb to html
[NbConvertApp] Writing 498016 bytes to Trees_Exercises.html
