This exercise is adapted from [this](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/) tutorial.

In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage

In [None]:
# Build a toy data set of two Gaussian blobs: `a` (100 points) and `b` (50 points).
# The seed is fixed so every run draws exactly the same samples.

np.random.seed(4711)

# both blobs share one covariance matrix and differ only in their centres
shared_cov = [[3, 1], [1, 4]]
a = np.random.multivariate_normal(mean=[10, 0], cov=shared_cov, size=100)
b = np.random.multivariate_normal(mean=[0, 20], cov=shared_cov, size=50)

# stack the blobs row-wise into a single (n_samples, 2) array
X = np.vstack([a, b])

print(X.shape)

In [None]:
# visualize data: scatter plot of the two features of X

plt.figure(figsize=(6, 6))

# The coordinate vectors are passed by keyword: seaborn 0.12+ no longer accepts
# x and y positionally and raises a TypeError for the positional form.
sns.scatterplot(x=X[:, 0], y=X[:, 1], s=80)
plt.show()

In [None]:
# Build the hierarchical-clustering linkage matrix for X
# using Ward's method as the merge criterion.

Z = linkage(X, method='ward')

In [None]:
# dimensions of the linkage matrix: one row per merge step
Z.shape

In [None]:
# the first merge: row 0 of Z records the first pair of clusters that was joined
# (row layout: [idx1, idx2, dist, sample_count])

Z[0]

The format is [idx1, idx2, dist, sample_count]. Here, indices 52 and 53 had a distance of 0.04151 between them, and they were merged to create a cluster with a total of 2 samples.

In [None]:
# the first 20 merges

# suppress=True makes NumPy print floats in fixed-point instead of scientific
# notation; the option is global, so it also affects arrays shown in later cells
np.set_printoptions(suppress=True)

Z[:20]

In [None]:
# Draw the complete dendrogram on an explicitly created axes.

fig, ax = plt.subplots(figsize=(25, 15))

dendrogram(Z, leaf_rotation=90, leaf_font_size=8, ax=ax)

ax.set_title('Hierarchical Clustering Dendrogram', fontsize=14)
ax.set_xlabel('Sample index', fontsize=14)
ax.set_ylabel('Distance', fontsize=14)
plt.show()

In [None]:
# the last four merges, i.e. the final four rows of the linkage matrix

Z[-4:]

In [None]:
# truncated dendrogram: collapse the tree so only the last p merges are drawn

plt.figure(figsize=(9, 6))

dendrogram(Z,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=12,  # number of merged clusters to keep
           leaf_rotation=90,
           leaf_font_size=12)

plt.title('Hierarchical Clustering Dendrogram', fontsize=12)
# collapsed leaves are labelled with their sample count in parentheses,
# un-collapsed leaves with their sample index
plt.xlabel('Sample index or (cluster size)', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.show()

In [None]:
# set cut-off to 50 and mark it on the truncated dendrogram

max_d = 50  # max_d as in max_distance

dendrogram(Z,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=12,  # number of merged clusters to keep
           leaf_rotation=90,
           leaf_font_size=12)

plt.title('Hierarchical Clustering Dendrogram', fontsize=12)
# collapsed leaves are labelled with their sample count in parentheses,
# un-collapsed leaves with their sample index
plt.xlabel('Sample index or (cluster size)', fontsize=12)
plt.ylabel('Distance', fontsize=12)

# dashed horizontal line at the cut-off distance
plt.axhline(y=max_d, color='black', linestyle='--')

plt.show()

This cut-off value would give us two clusters.

In [None]:
# set cut-off to 16 and mark it on the truncated dendrogram

max_d = 16  # max_d as in max_distance

dendrogram(Z,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=12,  # number of merged clusters to keep
           leaf_rotation=90,
           leaf_font_size=12)

plt.title('Hierarchical Clustering Dendrogram', fontsize=12)
# collapsed leaves are labelled with their sample count in parentheses,
# un-collapsed leaves with their sample index
plt.xlabel('Sample index or (cluster size)', fontsize=12)
plt.ylabel('Distance', fontsize=12)

# dashed horizontal line at the cut-off distance
plt.axhline(y=max_d, color='black', linestyle='--')

plt.show()

This cut-off value would give us four clusters.

In [None]:
# retrieve cluster numbers (assignments) based on max_d
# (fcluster is imported with the other dependencies in the notebook's import cell)

max_d = 50

# criterion='distance' cuts the tree at height max_d: samples end up in the same
# flat cluster when their cophenetic distance does not exceed max_d
clusters = fcluster(Z, max_d, criterion='distance')

clusters

In [None]:
# Flat cluster assignments when the number of clusters is capped at k
# instead of cutting at a distance threshold.

k = 2

fcluster(Z, t=k, criterion='maxclust')

In [None]:
# visualize clusters: same scatter plot as before, coloured by flat-cluster label

plt.figure(figsize=(6, 6))

# The coordinate vectors are passed by keyword: seaborn 0.12+ no longer accepts
# x and y positionally and raises a TypeError for the positional form.
# The palette lists one colour per cluster label found in `clusters`.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=clusters, s=80, legend=False,
                palette=['royalblue', 'tomato'])

plt.show()