# Hierarchical Clustering

In [None]:
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np

from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler

from scipy.cluster.hierarchy import dendrogram

## Select dataset

In [None]:
# Names of the synthetic datasets on offer; `selection` is an index into this list.
dataset = ['circles', 'moons', 'blobs']
selection = 2
n_samples = 1500

# One factory per dataset name. Wrapping each call in a lambda means only the
# chosen generator actually runs.
_generators = {
    'circles': lambda: datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05, random_state=170),
    'moons': lambda: datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=170),
    'blobs': lambda: datasets.make_blobs(n_samples=n_samples, random_state=170),
}

chosen = dataset[selection]
if chosen in _generators:
    X, y = _generators[chosen]()

# Quick look at the raw (unscaled) points.
plt.scatter(X[:, 0], X[:, 1], s=10)
plt.title(chosen);

## Select linkage

### Single linkage
Uses the minimum of the distances between all observations of the two sets.

### Average linkage
Uses the average of the distances of each observation of the two sets.

### Complete linkage
Uses the maximum of the distances between all observations of the two sets.

### Ward linkage
Minimizes the variance of the clusters being merged.

In [None]:
# (scikit-learn linkage identifier, human-readable label) pairs. Keeping them
# together guarantees the two lists below stay aligned index-for-index.
_LINKAGE_OPTIONS = [
    ('single', 'Single linkage'),
    ('average', 'Average linkage'),
    ('complete', 'Complete linkage'),
    ('ward', 'Ward linkage'),
]
linkages = [identifier for identifier, _ in _LINKAGE_OPTIONS]
linkage_names = [label for _, label in _LINKAGE_OPTIONS]

# Index into `linkages` / `linkage_names`, and the number of clusters to extract.
selection_linkage = 3
n_clusters = 3

## Plot clusters

In [None]:
# Standardize the features before clustering; the scatter plot below still uses
# the original coordinates in X.
X_standard = StandardScaler().fit_transform(X)
algorithm = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage=linkages[selection_linkage])
algorithm.fit(X_standard)
# AgglomerativeClustering offers no predict(); the assignments of the fitted
# samples are read from labels_ instead.
y_pred = algorithm.labels_.astype(int)
colors = ["#377eb8", "#ff7f00", "#4daf4a", "#f781bf", "#a65628", "#984ea3", "#999999", "#e41a1c", "#dede00"]
# Wrap around the palette so choosing more clusters than there are colors
# reuses colors instead of raising IndexError.
point_colors = [colors[label % len(colors)] for label in y_pred]
plt.scatter(X[:, 0], X[:, 1], s=10, color=point_colors)
plt.title(f'{dataset[selection]} - {linkage_names[selection_linkage]}');

## Plot dendrogram

In [None]:
# Passed as `p` to the dendrogram together with truncate_mode="level" below;
# it limits how many levels of the merge tree are drawn.
truncate_level = 2

def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted agglomerative clustering model.

    Builds a SciPy-style linkage matrix from the model's merge tree, with one
    row per merge: the two child indices, the merge distance, and the number
    of original samples in the merged cluster. The matrix is then handed to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_``, ``distances_`` and
        ``labels_`` (e.g. ``sklearn.cluster.AgglomerativeClustering``).
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    dict
        The value returned by ``scipy.cluster.hierarchy.dendrogram``, so
        callers can inspect e.g. the leaf order.

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    if not hasattr(model, "distances_"):
        raise AttributeError(
            "model has no 'distances_' attribute; fit AgglomerativeClustering "
            "with distance_threshold set or compute_distances=True"
        )

    children = np.asarray(model.children_)
    n_samples = len(model.labels_)

    # Number of original samples under each merge node.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        # A child index below n_samples is a single original sample. A larger
        # index refers to the cluster created by merge (index - n_samples),
        # whose count has already been filled in by an earlier iteration.
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [children, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram and hand its description back.
    return dendrogram(linkage_matrix, **kwargs)

# Refit with distance_threshold=0 and n_clusters=None so the model provides the
# distances_ attribute that plot_dendrogram reads (per the scikit-learn docs
# this builds the full merge tree).
algorithm = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage=linkages[selection_linkage])
algorithm.fit(X_standard)
plt.figure(figsize=(10, 4))
# Only the top `truncate_level` levels of the tree are drawn.
plot_dendrogram(algorithm, truncate_mode="level", p=truncate_level)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.title(f'Dendrogram - {dataset[selection]} - {linkage_names[selection_linkage]}, Truncate level = {truncate_level}');