In [3]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def _build_agglomerative(n_clusters, linkage_method, metric):
    """Create an AgglomerativeClustering that works across scikit-learn versions."""
    import inspect

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 dropped the old
    # keyword, so pick whichever name the installed version accepts.
    accepted = inspect.signature(AgglomerativeClustering).parameters
    metric_kwarg = 'metric' if 'metric' in accepted else 'affinity'
    return AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage_method,
        **{metric_kwarg: metric},
    )


def analyze_hierarchical_clustering(
    data_path,
    max_k=10,
    linkage_method='ward',
    affinity='euclidean',
    show_dendrogram=True,
    auto_cluster=True,
    return_model=False
):
    """
    Perform hierarchical clustering with optional dendrogram and silhouette analysis.

    Parameters:
        data_path: Passed as-is to `preprocess` (defined elsewhere in the
            project) to obtain the data to cluster. Presumably a dataset
            name or path that yields numeric, scaled samples -- confirm
            against `preprocess`.
        max_k (int): Max number of clusters to test with silhouette. When
            `auto_cluster` is False it is used directly as the number of
            clusters.
        linkage_method (str): 'ward', 'average', 'complete', 'single'
        affinity (str): Distance metric (ignored if 'ward', which always
            uses 'euclidean')
        show_dendrogram (bool): Show dendrogram plot
        auto_cluster (bool): Choose best k automatically using silhouette score
        return_model (bool): If True, return clustering model as well

    Returns:
        labels (ndarray): Cluster labels
        best_k (int): Chosen number of clusters
        model (optional): Fitted AgglomerativeClustering model
    """
    data = preprocess(data_path)

    # Ward linkage is only defined for Euclidean distances; both scipy and
    # scikit-learn reject any other metric, so honour the documented
    # "ignored if 'ward'" contract here instead of crashing further down.
    metric = 'euclidean' if linkage_method == 'ward' else affinity

    # 1. Plot dendrogram
    if show_dendrogram:
        Z = linkage(data, method=linkage_method, metric=metric)
        plt.figure(figsize=(10, 6))
        dendrogram(Z)
        plt.title(f"Dendrogram (Linkage: {linkage_method})")
        plt.xlabel("Sample Index")
        plt.ylabel("Distance")
        plt.tight_layout()
        plt.show()

    # 2. Determine best number of clusters
    best_k = 2
    final_model = None
    final_labels = None
    if auto_cluster:
        print("Silhouette scores:")
        best_score = -1
        # silhouette_score needs between 2 and n_samples - 1 distinct labels,
        # so never try more clusters than that.
        upper_k = min(max_k, len(data) - 1)
        for k in range(2, upper_k + 1):
            model = _build_agglomerative(k, linkage_method, metric)
            labels = model.fit_predict(data)
            score = silhouette_score(data, labels)
            print(f"k={k} → Silhouette Score = {score:.3f}")
            if score > best_score:
                best_score = score
                best_k = k
                # Keep the winner: the fit is deterministic, so refitting
                # later would only repeat the same work.
                final_model = model
                final_labels = labels
    else:
        best_k = max_k  # user supplies the number directly

    # 3. Final clustering (only needed when the search did not already fit it)
    if final_model is None:
        final_model = _build_agglomerative(best_k, linkage_method, metric)
        final_labels = final_model.fit_predict(data)

    if return_model:
        return final_labels, best_k, final_model
    return final_labels, best_k

In [None]:
# Run the analysis on the 'customer_info' dataset with every option left at
# its default; the returned tuple is left as the cell's displayed result.
analyze_hierarchical_clustering(data_path='customer_info')