In [2]:
!pip install hdbscan


Defaulting to user installation because normal site-packages is not writeable


In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (KMeans, MiniBatchKMeans, DBSCAN, AgglomerativeClustering,
                             MeanShift, SpectralClustering, AffinityPropagation, Birch, OPTICS)
from sklearn.mixture import GaussianMixture
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import seaborn as sns
import pandas as pd
import time
import os


ModuleNotFoundError: No module named 'hdbscan'

In [None]:
def generate_datasets(n_samples=1500, random_state=42):
    """Create the toy 2-D datasets used to compare clustering algorithms.

    Args:
        n_samples: Total sample budget. Each generator (moons, circles) is
            asked for ``n_samples // 2`` points, so the combined dataset holds
            both halves stacked together.
        random_state: Seed forwarded to both generators.

    Returns:
        dict mapping dataset name to an ``(X, y)`` tuple, in the order
        ``'moons'``, ``'circles'``, ``'combined_moons_circles'``.
    """
    per_shape = n_samples // 2
    moons = make_moons(n_samples=per_shape, noise=0.05, random_state=random_state)
    circles = make_circles(n_samples=per_shape, factor=0.5, noise=0.05, random_state=random_state)

    bundle = {'moons': moons, 'circles': circles}

    moon_points, moon_labels = moons
    circle_points, circle_labels = circles
    # Circle labels are offset by 2 so they stay distinct from the moon labels
    # once both label vectors are joined.
    bundle['combined_moons_circles'] = (
        np.concatenate([moon_points, circle_points], axis=0),
        np.concatenate([moon_labels, circle_labels + 2]),
    )
    return bundle

# Build the datasets once with the default size and seed; later cells iterate over this dict.
datasets = generate_datasets()


In [None]:
# Preview each dataset in its own panel, coloured by the generator's labels.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for panel, dataset_key in zip(axes, datasets):
    points, truth = datasets[dataset_key]
    xs, ys = points[:, 0], points[:, 1]
    panel.scatter(xs, ys, c=truth, cmap='viridis', s=40)
    panel.set_title(dataset_key.capitalize())
    panel.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
def get_clustering_algorithms(n_clusters=3, random_state=42):
    """Return the clustering estimators to benchmark, keyed by display name.

    Args:
        n_clusters: Number of clusters (or mixture components) requested from
            every algorithm that takes it as a parameter.
        random_state: Seed passed to every algorithm that accepts one.

    Returns:
        dict mapping algorithm name to an unfitted estimator instance.
    """
    return {
        'KMeans': KMeans(n_clusters=n_clusters, random_state=random_state),
        'MiniBatchKMeans': MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state),
        'AffinityPropagation': AffinityPropagation(random_state=random_state),
        'MeanShift': MeanShift(bandwidth=2),
        'SpectralClustering': SpectralClustering(n_clusters=n_clusters, random_state=random_state),
        'Ward': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward'),
        # AgglomerativeClustering defaults to linkage='ward', which would make this
        # entry an exact duplicate of 'Ward'; average linkage makes it a distinct run.
        'AgglomerativeClustering': AgglomerativeClustering(n_clusters=n_clusters, linkage='average'),
        'DBSCAN': DBSCAN(eps=0.3, min_samples=5),
        'HDBSCAN': hdbscan.HDBSCAN(min_cluster_size=15),
        'OPTICS': OPTICS(min_samples=5, xi=0.05, min_cluster_size=0.1),
        'Birch': Birch(n_clusters=n_clusters),
        'GaussianMixture': GaussianMixture(n_components=n_clusters, random_state=random_state)
    }

# Instantiate the estimators once with the default settings; the benchmark loop iterates over this dict.
algorithms = get_clustering_algorithms()


In [None]:
def evaluate_clustering(X, labels, noise_label=-1):
    """Score a clustering with three internal validity metrics.

    Args:
        X: Array of samples, one row per sample.
        labels: Cluster label assigned to each row of ``X``.
        noise_label: Label that marks unclustered/noise points (density-based
            algorithms use -1). Those points are dropped before scoring so
            noise is not treated as a cluster of its own. Pass ``None`` to
            score every point as given.

    Returns:
        Tuple ``(silhouette, calinski_harabasz, davies_bouldin)``, or
        ``(None, None, None)`` when the labelling cannot be scored: fewer than
        two clusters remain, or every remaining sample is its own cluster.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    if noise_label is not None:
        keep = labels != noise_label
        X, labels = X[keep], labels[keep]

    n_clusters = len(np.unique(labels))
    # The metrics need at least 2 clusters and at most n_samples - 1 of them;
    # outside that range silhouette_score raises instead of returning a value.
    if n_clusters < 2 or n_clusters > len(labels) - 1:
        return None, None, None

    silhouette = silhouette_score(X, labels)
    calinski = calinski_harabasz_score(X, labels)
    davies = davies_bouldin_score(X, labels)
    return silhouette, calinski, davies


In [None]:
# Run every algorithm on every dataset and collect one record per successful run.
all_results = []

for dataset_name, (X, y_true) in datasets.items():
    print(f"\nProcessing: {dataset_name}")
    # Standardise the features before clustering; each dataset gets its own scaler.
    X_scaled = StandardScaler().fit_transform(X)

    for algo_name, algorithm in algorithms.items():
        print(f"  -> {algo_name}")
        # perf_counter is a monotonic high-resolution clock, suited to timing intervals.
        start = time.perf_counter()
        try:
            # Every estimator in `algorithms` exposes fit_predict, so no per-algorithm branch is needed.
            labels = algorithm.fit_predict(X_scaled)
            elapsed = time.perf_counter() - start

            silhouette, calinski, davies = evaluate_clustering(X_scaled, labels)

            # Density-based algorithms mark noise with -1; that label is not a cluster.
            cluster_ids = np.unique(labels)
            n_clusters = int(np.sum(cluster_ids != -1))

            all_results.append({
                'Dataset': dataset_name,
                'Algorithm': algo_name,
                'Time (s)': elapsed,
                'Clusters': n_clusters,
                'Silhouette': silhouette,
                'Calinski': calinski,
                'Davies': davies
            })
        except Exception as e:
            # Best effort: report the failure and continue with the remaining algorithms.
            print(f"Error with {algo_name}: {e}")


In [None]:
# Collect all benchmark records into one table.
results_df = pd.DataFrame(all_results)

# Keep the metric columns numeric (unscored runs become NaN). Writing the "N/A"
# placeholder into the stored frame would turn these columns into strings and
# break the plots and the score normalisation further down.
for metric in ('Silhouette', 'Calinski', 'Davies'):
    if metric in results_df:
        results_df[metric] = pd.to_numeric(results_df[metric], errors='coerce')

# The placeholder is applied only to the copy that is displayed.
results_df.head().fillna("N/A")


In [None]:
# One figure per dataset with three stacked bar charts, one per metric.
# Each entry: (results column, seaborn palette, title template).
metric_panels = [
    ('Silhouette', 'Blues_d', 'Silhouette Score - {}'),
    ('Calinski', 'Greens_d', 'Calinski-Harabasz Score - {}'),
    ('Davies', 'Reds_d', 'Davies-Bouldin Score - {} (lower is better)'),
]

for dataset_name in datasets.keys():
    # Work on a copy so the coercion below never touches results_df.
    subset = results_df[results_df['Dataset'] == dataset_name].copy()
    fig, axs = plt.subplots(3, 1, figsize=(14, 12))

    for ax, (metric, palette, title) in zip(axs, metric_panels):
        # Unscored runs may carry None or a text placeholder; plot them as missing bars.
        subset[metric] = pd.to_numeric(subset[metric], errors='coerce')
        sns.barplot(data=subset, x='Algorithm', y=metric, ax=ax, palette=palette)
        ax.set_title(title.format(dataset_name))
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
# Rank the algorithms on each dataset with one score that averages the three
# metrics after rescaling them so that, within a dataset, larger means better.
normalized = results_df.copy()

# Unscored runs may carry None or a text placeholder; coerce to NaN so the
# arithmetic below works and those runs simply drop out of the ranking.
for metric in ('Silhouette', 'Calinski', 'Davies'):
    normalized[metric] = pd.to_numeric(normalized[metric], errors='coerce')

for dataset in normalized['Dataset'].unique():
    mask = normalized['Dataset'] == dataset
    # Silhouette and Calinski-Harabasz: higher is better, scale by the dataset's best value.
    # NOTE(review): silhouette can be negative; dividing by the maximum only
    # preserves the ordering when that maximum is positive.
    normalized.loc[mask, 'Silhouette'] /= normalized.loc[mask, 'Silhouette'].max()
    normalized.loc[mask, 'Calinski'] /= normalized.loc[mask, 'Calinski'].max()
    # Davies-Bouldin: lower is better, so invert it relative to the dataset's best (smallest) value.
    normalized.loc[mask, 'Davies'] = normalized.loc[mask, 'Davies'].min() / normalized.loc[mask, 'Davies']

normalized['Combined_Score'] = (normalized['Silhouette'] + normalized['Calinski'] + normalized['Davies']) / 3

plt.figure(figsize=(16, 6))
sns.barplot(data=normalized, x='Dataset', y='Combined_Score', hue='Algorithm', palette='viridis')
plt.title("Best Algorithm per Dataset (Combined Score)")
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Display best algorithms
for dataset in normalized['Dataset'].unique():
    scored = normalized[normalized['Dataset'] == dataset].dropna(subset=['Combined_Score'])
    if scored.empty:
        # No run on this dataset could be scored; report that instead of failing on an empty lookup.
        print(f"{dataset}: no algorithm produced a valid score")
        continue
    best = scored.nlargest(1, 'Combined_Score')
    print(f"{dataset}: {best['Algorithm'].values[0]} (Score: {best['Combined_Score'].values[0]:.3f})")
