In [None]:
# Colab-only cell: google.colab is only importable inside a Colab runtime.
# files.upload() prompts for local files and saves them into the runtime's
# working directory; the recorded output of this cell shows
# ptbxl_database.csv and scp_statements.csv being saved.
# NOTE(review): neither file is read by the clustering code visible in this
# notebook section, which loads iris instead - confirm they are still needed.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [None]:
# Colab-only cell: second upload prompt. The recorded output of this cell shows
# 01000_lr.dat, 01000_lr.hea and 01001_lr.dat being saved.
# NOTE: this rebinds `uploaded`, so the value returned by the earlier upload
# cell is no longer reachable through that name.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [11]:
	# a5
"""
clustering_framework.py

Implements modular clustering with:
- KMeans
- Agglomerative (Hierarchical)
- DBSCAN
- OPTICS

Evaluation with silhouette, Davies-Bouldin, Calinski-Harabasz scores.
"""

import numpy as np
import pandas as pd
import warnings

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS

# NOTE: this silences every Python warning for the rest of the session, not
# only the ones raised by the clustering calls below. Narrow the filter (by
# category or module) if library diagnostics need to stay visible.
warnings.filterwarnings("ignore")


# ---------------- Utility Functions ----------------
def load_dataset(kind="iris"):
    """
    Return ``(feature_matrix, feature_names)`` for the requested dataset.

    Only ``kind="iris"`` is implemented, backed by ``load_iris()``. Any other
    value raises ``ValueError``; plug your own CSV or real-dataset loader in
    here when you need one.
    """
    if kind == "iris":
        iris_bunch = load_iris()
        return iris_bunch.data, iris_bunch.feature_names

    # Nothing else is wired up yet.
    raise ValueError("Unknown dataset kind. Implement your CSV loader here.")


def scale_data(X):
    """
    Fit a ``StandardScaler`` on X and transform X with it.

    Returns a ``(scaled_X, fitted_scaler)`` pair so the caller can reuse the
    same scaler later.
    """
    standardizer = StandardScaler()
    X_standardized = standardizer.fit_transform(X)
    return X_standardized, standardizer


def evaluate_clustering(X, labels, exclude_noise=False):
    """
    Evaluate clustering using internal validation metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix the labels were computed on, one row per sample.
    labels : array-like
        Cluster label of each sample.
    exclude_noise : bool, default False
        When True, samples labelled -1 (the noise label scikit-learn's DBSCAN
        and OPTICS assign) are dropped before scoring, so noise is not scored
        as if it were a cluster of its own. The default keeps every sample,
        which matches the previous behaviour of this function.

    Returns
    -------
    dict
        Keys "Silhouette", "DaviesBouldin" and "CalinskiHarabasz". All three
        values are NaN when the labelling cannot be scored: fewer than two
        distinct labels, or as many distinct labels as samples.
    """
    labels = np.asarray(labels)

    if exclude_noise:
        keep = labels != -1
        X = np.asarray(X)[keep]
        labels = labels[keep]

    n_samples = labels.shape[0]
    n_labels = np.unique(labels).shape[0]

    # The scikit-learn scorers need 2 <= n_labels <= n_samples - 1 and raise
    # ValueError otherwise; report NaN for those degenerate labellings so one
    # bad model does not abort a whole comparison run.
    if n_labels < 2 or n_labels >= n_samples:
        return {"Silhouette": np.nan, "DaviesBouldin": np.nan, "CalinskiHarabasz": np.nan}

    return {
        "Silhouette": silhouette_score(X, labels),
        "DaviesBouldin": davies_bouldin_score(X, labels),
        "CalinskiHarabasz": calinski_harabasz_score(X, labels),
    }


# ---------------- Clustering Algorithms ----------------
def run_kmeans(X, n_clusters=3, random_state=42):
    """
    Fit KMeans on X and return ``(fitted_estimator, labels)``.

    ``n_init`` is pinned to 10; ``n_clusters`` and ``random_state`` are
    forwarded as given.
    """
    kmeans = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        random_state=random_state,
    )
    assignments = kmeans.fit_predict(X)
    return kmeans, assignments


def run_agglomerative(X, n_clusters=3, linkage="ward"):
    """
    Fit hierarchical (agglomerative) clustering on X.

    Returns ``(fitted_estimator, labels)``; ``n_clusters`` and ``linkage`` are
    forwarded to ``AgglomerativeClustering`` unchanged.
    """
    hierarchy = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    assignments = hierarchy.fit_predict(X)
    return hierarchy, assignments


def run_dbscan(X, eps=0.5, min_samples=5):
    """
    Fit DBSCAN on X and return ``(fitted_estimator, labels)``.

    ``eps`` and ``min_samples`` are forwarded to ``DBSCAN`` unchanged.
    """
    dbscan = DBSCAN(
        eps=eps,
        min_samples=min_samples,
    )
    assignments = dbscan.fit_predict(X)
    return dbscan, assignments


def run_optics(X, min_samples=5, xi=0.05, min_cluster_size=0.1):
    """
    Fit OPTICS on X and return ``(fitted_estimator, labels)``.

    ``min_samples``, ``xi`` and ``min_cluster_size`` are forwarded to
    ``OPTICS`` unchanged.
    """
    optics = OPTICS(
        min_samples=min_samples,
        xi=xi,
        min_cluster_size=min_cluster_size,
    )
    assignments = optics.fit_predict(X)
    return optics, assignments


# ---------------- Runner ----------------
def run_all_clustering(X, n_clusters=3, dbscan_eps=0.5, dbscan_min_samples=5,
                       optics_min_samples=5):
    """
    Fit every supported clustering algorithm on X and tabulate its scores.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to each algorithm and to
        ``evaluate_clustering``.
    n_clusters : int, default 3
        Number of clusters requested from KMeans and Agglomerative (Ward).
    dbscan_eps : float, default 0.5
        ``eps`` forwarded to ``run_dbscan``.
    dbscan_min_samples : int, default 5
        ``min_samples`` forwarded to ``run_dbscan``.
    optics_min_samples : int, default 5
        ``min_samples`` forwarded to ``run_optics``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm, in the order KMeans, Agglomerative (Ward),
        DBSCAN, OPTICS, with columns "Model", "Silhouette", "DaviesBouldin"
        and "CalinskiHarabasz".
    """
    # (display name, runner, keyword arguments) - one entry per algorithm so
    # that adding a model is a one-line change instead of a copied stanza.
    plan = [
        ("KMeans", run_kmeans, {"n_clusters": n_clusters}),
        ("Agglomerative (Ward)", run_agglomerative,
         {"n_clusters": n_clusters, "linkage": "ward"}),
        ("DBSCAN", run_dbscan, {"eps": dbscan_eps, "min_samples": dbscan_min_samples}),
        ("OPTICS", run_optics, {"min_samples": optics_min_samples}),
    ]

    rows = []
    for model_name, runner, kwargs in plan:
        # Only the labels are needed for scoring; the fitted estimator is dropped.
        _, labels = runner(X, **kwargs)
        rows.append({"Model": model_name, **evaluate_clustering(X, labels)})

    return pd.DataFrame(rows)[["Model", "Silhouette", "DaviesBouldin", "CalinskiHarabasz"]]


# ---------------- Demo ----------------
if __name__ == "__main__":
    # End-to-end demo: load iris, standardize it, score all four clustering
    # algorithms, then print the table and write it to a CSV file.

    # Load dataset: X is the feature matrix, features the feature names
    # (features is not used again in this block).
    X, features = load_dataset("iris")
    # Standardize the features; the fitted scaler is kept alongside the data
    # but is not used again in this block.
    X_scaled, scaler = scale_data(X)

    # Run all clustering algorithms on the scaled features; one score row per model
    results_df = run_all_clustering(X_scaled, n_clusters=3)

    print("\n=== Clustering Evaluation Results ===")
    # to_string(index=False) prints the whole table without the row index
    print(results_df.to_string(index=False))

    # Save to CSV (relative path, so the file lands in the current working directory)
    results_df.to_csv("clustering_results.csv", index=False)
    print("\nResults saved to clustering_results.csv")



=== Clustering Evaluation Results ===
               Model  Silhouette  DaviesBouldin  CalinskiHarabasz
              KMeans    0.459948       0.833595        241.904402
Agglomerative (Ward)    0.446689       0.803467        222.719164
              DBSCAN    0.356516       7.124057         84.510330
              OPTICS    0.198312       1.365341        112.079219

Results saved to clustering_results.csv
