# 05 – Unsupervised Learning (Clustering)

Approach:
- KMeans with Elbow + Silhouette
- Hierarchical Agglomerative (dendrogram)
- Compare clusters vs. target (purity, Adjusted Rand Index)

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Load the dataset; the label column is 'target' when present, otherwise 'num'.
df = pd.read_csv(Path('../data/heart_disease.csv'))
target_col = 'target' if 'target' in df.columns else 'num'
y = df[target_col]
X = df.drop(columns=[target_col])

# Split features by dtype. Testing for numeric dtypes (instead of dtype == 'object')
# keeps string/category columns out of the median-impute + scale branch even when
# pandas does not store them as 'object'.
numeric = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical = [c for c in X.columns if c not in numeric]

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Non-numeric columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric),
    ('cat', categorical_transformer, categorical),
])

# Encoded feature matrix used by every clustering step below.
X_enc = preprocessor.fit_transform(X)

# KMeans Elbow: collect the fitted model's inertia for every candidate k
K_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_enc).inertia_
    for k in K_range
]

plt.plot(list(K_range), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('KMeans Elbow')
plt.show()

## 1. Silhouette Scores

In [None]:
# Silhouette score of the KMeans labelling for each k in K_range
sil_scores = []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_k = km.fit_predict(X_enc)
    sil_scores.append(silhouette_score(X_enc, labels_k))

plt.plot(list(K_range), sil_scores, marker='o')
plt.xlabel('k')
plt.ylabel('Silhouette')
plt.title('Silhouette vs k')
plt.show()

## 2. Fit Chosen K & Compare to True Labels

In [None]:
from collections import Counter
from sklearn.metrics import confusion_matrix
import numpy as np

best_k = 2  # often heart disease dataset approximates binary separation
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_enc)

# Print the cluster sizes explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print('Cluster sizes:', Counter(clusters))

# Rows are true labels, columns are cluster ids; cluster numbering is arbitrary,
# so the diagonal is not expected to line up with the labels.
cm = confusion_matrix(y, clusters)  # order may not align

# Purity: each cluster is credited with its most common true label.
purity = cm.max(axis=0).sum() / cm.sum()
print(f'Purity: {purity:.3f}')
cm

## 3. Adjusted Rand Index

In [None]:
# Chance-adjusted agreement between the true labels and the KMeans assignment
ari = adjusted_rand_score(labels_true=y, labels_pred=clusters)
ari

## 4. Hierarchical Clustering Dendrogram

In [None]:
# For dendrogram, sample to limit size if necessary.
# A seeded generator keeps the sampled rows (and so the plot) reproducible,
# in line with random_state=42 used for KMeans.
n_rows = X_enc.shape[0]
rng = np.random.default_rng(42)
sample_idx = rng.choice(n_rows, size=min(300, n_rows), replace=False)
X_sample = X_enc[sample_idx]

# linkage needs a dense array; toarray() yields a plain ndarray, whereas
# todense() returns the discouraged np.matrix type.
X_dense = X_sample.toarray() if hasattr(X_sample, 'toarray') else np.asarray(X_sample)
Z = sch.linkage(X_dense, method='ward')

plt.figure(figsize=(10, 5))
sch.dendrogram(Z, truncate_mode='level', p=4)
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.show()

## Notes
- Clusters rarely align perfectly with the diagnostic label; ARI quantifies the agreement.
- Standardize features: otherwise features on larger scales can dominate the distance computation.
- Consider dimensionality reduction (PCA) before clustering to reduce noise.