# AutoML: Clustering (K-Means)

This template clusters samples using K-Means.

- Parameters are injected via Papermill.
- Loads dataset via `load_dataset_as_dataframe()`.
- Produces elbow curve + silhouette score + PCA scatter plot.



In [None]:
# Parameters (Papermill)
# The values below are defaults; Papermill overrides them when it injects
# parameters at execution time. Keep them as plain literal assignments.

dataset_id = ""  # UUID string; passed to load_dataset_as_dataframe()
feature_columns = []  # list[str]; empty means infer numeric columns
n_clusters = 5  # cluster count used for the final KMeans fit
max_k = 10  # upper bound on k handed to plot_elbow_curve()
random_state = 42  # seed passed to both KMeans and PCA



In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

from amprenta_rag.notebook.automl_helpers import load_dataset_as_dataframe, plot_elbow_curve



In [None]:
# Load the dataset and build the feature matrix used by every later cell.
df = load_dataset_as_dataframe(dataset_id)

if feature_columns:
    # Explicit selection; pandas raises KeyError for unknown column names.
    X = df[feature_columns]
else:
    X = df.select_dtypes(include=["number"])  # all numeric columns

if X.shape[1] == 0:
    raise ValueError("No numeric feature columns found")

# Treat +/-inf as missing, then drop every row with a missing feature value.
X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

# Fail early with an actionable message instead of letting StandardScaler
# raise a generic "0 sample(s)" error further down.
if X.shape[0] == 0:
    raise ValueError(
        "No rows left after dropping rows with NaN/inf values in the feature columns"
    )

# Standardise features so no single column dominates the distance metric.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow plot (k=1..max_k)
# KMeans cannot fit more clusters than there are samples, so cap the sweep.
# NOTE(review): assumes plot_elbow_curve fits KMeans for each k up to max_k;
# confirm against the helper.
elbow_max_k = min(int(max_k), X_scaled.shape[0])
if elbow_max_k < int(max_k):
    print(f"max_k reduced from {int(max_k)} to {elbow_max_k} (only {X_scaled.shape[0]} usable rows)")
fig_elbow = plot_elbow_curve(X_scaled, max_k=elbow_max_k)
fig_elbow



In [None]:
import matplotlib.pyplot as plt

# Fit the final K-Means model on the standardised features.
km = KMeans(n_clusters=int(n_clusters), n_init="auto", random_state=int(random_state))
labels = km.fit_predict(X_scaled)

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
# raises ValueError otherwise (e.g. n_clusters=1, or clusters that collapsed).
# Report NaN in that case so the PCA plot below is still produced.
n_found = int(np.unique(labels).size)
if 2 <= n_found <= len(labels) - 1:
    sil = float(silhouette_score(X_scaled, labels))
else:
    sil = float("nan")
    print(
        f"Silhouette score undefined for {n_found} cluster(s) "
        f"over {len(labels)} sample(s)"
    )
print("Silhouette score:", sil)

# Project to 2-D for plotting. PCA cannot extract more components than
# min(n_samples, n_features), so cap the request and zero-pad any missing
# axis to keep the scatter plot two-dimensional.
n_components = min(2, X_scaled.shape[0], X_scaled.shape[1])
pca = PCA(n_components=n_components, random_state=int(random_state))
xy = pca.fit_transform(X_scaled)
if xy.shape[1] < 2:
    xy = np.hstack([xy, np.zeros((xy.shape[0], 2 - xy.shape[1]))])

fig, ax = plt.subplots()
sc = ax.scatter(xy[:, 0], xy[:, 1], c=labels, cmap="tab10", s=20)
ax.set_title("KMeans clusters (PCA 2D)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig

