In [None]:
#PCA (Principal Component Analysis) is a way to compress and denoise embeddings by keeping only the directions (components) that carry the most variance
#Let's say we have a 48-dimension vector and we want to compress it to 20. PCA will create 20 new features, each a linear combination of the original 48

#RUN ENTIRE FILE TO GET UPDATED CLUSTERING

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import os
import numpy as np
import hdbscan
import joblib
import pandas as pd

In [9]:
#The commented-out embeddings below also include new images we processed without labels,
#but with model accuracy being only 82% it makes more sense to just use Galaxy10 for now
# emb = np.load("artifacts/embeddings/embeddings_all.npy")   #Comes from CreatingEmbeddings.ipynb
emb = np.load("artifacts/embeddings/galaxy10_embeddings.npy")
y   = np.load("artifacts/embeddings/galaxy10_labels.npy")       #Comes from CreatingEmbeddings.ipynb

#Every embedding row needs exactly one label; fail here rather than in a later cell
if emb.shape[0] != y.shape[0]:
    raise ValueError(
        f"Embeddings and labels are misaligned: {emb.shape[0]} embedding rows vs {y.shape[0]} labels"
    )
print(emb.shape, y.shape)


(17736, 48) (17736,)


In [10]:
OUT_MODELS = "artifacts/models"
N_COMPONENTS = 20   #Number of principal components to keep; also drives the saved PCA file name
os.makedirs(OUT_MODELS, exist_ok=True)

#Standardise the embedding dimensions before PCA
scaler = StandardScaler()
Z = scaler.fit_transform(emb)

#Project the scaled embeddings onto the top N_COMPONENTS principal components
pca = PCA(n_components=N_COMPONENTS, random_state=42)
Zp = pca.fit_transform(Z)

#Persist both fitted transformers (presumably so other notebooks can apply the same transform - TODO confirm)
joblib.dump(scaler, f"{OUT_MODELS}/scaler.pkl")
joblib.dump(pca, f"{OUT_MODELS}/pca{N_COMPONENTS}.pkl")

print("Zp shape:", Zp.shape, "Explained variance:", pca.explained_variance_ratio_.sum())

#What is explained variance? It is the fraction of the variance (information) in the higher-dimensional data that is retained after PCA.
#This is why a high explained variance is good: it tells us that a large amount of information was kept when compressing.
#But even with a high explained variance, the result can still contain noise, which can show up in clustering.
#In that case you can try lowering n_components even more.

Zp shape: (17736, 20) Explained variance: 0.9819521


In [11]:
#Cluster the PCA-reduced embeddings with HDBSCAN; points that fit no cluster get the label -1 (noise)
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=60,
    min_samples=10,
    metric="euclidean",
    prediction_data=True,
)

cluster_labels = clusterer.fit_predict(Zp)

probs = clusterer.probabilities_   # membership probabilities

#Outlier scores (higher = more outlier-like); None when the clusterer does not expose them
outlier_scores = getattr(clusterer, "outlier_scores_", None)

#Count the distinct cluster ids, leaving out the noise label -1
n_clusters = np.unique(cluster_labels[cluster_labels != -1]).size

print("Clusters:", n_clusters)
print("Noise fraction:", np.mean(cluster_labels == -1))
#The getattr above may return None, so only take min/max when scores are really there
if outlier_scores is None:
    print("Outlier proxy range: unavailable (clusterer has no outlier_scores_)")
else:
    print("Outlier proxy range:", outlier_scores.min(), outlier_scores.max())

Clusters: 7
Noise fraction: 0.3067207938655841
Outlier proxy range: 0.0 0.6070280465052145


In [24]:
#Separate clustered points from noise (noise carries the label -1)
labels = cluster_labels
mask = ~(labels == -1)

#Share of items that were assigned to some cluster
clustered_fraction = np.mean(mask)
#Average membership score inside clusters (a high score means most points are not borderline)
mean_prob_clustered = np.mean(probs[mask])
#Average membership score of the noise points
mean_prob_noise = np.mean(probs[~mask])

print("Clustered fraction:", clustered_fraction)
print("Mean prob (clustered):", mean_prob_clustered)
print("Mean prob (noise):", mean_prob_noise)


Clustered fraction: 0.643064228367529
Mean prob (clustered): 0.8413363619732568
Mean prob (noise): 0.0


In [25]:
#Per-cluster summary: size and mean membership probability of every non-noise cluster
labels = cluster_labels
mask = labels != -1

#np.unique already returns the cluster ids in ascending order
for lab in np.unique(labels[mask]):
    idx = labels == lab
    print(
        f"cluster {lab:>2}: size={idx.sum():>5}, "
        f"mean_prob={probs[idx].mean():.3f}"
    )

print("\ncluster_persistence_ (by label order in HDBSCAN):")
print(clusterer.cluster_persistence_)

#Higher persistence means the cluster exists over a wide range of density levels = very stable, likely a real structure
#Rough guide: >0.30 = very strong, 0.10-0.20 = moderate, 0.05-0.10 = weak to moderate


cluster  0: size=  182, mean_prob=0.918
cluster  1: size= 1134, mean_prob=0.738
cluster  2: size= 1293, mean_prob=0.771
cluster  3: size= 5187, mean_prob=0.953
cluster  4: size=  800, mean_prob=0.798
cluster  5: size= 2000, mean_prob=0.672
cluster  6: size=  938, mean_prob=0.832

cluster_persistence_ (by label order in HDBSCAN):
[0.06273125 0.2803693  0.19060551 0.08719951 0.10287783 0.19182774
 0.09107706]


In [26]:
#Attach a dominant true label to each cluster; re-run each time HDBSCAN is re-run
#Only Galaxy10 rows are summarised, as they are the only ones with true labels
#NOTE(review): no visible cell writes galaxy10_clustered.csv - confirm it is refreshed before this cell runs

df = pd.read_csv("artifacts/results/galaxy10_clustered.csv")
#Assign the filtered frame to a name: a bare df[...] expression is thrown away and filters nothing
df_galaxy10 = df[df["source"] == "galaxy10"]

LABEL_NAMES  = [
    "Disturbed", "Merging", "Round Smooth", "In-between Round Smooth", "Cigar",
    "Barred Spiral", "Tight Spiral", "Loose Spiral", "Edge-on (no bulge)", "Edge-on (with bulge)"
]

#Fixed column order; also lets an empty result (no clusters) still be sorted and saved
CLUSTER_MAP_COLUMNS = [
    "cluster_id", "size", "dominant_label", "dominant_name", "purity", "top3_breakdown"
]

rows = []
for cid in sorted(c for c in df_galaxy10.cluster_id.unique() if c != -1):
    sub = df_galaxy10[df_galaxy10.cluster_id == cid]

    # dominant class + purity (value_counts sorts by frequency, most common first)
    vc = sub.true_label.value_counts(normalize=True)
    dom_label = int(vc.index[0])
    purity = float(vc.iloc[0])

    # top-3 breakdown, reusing the counts computed above
    top3 = vc.head(3)
    top3_str = "; ".join(f"{LABEL_NAMES[int(k)]}:{v:.3f}" for k, v in top3.items())

    rows.append({
        "cluster_id": int(cid),
        "size": int(len(sub)),
        "dominant_label": dom_label,
        "dominant_name": LABEL_NAMES[dom_label],
        "purity": purity,
        "top3_breakdown": top3_str,
    })

cluster_map_df = pd.DataFrame(rows, columns=CLUSTER_MAP_COLUMNS).sort_values("cluster_id")
cluster_map_df.to_csv("artifacts/results/cluster_map.csv", index=False)

print("Saved artifacts/results/cluster_map.csv")
cluster_map_df



Saved artifacts/results/cluster_map.csv


Unnamed: 0,cluster_id,size,dominant_label,dominant_name,purity,top3_breakdown
0,0,171,4,Cigar,0.947368,Cigar:0.947; In-between Round Smooth:0.018; Ba...
1,1,1221,8,Edge-on (no bulge),0.981982,Edge-on (no bulge):0.982; Edge-on (with bulge)...
2,2,1466,9,Edge-on (with bulge),0.989768,Edge-on (with bulge):0.990; Edge-on (no bulge)...
3,3,5685,7,Loose Spiral,0.343887,Loose Spiral:0.344; Barred Spiral:0.296; Tight...
4,4,884,1,Merging,0.970588,Merging:0.971; Loose Spiral:0.017; In-between ...
5,5,1952,2,Round Smooth,0.944672,Round Smooth:0.945; Tight Spiral:0.028; In-bet...
6,6,917,3,In-between Round Smooth,0.974918,In-between Round Smooth:0.975; Disturbed:0.010...


In [31]:
#Assemble one results row per clustered embedding and save the table as a CSV
#Assumes the labelled Galaxy10 rows come first in the clustered data, followed by any new
#unlabelled rows - TODO confirm against CreatingEmbeddings.ipynb
N0 = len(y)                 #Galaxy10 count, taken from the label array rather than hard-coded
N  = len(cluster_labels)
N_new = N - N0
if N_new < 0:
    raise ValueError(
        f"Clustered {N} rows but found {N0} Galaxy10 labels; labels and clustering are out of sync"
    )

y0 = np.asarray(y).astype(np.int64)
#New rows have no true label, so they get the placeholder -1
y_all = np.concatenate([y0, np.full(N_new, -1, dtype=np.int64)])

source = np.array(["galaxy10"] * N0 + ["new"] * N_new, dtype=object)

df = pd.DataFrame({
    "row_id": np.arange(N, dtype=np.int32),
    "source": source,
    "true_label": y_all,
    "cluster_id": np.asarray(cluster_labels, dtype=np.int32),
    "membership_prob": np.asarray(probs, dtype=np.float32),
    #Outlier proxy derived from the membership probability (1 - prob)
    "outlier_score": (1.0 - np.asarray(probs, dtype=np.float32)),
})

#Path mapping for new rows; the metadata file is only read when new rows were actually clustered
new_paths = []
if N_new > 0:
    meta_new = pd.read_csv("artifacts/results/new_meta_embeddings.csv")
    new_paths = meta_new["stored_path"].astype(str).tolist()
    if len(new_paths) != N_new:
        raise ValueError(
            f"new_meta_embeddings.csv has {len(new_paths)} rows but {N_new} new rows were clustered"
        )

paths = np.array(["<Galaxy10_DECals.h5>"] * N0 + new_paths, dtype=object)
df["path"] = paths

os.makedirs("artifacts/results", exist_ok=True)
df.to_csv("artifacts/results/combined_clustered.csv", index=False)
print("Saved artifacts/results/combined_clustered.csv", df.shape)

Saved artifacts/results/combined_clustered.csv (17936, 7)


In [30]:
df = pd.read_csv("artifacts/results/combined_clustered.csv")
#Assign the filtered frame to a name: a bare df[...] expression is thrown away and filters nothing.
#Only Galaxy10 rows are kept, so rows from other sources do not enter the breakdown
df_galaxy10 = df[df["source"] == "galaxy10"]

#For each cluster: what label dominates
for cid in sorted(c for c in df_galaxy10.cluster_id.unique() if c != -1):
    sub = df_galaxy10[df_galaxy10.cluster_id == cid]
    #Three most common true labels in this cluster, as proportions
    top = sub.true_label.value_counts(normalize=True).head(3)
    print(f"\nCluster {cid} (n={len(sub)}):")
    print(top)


Cluster 0 (n=171):
true_label
4    0.947368
3    0.017544
5    0.017544
Name: proportion, dtype: float64

Cluster 1 (n=1221):
true_label
8    0.981982
9    0.013923
7    0.001638
Name: proportion, dtype: float64

Cluster 2 (n=1466):
true_label
9    0.989768
8    0.008186
7    0.002046
Name: proportion, dtype: float64

Cluster 3 (n=5685):
true_label
7    0.343887
5    0.296394
6    0.243272
Name: proportion, dtype: float64

Cluster 4 (n=884):
true_label
1    0.970588
7    0.016968
3    0.004525
Name: proportion, dtype: float64

Cluster 5 (n=1952):
true_label
2    0.944672
6    0.027664
3    0.009734
Name: proportion, dtype: float64

Cluster 6 (n=917):
true_label
3    0.974918
0    0.009815
1    0.005453
Name: proportion, dtype: float64
