In [None]:
import os, math, shutil, uuid
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
import h5py
import matplotlib.pyplot as plt
import joblib
from glob import glob
from tensorflow import keras as K
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.applications.efficientnet import preprocess_input

In [None]:
#Builds a nearest-neighbor index that is used later to transform new embeddings,
#find neighbors for new inputs, and assign a cluster by a vote among those neighbors.
#Use this so we don't need to re-run HDBSCAN for every new input.
#Run once whenever the HDBSCAN clustering is recreated.


#Creates reference points
os.makedirs("artifacts/embeddings", exist_ok=True)
os.makedirs("artifacts/models", exist_ok=True)

# Load the raw embeddings and the clustering results produced for them
emb = np.load("artifacts/embeddings/galaxy10_embeddings.npy").astype(np.float32)
df  = pd.read_csv("artifacts/results/galaxy10_clustered.csv")

# The kNN index is fit on rows of `emb`, and neighbor indices are later used to
# look up df["cluster_id"]; the two must therefore be row-aligned.
if len(emb) != len(df):
    raise ValueError(
        f"Embeddings ({len(emb)} rows) and clustered CSV ({len(df)} rows) are not aligned"
    )

scaler = joblib.load("artifacts/models/scaler.pkl")
pca    = joblib.load("artifacts/models/pca20.pkl")  # adjust if yours is named differently

# Transform to clustering space
Z  = scaler.transform(emb)
Zp = pca.transform(Z).astype(np.float32)

# Save reference arrays
np.save("artifacts/embeddings/Zp_ref.npy", Zp)
np.save("artifacts/embeddings/cluster_id_ref.npy", df["cluster_id"].values.astype(np.int32))

# Fit kNN and save.
# Do NOT call this constant `K`: `K` is the tensorflow.keras alias imported at the
# top of the notebook, and rebinding it to an int breaks the later
# `K.utils.register_keras_serializable()` / `K.layers.Layer` usage.
N_NEIGHBORS = 15
knn = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric="euclidean")
knn.fit(Zp)
joblib.dump(knn, "artifacts/models/knn.pkl")

print("Saved:")
print(" - artifacts/embeddings/Zp_ref.npy", Zp.shape)
print(" - artifacts/embeddings/cluster_id_ref.npy", df['cluster_id'].shape)
print(" - artifacts/models/knn.pkl")



# Reload the reference artifacts from disk so the threshold is derived from
# exactly what was saved.
Zp = np.load("artifacts/embeddings/Zp_ref.npy").astype(np.float32)
cluster_ids = np.load("artifacts/embeddings/cluster_id_ref.npy").astype(np.int32)
knn = joblib.load("artifacts/models/knn.pkl")

# Query the index with its own points; only the distances are needed here.
dists = knn.kneighbors(Zp)[0]

# Per-point mean distance to its neighbors, dropping column 0
# (the self-distance of each reference point).
mean_knn_dist = np.mean(dists[:, 1:], axis=1)

# The threshold is calibrated on clustered (non-noise) points only.
is_clustered = cluster_ids != -1
base = mean_knn_dist[is_clustered]

# 99th percentile of the clustered distances; try 97–99 for stricter/looser.
thr = float(np.percentile(base, 99))
np.save("artifacts/models/knn_dist_threshold.npy", np.array([thr], dtype=np.float32))

print("Saved artifacts/models/knn_dist_threshold.npy =", thr)



In [None]:
#Computes per-cluster centroids and centroid-distance distributions from the
#reference set; these are used later to score new points.
#Run once whenever HDBSCAN is recreated

Zp_ref = np.load("artifacts/embeddings/Zp_ref.npy").astype(np.float32)
cluster_ref = np.load("artifacts/embeddings/cluster_id_ref.npy").astype(np.int32)

# cluster id -> centroid vector, and cluster id -> distances of its members to that centroid
centroids = {}
dist_dists = {}

# np.unique yields the distinct cluster ids in ascending order.
for cid in np.unique(cluster_ref):
    if cid != -1:  # -1 is the noise label and has no centroid
        members = Zp_ref[cluster_ref == cid]
        center = members.mean(axis=0)
        centroids[cid] = center
        dist_dists[cid] = np.linalg.norm(members - center, axis=1)

#Save for later use (dicts are stored as pickled objects)
np.save("artifacts/models/cluster_centroids.npy", centroids, allow_pickle=True)
np.save("artifacts/models/cluster_centroid_dists.npy", dist_dists, allow_pickle=True)

print("Saved centroids + distance distributions.")


In [None]:
#Load the saved kNN artifacts and cluster metadata that assign_cluster_knn reads as globals.
#Run when adding new images

# Fitted nearest-neighbor index over the PCA-space reference points.
knn = joblib.load("artifacts/models/knn.pkl")
# Cluster label of every reference point (-1 = noise); indexed with the neighbor indices returned by knn.
cluster_ref = np.load("artifacts/embeddings/cluster_id_ref.npy").astype(np.int32)
# Mean-kNN-distance cutoff; the file holds a single-element array.
thr = float(np.load("artifacts/models/knn_dist_threshold.npy")[0])
# Both files contain a pickled dict keyed by cluster id, hence allow_pickle=True and .item().
# NOTE(review): joblib.load and allow_pickle=True unpickle arbitrary objects — only load trusted artifacts.
centroids = np.load("artifacts/models/cluster_centroids.npy", allow_pickle=True).item()
dist_dists = np.load("artifacts/models/cluster_centroid_dists.npy", allow_pickle=True).item()



# Cluster meaning map (dominant Galaxy10 class per cluster)
cluster_map = pd.read_csv("artifacts/results/cluster_map.csv").set_index("cluster_id")

def assign_cluster_knn(zp_new, vote_min=0.6):
    """
    Assign one PCA-space point to a reference cluster by majority vote of its
    nearest reference neighbors, and flag it as an anomaly when appropriate.

    Reads the module-level `knn`, `cluster_ref`, `thr`, `centroids`,
    `dist_dists` and `cluster_map`.

    Args:
        zp_new: (d,) point in PCA space; cast to float32 and reshaped to (1, d).
        vote_min: minimum share of the non-noise neighbor votes the winning
            cluster must hold; below this the point is flagged as an anomaly.

    Returns:
        dict that always has the same keys:
            assigned_cluster_id: winning cluster id, or -1 if every neighbor is noise.
            cluster_name: `dominant_name` from `cluster_map`, else "Cluster <id>"
                (or "Noise/Unknown" for -1).
            vote_conf: winning votes / non-noise neighbor count (0.0 for -1).
            mean_knn_dist: mean distance to the returned neighbors.
            dist_to_center: distance to the winning cluster's centroid, NaN if unavailable.
            center_centrality: fraction of that cluster's reference points lying
                farther from the centroid than this point (higher = more central),
                NaN if unavailable.
            is_anomaly: True when mean_knn_dist > thr or vote_conf < vote_min,
                or when every neighbor is noise.
    """
    zp_new = np.asarray(zp_new, dtype=np.float32).reshape(1, -1)
    dists, idxs = knn.kneighbors(zp_new)

    neigh_clusters = cluster_ref[idxs[0]]

    # NOTE(review): the saved threshold was calibrated on dists[:, 1:] (self-distance
    # dropped, i.e. one neighbor fewer), while this averages every returned
    # neighbor — confirm the two are meant to be compared directly.
    mean_dist = float(dists[0].mean())

    # vote ignoring noise neighbors
    valid = neigh_clusters[neigh_clusters != -1]
    if len(valid) == 0:
        # Same keys as the normal return so callers can rely on one schema.
        return {
            "assigned_cluster_id": -1,
            "cluster_name": "Noise/Unknown",
            "vote_conf": 0.0,
            "mean_knn_dist": mean_dist,
            "dist_to_center": float("nan"),
            "center_centrality": float("nan"),
            "is_anomaly": True,
        }

    vals, counts = np.unique(valid, return_counts=True)
    best = int(vals[np.argmax(counts)])
    vote_conf = float(np.max(counts) / len(valid))

    # distance to centroid + how central the point is within the winning cluster
    if best in centroids:
        c = centroids[best]
        dist_to_center = float(np.linalg.norm(zp_new.reshape(-1) - c))

        ref_d = dist_dists[best]
        # fraction of reference points farther than this (higher = more central), 0..1
        centrality = float((ref_d > dist_to_center).mean())
    else:
        dist_to_center = float("nan")
        centrality = float("nan")

    # anomaly decision: far OR weak vote
    is_anom = (mean_dist > thr) or (vote_conf < vote_min)

    # map to human meaning
    if best in cluster_map.index:
        name = str(cluster_map.loc[best, "dominant_name"])
    else:
        name = f"Cluster {best}"

    return {
        "assigned_cluster_id": best,
        "cluster_name": name,
        "vote_conf": vote_conf,
        "mean_knn_dist": mean_dist,
        "dist_to_center": dist_to_center,
        "center_centrality": centrality,  # e.g. 0.82 means "more central than 82% of cluster"
        "is_anomaly": bool(is_anom),
    }


In [None]:
#Run to find anomalies within galaxy10_clustered.csv and save PNG grids labelled with each image id
#galaxy10_clustered.csv only contains the Galaxy10 DECaLS dataset
#As the model improves, fewer anomalies show up

# Inputs: HDF5 file read through its "images" dataset, and the clustering results table.
H5_PATH  = "images/Galaxy10_DECals.h5"
CSV_PATH = "artifacts/results/galaxy10_clustered.csv"

# Folder that receives the anomaly grid PNGs (created if missing).
OUT_DIR = "reports/anomalies_pages/cluster_grids"
os.makedirs(OUT_DIR, exist_ok=True)

# Grid layout: COLS images per row, PER_PAGE images per saved PNG.
COLS = 8
PER_PAGE = 64

# Define what counts as an "HDBSCAN anomaly"
INCLUDE_NOISE = True                 #cluster_id == -1
LOWP_PERCENTILE = 5                  #bottom 5% membership_prob (non-noise)

# Optional absolute membership_prob cutoff; when not None it replaces the percentile rule.
LOWP_CUTOFF = None                   


def load_images_by_indices(idxs):
    """
    Load images from the "images" dataset of H5_PATH, in the order of `idxs`.

    h5py fancy indexing needs increasing coordinates and does not support
    repeated or empty index lists, so only the sorted unique indices are read
    from disk and the result is expanded back to the requested order
    (repeats included) in memory.

    Args:
        idxs: iterable of integer row indices; may be unsorted, contain
            duplicates, or be empty.

    Returns:
        numpy array whose i-th entry is the image at idxs[i]; for empty input,
        an array of shape (0, *image_shape) with the dataset's dtype.
    """
    idxs = np.asarray(idxs, dtype=np.int64).ravel()
    # uniq is sorted ascending; uniq[inverse] reconstructs the original request.
    uniq, inverse = np.unique(idxs, return_inverse=True)
    with h5py.File(H5_PATH, "r") as f:
        X = f["images"]
        if uniq.size == 0:
            return np.empty((0,) + tuple(X.shape[1:]), dtype=X.dtype)
        imgs_unique = X[uniq]
    return imgs_unique[inverse.ravel()]

def save_grid(imgs, rows_df, out_path, title):
    """
    Render `imgs` as a COLS-wide grid, caption each tile from the matching row
    of `rows_df`, and write the figure to `out_path`.

    Args:
        imgs: sequence of images accepted by `imshow`.
        rows_df: DataFrame positionally aligned with `imgs`; must provide the
            columns idx, true_label, cluster_id and membership_prob.
        out_path: destination image path.
        title: figure-level title.
    """
    n = len(imgs)
    n_rows = math.ceil(n / COLS)
    fig = plt.figure(figsize=(COLS*2, n_rows*2))

    for pos in range(n):
        ax = fig.add_subplot(n_rows, COLS, pos + 1)
        ax.imshow(imgs[pos])
        ax.axis("off")

        # Caption fields come from the row at the same position as the image.
        r = rows_df.iloc[pos]
        idx = int(r["idx"])
        y = int(r["true_label"])
        cid = int(r["cluster_id"])
        prob = float(r["membership_prob"])
        ax.set_title(f"id={idx}\ny={y} c={cid}\np={prob:.2f}", fontsize=8)

    fig.suptitle(title, y=1.01, fontsize=14)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200, bbox_inches="tight")
    # Close the figure so repeated page exports do not accumulate open figures.
    plt.close(fig)
    print("Saved", out_path)


#Load clustering results
df = pd.read_csv(CSV_PATH)

#Collect everything HDBSCAN currently treats as anomalous:
#  - noise points (cluster_id == -1), when INCLUDE_NOISE is set
#  - clustered points with low membership probability
noise_mask = df["cluster_id"] == -1
anoms = []

if INCLUDE_NOISE:
    noise = df.loc[noise_mask].copy()
    noise["anom_reason"] = "noise"
    # The +2.0 offset ranks every noise point above any low-probability point.
    noise["anom_score"] = 2.0 + (1.0 - noise["membership_prob"])
    anoms.append(noise)

non_noise = df.loc[~noise_mask].copy()

if LOWP_CUTOFF is None:
    #percentile-based cutoff
    cutoff = float(np.percentile(non_noise["membership_prob"], LOWP_PERCENTILE))
    lowp = non_noise.loc[non_noise["membership_prob"] <= cutoff].copy()
    lowp["anom_reason"] = f"low_prob<=p{LOWP_PERCENTILE}({cutoff:.4f})"
else:
    #fixed absolute cutoff
    lowp = non_noise.loc[non_noise["membership_prob"] < float(LOWP_CUTOFF)].copy()
    lowp["anom_reason"] = f"low_prob<{LOWP_CUTOFF}"

lowp["anom_score"] = 1.0 - lowp["membership_prob"]
anoms.append(lowp)

#Combine and sort anomalies: strongest first
anom_df = (
    pd.concat(anoms, ignore_index=True)
    .sort_values(["anom_score", "membership_prob"], ascending=[False, True])
    .reset_index(drop=True)
)

#Save anomaly list CSV
anom_csv = "artifacts/results/hdbscan_anomalies_all.csv"
os.makedirs("artifacts/results", exist_ok=True)
report_columns = ["idx","true_label","cluster_id","membership_prob","anom_reason","anom_score"]
anom_df[report_columns].to_csv(anom_csv, index=False)

print("\nTotal anomalies saved:", len(anom_df))
print("Saved anomaly table:", anom_csv)
print("Output images folder:", OUT_DIR)

#Save anomaly images, PER_PAGE tiles per PNG
total = len(anom_df)
if total == 0:
    print("No anomalies found with current settings.")
else:
    for page_num, start in enumerate(range(0, total, PER_PAGE), start=1):
        end = min(start + PER_PAGE, total)
        page = anom_df.iloc[start:end].copy().reset_index(drop=True)

        imgs = load_images_by_indices(page["idx"].values)

        out_path = os.path.join(OUT_DIR, f"anomalies_page_{page_num:02d}_{start}_{end-1}.png")
        title = f"HDBSCAN anomalies (page {page_num}) — {start}..{end-1} of {total}"
        save_grid(imgs, page, out_path, title)


In [None]:
#Helper methods for kNN and loading transforms, encoder, cluster map


# Referenced through `tf.keras` rather than the `K` alias: an earlier cell assigns
# `K = 15`, which would make `K.utils` / `K.layers` fail here. This also matches
# the `tf.keras` usage in the model-loading code below.
@tf.keras.utils.register_keras_serializable()
class CastToFloat16(tf.keras.layers.Layer):
    """Layer that casts its input to float16 and then applies EfficientNet `preprocess_input`.

    Registered as Keras-serializable so that a saved model containing this
    layer can be deserialized by `load_model`.
    """

    def call(self, x):
        """Cast `x` to float16 and return `preprocess_input(x)`."""
        x = tf.cast(x, tf.float16)
        return preprocess_input(x)

    def compute_output_shape(self, input_shape):
        """The layer is element-wise, so the output shape equals the input shape."""
        return input_shape




# Destination folders used by infer_and_route: non-anomalous images go to
# INCOMING_DIR, anomalies go to ANOM_DIR. Both are created if missing.
INCOMING_DIR = Path("data/incoming/images")
ANOM_DIR     = Path("data/anomalies/images")
INCOMING_DIR.mkdir(parents=True, exist_ok=True)
ANOM_DIR.mkdir(parents=True, exist_ok=True)

# Append-only CSV log with one row per processed image.
LOG_PATH = Path("artifacts/results/new_inference.csv")
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

#Load transforms
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted artifacts.
scaler = joblib.load("artifacts/models/scaler.pkl")
pca    = joblib.load("artifacts/models/pca20.pkl")

#Load classifier -> encoder
# compile=False: the model is loaded without compiling it.
clf = tf.keras.models.load_model("best_model/galaxy_b3_final_BEST_100TP.keras", compile=False)
# The encoder outputs the activations of the second-to-last layer of the classifier.
encoder = tf.keras.Model(clf.input, clf.layers[-2].output)

#Load cluster meaning map
cluster_map = pd.read_csv("artifacts/results/cluster_map.csv").set_index("cluster_id")



def preprocess_image(path):
    """
    Read an image file and return it as a float32 batch of one.

    The file is decoded to 3 channels (animations are not expanded into
    frames), resized to 256x256 with bilinear interpolation, and given a
    leading batch axis. Pixel values are not rescaled.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
    resized = tf.image.resize(pixels, (256, 256), method="bilinear")
    as_float = tf.cast(resized, tf.float32)  # 0..255
    return tf.expand_dims(as_float, 0)

def infer_and_route(image_path, vote_min=0.6):
    """
    Run inference on one image, assign it to a cluster via kNN, copy it into
    the incoming or anomalies folder, and append a row to LOG_PATH.

    Reads the module-level `encoder`, `scaler`, `pca`, `cluster_map`,
    `INCOMING_DIR`, `ANOM_DIR`, `LOG_PATH`, and calls `preprocess_image` and
    `assign_cluster_knn`.

    Args:
        image_path: path (str or Path) of the image to process.
        vote_min: forwarded to assign_cluster_knn as the minimum vote share.

    Returns:
        dict with the logged fields: timestamp, source_path, stored_path,
        assigned_cluster_id, cluster_name, cluster_purity, vote_conf,
        mean_knn_dist, is_anomaly.

    File routing:
        - No file with that name in the destination: the image is copied as-is.
        - A byte-identical file is already there: nothing is copied.
        - A *different* file with the same name is there: the image is stored
          as "<stem>_<timestamp>_<uid><ext>", so `stored_path` always refers to
          the image that was actually processed.
    """
    import filecmp  # local import: only needed for the name-collision check

    image_path = Path(image_path)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    uid = uuid.uuid4().hex[:10]
    ext = image_path.suffix.lower() if image_path.suffix else ".png"

    #Embed
    x = preprocess_image(str(image_path))
    e = encoder(x, training=False).numpy().astype(np.float32)

    #Project to clustering space (same scaler -> PCA chain as the reference set)
    zp = pca.transform(scaler.transform(e)).astype(np.float32).reshape(-1)

    #Assign cluster + anomaly (uses assign_cluster_knn)
    out = assign_cluster_knn(zp, vote_min=vote_min)

    cid = int(out["assigned_cluster_id"])
    is_anom = bool(out["is_anomaly"])

    #Map to labels
    if cid in cluster_map.index:
        cluster_name = str(cluster_map.loc[cid, "dominant_name"])
        cluster_purity = float(cluster_map.loc[cid, "purity"])
    else:
        cluster_name = "Noise/Unknown" if cid == -1 else f"Cluster {cid}"
        cluster_purity = np.nan

    #Route file
    dest_dir = ANOM_DIR if is_anom else INCOMING_DIR
    dest_path = dest_dir / image_path.name

    if not dest_path.exists():
        shutil.copy2(image_path, dest_path)
    elif not filecmp.cmp(image_path, dest_path, shallow=False):
        # Same file name but different content: keep both files instead of
        # silently logging a path that points at an unrelated image.
        dest_path = dest_dir / f"{image_path.stem}_{ts}_{uid}{ext}"
        shutil.copy2(image_path, dest_path)
    # else: identical file already routed -> nothing to copy

    row = {
        "timestamp": ts,
        "source_path": str(image_path),
        "stored_path": str(dest_path),
        "assigned_cluster_id": cid,
        "cluster_name": cluster_name,
        "cluster_purity": cluster_purity,
        "vote_conf": float(out["vote_conf"]),
        "mean_knn_dist": float(out["mean_knn_dist"]),
        "is_anomaly": is_anom,
    }

    #Append to CSV; the header is written only when the log file is first created
    pd.DataFrame([row]).to_csv(LOG_PATH, mode="a", header=not LOG_PATH.exists(), index=False)

    return row

# result = infer_and_route(test_path)

# print(
#     f"cluster_id={result['assigned_cluster_id']} | "
#     f"name={result['cluster_name']} | "
#     f"anom={result['is_anomaly']} | "
#     f"conf={result['vote_conf']:.2f} | "
#     f"dist={result['mean_knn_dist']:.4f}"
# )
# print("Saved to:", result["stored_path"])

In [None]:
# Quick look at the first five layers of the loaded classifier: position, name, layer class.
for layer_pos, lyr in enumerate(clf.layers[:5]):
    print(layer_pos, lyr.name, type(lyr).__name__)


In [None]:
#Use kNN to run new data instead of recreating HDBscan every time

# Collect every .jpg / .png / .jpeg in the drop folder (each group sorted by name).
paths = sorted(glob("data/new_drop/*.jpg")) + sorted(glob("data/new_drop/*.png")) + sorted(glob("data/new_drop/*.jpeg"))
print("Found images:", len(paths))

results = []
fails = []

for p in paths:
    try:
        row = infer_and_route(p)
        results.append(row)
        # os.path.basename instead of splitting on "/": glob returns OS-native
        # separators, so a manual split prints the whole path on Windows.
        print(f"{os.path.basename(p)} -> {row['cluster_name']} | anom={row['is_anomaly']} | conf={row['vote_conf']:.2f}")
    except Exception as e:
        # Best-effort batch: one unreadable image must not stop the run; the
        # failure is recorded and reported below.
        fails.append({"path": p, "error": str(e)})
        print("FAIL", p, e)

#Save a snapshot of run
pd.DataFrame(results).to_csv("artifacts/results/new_inference_last_run.csv", index=False)
pd.DataFrame(fails).to_csv("artifacts/results/new_inference_failures.csv", index=False)

print("\nDone.")
print("Processed:", len(results))
print("Failed:", len(fails))

if results:
    r = pd.DataFrame(results)
    print("\nCluster counts:")
    print(r["cluster_name"].value_counts())
    print("\nAnomalies:", int(r["is_anomaly"].sum()), "out of", len(r))
