In [2]:
import os
import time
import h5py
import numpy as np
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.applications.efficientnet import preprocess_input

In [3]:
# Paths used by the cells below.
H5_PATH    = "images/Galaxy10_DECals.h5"          # HDF5 file; its "images" and "ans" datasets are read later
MODEL_PATH = "best_model/galaxy_b3_final_BEST_100TP.keras" #Classifier Model
OUT_DIR    = "artifacts/embeddings"               # directory that receives the embedding artifacts
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUT_DIR, exist_ok=True)

In [4]:
@K.utils.register_keras_serializable()
class CastToFloat16(K.layers.Layer):
    """Keras layer that casts its input to float16, then applies `preprocess_input`.

    The class is registered as a Keras serializable so that it can be
    resolved by name when a saved model containing it is loaded.
    """

    def call(self, x):
        """Return `preprocess_input` applied to `x` cast to `tf.float16`."""
        return preprocess_input(tf.cast(x, tf.float16))

    def compute_output_shape(self, input_shape):
        """Report the output shape as identical to the input shape."""
        return input_shape


# Custom layers must be recreated when loading the model: CastToFloat16 has to be
# defined (and registered) in this kernel before the call below.
# compile=False: the model is only used for forward passes in this notebook.
clf = tf.keras.models.load_model(MODEL_PATH, compile=False)

I0000 00:00:1768959654.409544    4193 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9513 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9
2026-01-20 17:40:54.695023: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


In [5]:
# Build the encoder: it shares the classifier's input but stops at the
# second-to-last layer, whose output is used as the embedding vector.
penultimate_layer = clf.layers[-2]  # usually the layer right before the final Dense/Softmax
embed_output = penultimate_layer.output
encoder = tf.keras.Model(inputs=clf.input, outputs=embed_output)

In [6]:
TARGET_SIZE = (256, 256)  # size handed to tf.image.resize in preprocess_batch
BATCH = 48                # number of images sent through the encoder per step

def preprocess_batch(x_uint8):
    """Cast a batch of images to float16 and bilinearly resize it to TARGET_SIZE.

    Args:
        x_uint8: batch of images (named for uint8 input; pixel values are not rescaled).

    Returns:
        The result of `tf.image.resize` on the float16-cast batch.
    """
    return tf.image.resize(
        tf.cast(x_uint8, tf.float16),
        TARGET_SIZE,
        method="bilinear",
    )

In [7]:
# Stream the Galaxy10 images through the encoder batch by batch. Embeddings go
# into a disk-backed memmap; labels are collected in an in-memory array.
with h5py.File(H5_PATH, "r") as f:
    X, y = f["images"], f["ans"]
    N = X.shape[0]

    # A single-image forward pass reveals the embedding width.
    probe = encoder(preprocess_batch(X[0:1]), training=False)
    d = int(probe.shape[-1])

    emb_path = os.path.join(OUT_DIR, "galaxy10_embeddings.dat")
    embeddings = np.memmap(emb_path, dtype="float32", mode="w+", shape=(N, d))
    labels = np.zeros((N,), dtype=np.int64)

    for start in range(0, N, BATCH):
        window = slice(start, start + BATCH)  # the last window may be shorter than BATCH

        xb = preprocess_batch(X[window])
        yb = y[window]
        eb = encoder(xb, training=False).numpy().astype("float32")

        # Write by the actual batch length so a short final batch fits exactly.
        embeddings[start:start + len(eb)] = eb
        labels[start:start + len(yb)] = yb

    # Push the memmap contents to disk before the HDF5 file is closed.
    embeddings.flush()

In [8]:
# Re-open the raw memmap read-only and view it as rows of d-dimensional embeddings.
emb_np = np.memmap(emb_path, dtype="float32", mode="r").reshape(-1, d)

# Persist an in-memory copy of the embeddings, plus the labels, as .npy files.
npy_targets = {
    "galaxy10_embeddings.npy": np.array(emb_np),
    "galaxy10_labels.npy": labels,
}
for file_name, array in npy_targets.items():
    np.save(os.path.join(OUT_DIR, file_name), array)

print("Saved:")
print(" - artifacts/embeddings/galaxy10_embeddings.npy", emb_np.shape)
print(" - artifacts/embeddings/galaxy10_labels.npy", labels.shape)

Saved:
 - artifacts/embeddings/galaxy10_embeddings.npy (17736, 48)
 - artifacts/embeddings/galaxy10_labels.npy (17736,)


In [None]:
import numpy as np
# Quick health check of the saved Galaxy10 embeddings.
emb = np.load("artifacts/embeddings/galaxy10_embeddings.npy")

has_nan = np.isnan(emb).any()
emb_mean, emb_std = emb.mean(), emb.std()

print("shape:", emb.shape, "dtype:", emb.dtype)
print("any NaNs:", has_nan)
print("mean/std:", emb_mean, emb_std)

# How to read the output (rules of thumb):
# - mean: average over every embedding dimension of every sample
# - std:  global spread (scale) of all embedding values
#     * below 0.1        -> collapsed / low-information embeddings
#     * above 5 or 10    -> unstable scale, distances explode
#     * roughly 0.3-1.5  -> usually healthy
# - shape (17736, 48): 17736 is the total number of images and 48 is the
#   embedding size, i.e. 48 numbers represent one galaxy
# - clustering compares these vectors with one another to decide where each one belongs

In [9]:
#RUN THIS CELL TO CREATE EMBEDDINGS FOR NEW INPUTS ONLY
#Re-run the cells above first if the model has been changed

import os
import numpy as np
import pandas as pd
import tensorflow as tf


# --- Configuration for embedding newly downloaded inputs ---
MANIFEST_CSV  = "data/new_drop/manifest.csv"                 # downloaded cutouts ledger
INFERENCE_CSV = "artifacts/results/new_inference.csv"        # already processed by your pipeline

MODEL_PATH = "best_model/galaxy_b3_final_BEST_100TP.keras"
OUT_DIR = "artifacts/embeddings"
META_OUT = "artifacts/results/new_meta_embeddings.csv"

# Images per encoder forward pass. The batching loop further down in this cell
# reads BATCH, so it is defined here (like the other settings above) instead of
# relying on the Galaxy10 embedding cell having been run in this kernel.
BATCH = 48

os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(os.path.dirname(META_OUT), exist_ok=True)

#Load model + encoder
# NOTE: the custom CastToFloat16 layer must already be defined in this kernel,
# otherwise the saved model cannot be deserialized.
clf = tf.keras.models.load_model(MODEL_PATH, compile=False)
# The encoder outputs the second-to-last layer of the classifier (the embedding).
encoder = tf.keras.Model(clf.input, clf.layers[-2].output)

#Infer target size from model input
inp = encoder.input_shape  # indices 1 and 2 are used as the spatial size
TARGET_SIZE = (int(inp[1]), int(inp[2]))
print("Model TARGET_SIZE:", TARGET_SIZE)

# A fixed input_signature makes tf.function trace this function once. Without it,
# every distinct Python string passed as `path` would trigger a fresh trace.
@tf.function(input_signature=[tf.TensorSpec(shape=(), dtype=tf.string)])
def preprocess_path(path):
    """Read one JPEG file and return it as a float32 image resized to TARGET_SIZE.

    Args:
        path: scalar string (Python str or tf.string tensor) naming the image file.

    Returns:
        float32 tensor with TARGET_SIZE spatial dimensions and 3 channels;
        pixel values are not rescaled (kept in 0..255).
    """
    img_bytes = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img_bytes, channels=3)
    img = tf.image.resize(img, TARGET_SIZE, method="bilinear")
    img = tf.cast(img, tf.float32)  # keep 0..255
    return img


#Load manifest paths (downloaded)
m = pd.read_csv(MANIFEST_CSV)
if "path" not in m.columns:
    raise ValueError(f"{MANIFEST_CSV} must contain a 'path' column. Found: {list(m.columns)}")

#Keep only downloads that were successful
if "status" in m.columns:
    ok_mask = m["status"].astype(str).isin(["ok", "exists"])
    m = m[ok_mask].copy()

# Drop missing paths BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", which a later dropna() can no longer detect.
m = m.dropna(subset=["path"]).copy()
m["path"] = m["path"].astype(str)

#Grab only unique paths
manifest_paths = m["path"].unique().tolist()
#Make sure paths exist
manifest_paths = [p for p in manifest_paths if os.path.exists(p)]

print("Manifest ok/exists files found:", len(manifest_paths))


#Load already processed paths
# Stays empty when the inference CSV is missing or zero bytes long.
processed_paths = set()
if os.path.exists(INFERENCE_CSV) and os.path.getsize(INFERENCE_CSV) > 0:
    inf = pd.read_csv(INFERENCE_CSV)
    if "stored_path" not in inf.columns:
        raise ValueError(f"{INFERENCE_CSV} must contain 'stored_path'. Found: {list(inf.columns)}")
    # dropna() must run before astype(str); otherwise missing values become the
    # string "nan" and end up in the set (and in the count printed below).
    processed_paths = set(inf["stored_path"].dropna().astype(str))

print("Already processed paths in new_inference:", len(processed_paths))


#Compute: manifest - inference
# Files are matched by basename, so the directory part of a path is ignored.
processed_ids = set(map(os.path.basename, processed_paths))

to_embed = []
for candidate in manifest_paths:
    if os.path.basename(candidate) not in processed_ids:
        to_embed.append(candidate)

print("To embed (manifest - inference):", len(to_embed))

# Stop the cell here when there is nothing left to do.
if not to_embed:
    print("Nothing new to embed. Exiting.")
    raise SystemExit

M = len(to_embed)

#infer embedding dim
# One forward pass on an all-zero image is enough to read the output width.
dummy_batch = tf.zeros((1, *TARGET_SIZE, 3), dtype=tf.float32)
d = int(encoder(dummy_batch, training=False).shape[-1])
print("Embedding dim:", d)


#Embed in batches
emb = np.zeros((M, d), dtype=np.float32)

for batch_no, start in enumerate(range(0, M, BATCH)):
    batch_paths = to_embed[start:start + BATCH]

    # Decode every file of the batch, then stack into a single tensor.
    images = [preprocess_path(pth) for pth in batch_paths]
    xb = tf.stack(images, axis=0)

    eb = encoder(xb, training=False).numpy().astype("float32")
    emb[start:start + len(eb)] = eb

    # Report progress on every 10th batch.
    if batch_no % 10 == 0:
        print(f"embedded {min(start + BATCH, M)}/{M}")

# New inputs have no ground-truth class; -1 is stored as their label.
labels = np.full((M,), -1, dtype=np.int64)


#Save outputs
new_emb_file = os.path.join(OUT_DIR, "new_embeddings.npy")
new_lab_file = os.path.join(OUT_DIR, "new_labels.npy")
np.save(new_emb_file, emb)
np.save(new_lab_file, labels)

# One row per embedded file, in the same order as the rows of `emb`.
meta = pd.DataFrame({"stored_path": to_embed})
meta.to_csv(META_OUT, index=False)

print("Saved:")
print(" - artifacts/embeddings/new_embeddings.npy", emb.shape)
print(" - artifacts/embeddings/new_labels.npy", labels.shape)
print(" -", META_OUT)


Model TARGET_SIZE: (256, 256)
Manifest ok/exists files found: 200
Already processed paths in new_inference: 400
To embed (manifest - inference): 200


2026-01-20 17:48:31.971814: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002


Embedding dim: 48
embedded 48/200
Saved:
 - artifacts/embeddings/new_embeddings.npy (200, 48)
 - artifacts/embeddings/new_labels.npy (200,)
 - artifacts/results/new_meta_embeddings.csv


In [10]:
import os
import numpy as np
import pandas as pd

OUT_DIR = "artifacts/embeddings"
os.makedirs(OUT_DIR, exist_ok=True)

# Combined artifacts: Galaxy10 base rows followed by every appended "new" row.
ALL_EMB, ALL_LAB = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("embeddings_all.npy", "labels_all.npy")
)
ALL_META = "artifacts/results/embeddings_all_meta.csv"

#Load base galaxy10
emb_g10 = np.load("artifacts/embeddings/galaxy10_embeddings.npy").astype(np.float32)
y_g10 = np.load("artifacts/embeddings/galaxy10_labels.npy").astype(np.int64)
N0 = len(emb_g10)

#Load this run new
emb_new = np.load("artifacts/embeddings/new_embeddings.npy").astype(np.float32)
meta_new = pd.read_csv("artifacts/results/new_meta_embeddings.csv")
# Row i of the meta CSV describes row i of the embedding matrix.
assert emb_new.shape[0] == len(meta_new), "new_meta_embeddings.csv must match new_embeddings rows"

#Use basename as identity
new_ids = [os.path.basename(p) for p in meta_new["stored_path"].astype(str)]

# If all files exist, load them; otherwise initialize from g10
if os.path.exists(ALL_EMB) and os.path.exists(ALL_LAB) and os.path.exists(ALL_META):
    emb_all = np.load(ALL_EMB).astype(np.float32)
    y_all   = np.load(ALL_LAB).astype(np.int64)
    meta_all = pd.read_csv(ALL_META)

    # The three artifacts are written one after another when saving, so an
    # interrupted save can leave them with different row counts. Appending to
    # such a state would silently misalign embeddings, labels and meta rows.
    if not (len(emb_all) == len(y_all) == len(meta_all)):
        raise ValueError(
            "Combined artifacts are out of sync: "
            f"{ALL_EMB} has {len(emb_all)} rows, {ALL_LAB} has {len(y_all)}, "
            f"{ALL_META} has {len(meta_all)}. Delete them to rebuild from galaxy10."
        )

    #builds set of IDs that are already in embeddings_all_meta
    existing_new_ids = set(
        meta_all.loc[meta_all["source"] == "new", "path"].astype(str).map(os.path.basename)
    )
else:
    emb_all = emb_g10
    y_all   = y_g10
    meta_all = pd.DataFrame({
        "row_id": np.arange(N0),
        "source": ["galaxy10"] * N0,
        "source_idx": list(range(N0)),
        "path": ["<Galaxy10_DECals.h5>"] * N0,
        "true_label": y_g10,
    })
    existing_new_ids = set()

# Fail early with a clear message when this run's embeddings do not have the
# same width as the stored ones; they could not be concatenated later anyway.
if emb_all.shape[1:] != emb_new.shape[1:]:
    raise ValueError(
        f"Embedding shape mismatch: existing rows are {emb_all.shape[1:]}, "
        f"new rows are {emb_new.shape[1:]}."
    )


#Keeps rows we have not processed before
# keep_mask[i] is True when the basename of new row i is not yet in the combined meta.
keep_mask = np.fromiter(
    (bid not in existing_new_ids for bid in new_ids),
    dtype=bool,
    count=len(new_ids),
)
k = int(np.count_nonzero(keep_mask))
print(f"New embeddings this run: {len(new_ids)}")
print(f"Unseen new embeddings to append: {k}")

if k > 0:
    emb_to_add = emb_new[keep_mask]
    y_to_add = np.full((k,), -1, dtype=np.int64)  # appended rows get label -1
    all_new_paths = meta_new["stored_path"].astype(str).tolist()
    paths_to_add = [all_new_paths[idx] for idx in np.flatnonzero(keep_mask)]

    #Append arrays
    emb_all = np.concatenate([emb_all, emb_to_add], axis=0)
    y_all = np.concatenate([y_all, y_to_add], axis=0)

    #Append meta rows
    # row_id continues from the current number of meta rows.
    start = len(meta_all)
    new_meta_rows = pd.DataFrame({
        "row_id": np.arange(start, start + k),
        "source": ["new"] * k,
        "source_idx": list(range(k)),  # index within this appended chunk (fine)
        "path": paths_to_add,
        "true_label": y_to_add,
    })
    meta_all = pd.concat([meta_all, new_meta_rows], ignore_index=True)

    #Save
    np.save(ALL_EMB, emb_all)
    np.save(ALL_LAB, y_all)
    meta_all.to_csv(ALL_META, index=False)

    print("Saved:")
    print(" -", ALL_EMB, emb_all.shape)
    print(" -", ALL_LAB, y_all.shape)
    print(" -", ALL_META, meta_all.shape)
else:
    print("Nothing new to append (all already present in embeddings_all).")


New embeddings this run: 200
Unseen new embeddings to append: 200
Saved:
 - artifacts/embeddings/embeddings_all.npy (18136, 48)
 - artifacts/embeddings/labels_all.npy (18136,)
 - artifacts/results/embeddings_all_meta.csv (18136, 5)
