In [39]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint



In [None]:
# 1) LOAD & FILTER gz_decals_volunteers_5.csv (keep only galaxies with ≥5 total votes)
csv_path = "gz_decals_volunteers_5.csv"
df = pd.read_csv(csv_path)

# Vote-count columns are the ones whose name ends in "_total-votes".
count_cols = [name for name in df.columns if name.endswith("_total-votes")]

# Row-wise sum over those columns gives one combined vote count per galaxy.
df["total_votes"] = df[count_cols].sum(axis=1)

# Select rows with at least 5 votes; .copy() so the column added below
# is written to an independent frame rather than a view of df.
has_enough_votes = df["total_votes"] >= 5
df_filtered = df.loc[has_enough_votes].copy()

# Join key used later against the PNG names on disk: iauname + ".png"
df_filtered["filename"] = df_filtered["iauname"].astype(str) + ".png"

print(f"Original rows in volunteers_5.csv: {len(df):,}")
print(f"After ≥5-vote filter: {len(df_filtered):,} galaxies\n")

Original rows in volunteers_5.csv: 253,286
After ≥5-vote filter: 251,824 galaxies



In [None]:
# 2) Walk through gz_decals_dr5_png_part1 to collect (filename → full image_path)
root = "gz_decals_dr5_png_part1"
all_rows = []
# os.listdir returns entries in arbitrary order. Sorting both levels makes the
# row order of imgs_df deterministic, so a downstream seeded split receives the
# same input order on every machine and run.
for subdir in sorted(os.listdir(root)):
    folder = os.path.join(root, subdir)
    if not os.path.isdir(folder):
        continue  # only descend into sub-folders; plain files under root are ignored
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".png"):
            continue  # case-insensitive extension check; everything else is skipped
        fullpath = os.path.join(folder, fname)
        all_rows.append({
            "filename": fname,
            "image_path": fullpath
        })

# Explicit columns: the frame keeps "filename"/"image_path" even when no PNG
# was found, so a later merge on "filename" does not fail with a KeyError.
imgs_df = pd.DataFrame(all_rows, columns=["filename", "image_path"])
print(f"Total PNGs found on disk: {len(imgs_df):,}\n")


Total PNGs found on disk: 91,481



In [None]:
# 3) Left-join the vote table onto the on-disk image list by "filename".
#    Every image keeps its row; images with no match in df_filtered get NaN vote columns.
merged = pd.merge(imgs_df, df_filtered, how="left", on="filename")

# A non-null iauname marks a row that found a partner in df_filtered.
has_votes = merged["iauname"].notna()
matched = has_votes.sum()
print(f"After merge: {matched:,} / {len(imgs_df):,} images have ≥5 votes in volunteers_5.csv\n")


After merge: 71,503 / 91,481 images have ≥5 votes in volunteers_5.csv



In [None]:
# 4) ASSIGN “label” (elliptical/spiral/irregular) WITH THRESHOLD = 0.5
#    Any row that is NaN for iauname becomes “unlabeled” and will be dropped.
def decide_label(row, threshold=0.5):
    """Map one merged image/vote row to a morphology label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. the Series handed over by
        ``DataFrame.apply(..., axis=1)``). Must provide ``"iauname"``; when
        that is not NaN it must also provide
        ``"smooth-or-featured_smooth_fraction"`` and
        ``"has-spiral-arms_yes_fraction"``.
    threshold : float, default 0.5
        Minimum vote fraction (inclusive) required to assign "elliptical"
        or "spiral". The default reproduces the original fixed cut-off.

    Returns
    -------
    str
        ``"unlabeled"``  if ``iauname`` is NaN (no volunteer data for the row),
        ``"elliptical"`` if the smooth fraction is >= ``threshold``,
        ``"spiral"``     else if the spiral-arms fraction is >= ``threshold``,
        ``"irregular"``  otherwise.

    Notes
    -----
    The checks are ordered: the smooth fraction wins over the spiral fraction
    when both pass. A NaN fraction compares as False against the threshold,
    so a row whose fractions are missing falls through to ``"irregular"``.
    """
    # No volunteer data at all -> caller is expected to drop these rows.
    if pd.isna(row["iauname"]):
        return "unlabeled"
    # 1) Smooth fraction at or above the threshold -> elliptical
    if row["smooth-or-featured_smooth_fraction"] >= threshold:
        return "elliptical"
    # 2) Otherwise spiral-arms fraction at or above the threshold -> spiral
    if row["has-spiral-arms_yes_fraction"] >= threshold:
        return "spiral"
    # 3) Catch-all for everything else (including NaN fractions)
    return "irregular"

# Label every row, then show how many rows landed in each class (NaN would be listed too).
merged["label"] = merged.apply(decide_label, axis=1)
label_counts = merged["label"].value_counts(dropna=False)
print("Label distribution (including ‘unlabeled’):\n", label_counts, "\n")

# Keep only rows that received a real class; "unlabeled" marks images without volunteer data.
is_labeled = merged["label"].ne("unlabeled")
merged = merged.loc[is_labeled].reset_index(drop=True)
print(f"After dropping unlabeled, we have {len(merged):,} images with a final label.\n")


Label distribution (including ‘unlabeled’):
 label
elliptical    44447
unlabeled     19978
spiral        16102
irregular     10954
Name: count, dtype: int64 

After dropping unlabeled, we have 71,503 images with a final label.



In [None]:
# 6) Import validation set from validation_galaxies.csv and remove that from "merged"

val_csv_path = "validation_galaxies.csv"
df_val_in = pd.read_csv(val_csv_path)

# Build the "<iauname>.png" key so it can be compared with merged["filename"].
df_val_in["filename"] = df_val_in["iauname"].astype(str) + ".png"

# Select validation rows with a membership mask instead of an inner merge:
# a merge would emit one output row per matching CSV row, so a repeated
# iauname in validation_galaxies.csv would duplicate images in df_val.
# With unique keys both approaches give the same rows in the same order.
in_validation = merged["filename"].isin(df_val_in["filename"])
df_val = merged[in_validation].reset_index(drop=True)

# Everything that is not a validation image forms the pool for train+test.
df_train_test = merged[~in_validation].reset_index(drop=True)
print(f"After removing validation, df_train_test has {len(df_train_test):,} rows.")



After removing validation, df_train_test has 67,745 rows.


In [None]:
# 7) SPLIT df_train_test INTO train and test (e.g. 80% train / 20% test), STRATIFIED
#
# df_train_test (the labelled images left after removing the validation galaxies)
# and df_val are built in step 6. They are reused here rather than re-reading
# validation_galaxies.csv and rebuilding both frames a second time.
print(f"After removing validation, df_train_test has {len(df_train_test):,} rows.")

# Stratify on the label so both parts keep the pool's class proportions;
# the fixed random_state makes the split repeatable for a given row order.
df_train, df_test = train_test_split(
    df_train_test,
    test_size=0.20,
    stratify=df_train_test["label"],
    random_state=42
)

print("Sizes after splitting train/test (on the remaining pool):")
print(f"  Train: {len(df_train):,}")
print(f"  Test:  {len(df_test):,}\n")

# Normalised counts: the two distributions should be nearly identical.
print("Train label distribution:\n", df_train["label"].value_counts(normalize=True))
print(" Test label distribution:\n", df_test["label"].value_counts(normalize=True), "\n")


After removing validation, df_train_test has 67,745 rows.
Sizes after splitting train/test (on the remaining pool):
  Train: 54,196
  Test:  13,549

Train label distribution:
 label
elliptical    0.622721
spiral        0.222323
irregular     0.154956
Name: proportion, dtype: float64
 Test label distribution:
 label
elliptical    0.622703
spiral        0.222304
irregular     0.154993
Name: proportion, dtype: float64 



In [None]:
# 8) SET UP KERAS ImageDataGenerators for TRAIN / VAL / TEST
# Training pipeline: scale pixel values by 1/255 and apply random
# rotation / shift / zoom / flip augmentation.
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=360,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True
)
# Validation and test pipeline: same scaling, no augmentation.
val_datagen = ImageDataGenerator(rescale=1.0/255)

# Pin one class list for all three generators. Without `classes`, every
# generator infers its own label → index mapping from the labels present in
# its dataframe, so a split that lacks a class would be encoded differently
# from the training data. A sorted list reproduces the default ordering.
class_names = sorted(df_train["label"].unique())

train_gen = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    x_col="image_path",
    y_col="label",
    classes=class_names,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical"
)
val_gen = val_datagen.flow_from_dataframe(
    dataframe=df_val,
    x_col="image_path",
    y_col="label",
    classes=class_names,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical"
)
# shuffle=False keeps test batches in dataframe order, so predictions made
# from this generator line up with test_gen.classes.
test_gen = val_datagen.flow_from_dataframe(
    dataframe=df_test,
    x_col="image_path",
    y_col="label",
    classes=class_names,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    shuffle=False
)

Found 54196 validated image filenames belonging to 3 classes.
Found 3758 validated image filenames belonging to 3 classes.
Found 13549 validated image filenames belonging to 3 classes.


In [None]:
# 9) BUILD A SIMPLE CNN 
def build_simple_cnn(input_shape=(224, 224, 3), num_classes=3):
    """Build an (uncompiled) small convolutional classifier.

    Architecture, in order:
      * three blocks of Conv2D(3x3, ReLU) -> BatchNormalization ->
        MaxPooling2D(2x2) -> Dropout(0.25), with 32, 64 and 128 filters;
      * Flatten -> Dense(256, ReLU) -> BatchNormalization -> Dropout(0.5);
      * Dense(num_classes, softmax) output.

    Parameters
    ----------
    input_shape : tuple, default (224, 224, 3)
        Shape of a single input image, passed to the model's Input layer.
    num_classes : int, default 3
        Number of units in the softmax output layer.

    Returns
    -------
    A ``models.Sequential`` instance; the caller compiles and trains it.
    """
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` to the first Conv2D, which recent Keras versions warn about.
    stack = [layers.Input(shape=input_shape)]

    # Three identical feature-extraction blocks that differ only in filter count.
    for n_filters in (32, 64, 128):
        stack += [
            layers.Conv2D(n_filters, (3, 3), activation="relu"),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ]

    # Classifier head.
    stack += [
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
    return models.Sequential(stack)

# Instantiate the network for 224x224 three-channel inputs and three output classes,
# then print its layer-by-layer summary.
model = build_simple_cnn(num_classes=3, input_shape=(224, 224, 3))
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# 10) Compile and train the model
adam = optimizers.Adam(learning_rate=1e-4)
model.compile(
    loss="categorical_crossentropy",
    optimizer=adam,
    metrics=["accuracy"]
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights seen.
early_stop = EarlyStopping(restore_best_weights=True, patience=5, monitor="val_loss")
# Halve the learning rate after 3 epochs without val_loss improvement, never below 1e-7.
lr_reduce = ReduceLROnPlateau(min_lr=1e-7, patience=3, factor=0.5, monitor="val_loss")
# Write the model to disk only when val_accuracy improves.
checkpoint = ModelCheckpoint("best_galaxy_cnn_v2.keras", save_best_only=True, monitor="val_accuracy")

training_callbacks = [early_stop, lr_reduce, checkpoint]
history = model.fit(
    train_gen,
    validation_data=val_gen,
    callbacks=training_callbacks,
    epochs=10
)


  self._warn_if_super_not_called()


Epoch 1/10
[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1379s[0m 813ms/step - accuracy: 0.5573 - loss: 1.1446 - val_accuracy: 0.6724 - val_loss: 1.4810 - learning_rate: 1.0000e-04
Epoch 2/10
[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1333s[0m 787ms/step - accuracy: 0.6678 - loss: 0.8506 - val_accuracy: 0.6583 - val_loss: 1.3148 - learning_rate: 1.0000e-04
Epoch 3/10
[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1354s[0m 799ms/step - accuracy: 0.6870 - loss: 0.7910 - val_accuracy: 0.7057 - val_loss: 1.2113 - learning_rate: 1.0000e-04
Epoch 4/10
[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1393s[0m 822ms/step - accuracy: 0.6983 - loss: 0.7611 - val_accuracy: 0.7315 - val_loss: 0.8921 - learning_rate: 1.0000e-04
Epoch 5/10
[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1378s[0m 813ms/step - accuracy: 0.7097 - loss: 0.7370 - val_accuracy: 0.7217 - val_loss: 0.8806 - learning_rate: 1.0000e-04
Epoch 6/10
[1m

In [None]:
# 11) evaluate on test set by finding Micro and Macro F1 values
# Load the weights stored by the ModelCheckpoint callback before scoring.
model.load_weights("best_galaxy_cnn_v2.keras")
test_loss, test_acc = model.evaluate(test_gen, verbose=1)
print(f"\nTest Accuracy: {test_acc:.4f}   |   Test Loss: {test_loss:.4f}\n")

# Get raw predictions on test set; argmax turns class probabilities into class indices.
pred_probs = model.predict(test_gen, verbose=1)
y_pred = np.argmax(pred_probs, axis=1)
y_true = test_gen.classes

# y_pred follows the encoding the model was trained with (train_gen) while
# y_true follows test_gen's encoding. The metrics below are only meaningful
# if both generators use the same label → index mapping, so fail loudly if not.
if test_gen.class_indices != train_gen.class_indices:
    raise ValueError(
        "train_gen and test_gen disagree on the label → index mapping: "
        f"{train_gen.class_indices} vs {test_gen.class_indices}"
    )

# Reconstruct the class‐index -> label mapping
class_indices = train_gen.class_indices       # e.g. {'elliptical': 0, 'irregular': 1, 'spiral': 2}
inv_class_indices = {v: k for k, v in class_indices.items()}
label_ids = sorted(inv_class_indices)
target_names = [inv_class_indices[i] for i in label_ids]

# Print detailed classification report.
# Passing `labels` explicitly keeps one row per known class, aligned with
# target_names, even if a class never occurs in y_true / y_pred.
print("Classification Report:\n")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=target_names))

# Compute Micro and Macro F1
micro_f1 = f1_score(y_true, y_pred, average="micro")
macro_f1 = f1_score(y_true, y_pred, average="macro")
print(f"Micro-F1: {micro_f1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}\n")

# Show the confusion matrix (rows = true class, columns = predicted class, in label_ids order)
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
print("Confusion Matrix:\n", cm)


[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 281ms/step - accuracy: 0.7189 - loss: 0.8242

Test Accuracy: 0.7152   |   Test Loss: 0.8261

[1m424/424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 213ms/step
Classification Report:

              precision    recall  f1-score   support

  elliptical       0.77      0.89      0.82      8437
   irregular       0.57      0.05      0.09      2100
      spiral       0.58      0.70      0.63      3012

    accuracy                           0.72     13549
   macro avg       0.64      0.54      0.52     13549
weighted avg       0.69      0.72      0.67     13549

Micro-F1: 0.7152
Macro-F1: 0.5156

Confusion Matrix:
 [[7489   37  911]
 [1400  102  598]
 [ 872   41 2099]]
