In [None]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# 1)

In [None]:
# Load the volunteer-vote catalogue (one row per galaxy, keyed by "iauname").
csv_path = "gz_decals_volunteers_5.csv"
df = pd.read_csv(csv_path)

# Create a new column, "filename", that matches the .png file name on disk.
df["filename"] = df["iauname"].astype(str) + ".png"

# Walk through the image folders (one level of sub-directories below `root`)
# and collect the base name + full path of every PNG.
root = "gz_decals_dr5_png_part1"

all_rows = []   # one record per PNG; converted to a dataframe once, below
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting both listings fixes the row order of `imgs_df` / `merged`, which the
# seeded train_test_split further down needs in order to be reproducible.
for subdir in sorted(os.listdir(root)):
    folder = os.path.join(root, subdir)
    if not os.path.isdir(folder):
        continue   # ignore stray files sitting directly under `root`
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".png"):
            continue
        fullpath = os.path.join(folder, fname)
        all_rows.append({
            "filename": fname,
            "image_path": fullpath
        })

# Explicit columns keep the frame well-formed (and the merge below valid)
# even when no PNG was found.
imgs_df = pd.DataFrame(all_rows, columns=["filename", "image_path"])

# Left-merge on "filename": every PNG is kept; PNGs without a catalogue row
# get NaN in all catalogue columns.
merged = imgs_df.merge(df, on="filename", how="left")

# Check how many matched (a non-null "iauname" means the catalogue had a row):
print(f"Total PNGs found: {len(imgs_df)}")
print(f"Total matched rows after merging with CSV: {merged['iauname'].notna().sum()}")

# Inspect rows:
print(merged[["image_path", "iauname",
              "smooth-or-featured_smooth_fraction",
              "has-spiral-arms_yes_fraction"]].head())


Total PNGs found: 91481
Total matched rows after merging with CSV: 71883
                                          image_path              iauname  \
0  gz_decals_dr5_png_part1/J090/J090754.26+282647...  J090754.26+282647.0   
1  gz_decals_dr5_png_part1/J090/J090135.99-004032...  J090135.99-004032.7   
2  gz_decals_dr5_png_part1/J090/J090007.90+165526...  J090007.90+165526.4   
3  gz_decals_dr5_png_part1/J090/J090545.36+020501...  J090545.36+020501.2   
4  gz_decals_dr5_png_part1/J090/J090157.31+123804...  J090157.31+123804.1   

   smooth-or-featured_smooth_fraction  has-spiral-arms_yes_fraction  
0                            0.800000                           NaN  
1                            1.000000                           NaN  
2                            0.000000                          1.00  
3                            0.736842                          0.25  
4                            0.600000                          0.50  


In [None]:
def decide_label(row, threshold=0.7):
    """Turn one galaxy's volunteer vote fractions into a coarse class label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide "smooth-or-featured_smooth_fraction" and
        "has-spiral-arms_yes_fraction".
    threshold : float, default 0.7
        Minimum vote fraction (inclusive) needed to assign "elliptical" or
        "spiral".

    Returns
    -------
    str
        "unlabeled"  - both fractions are missing (NaN);
        "elliptical" - smooth fraction >= threshold;
        "spiral"     - otherwise, spiral-arms fraction >= threshold;
        "irregular"  - everything else (catch-all).
    """
    smooth_frac = row["smooth-or-featured_smooth_fraction"]
    spiral_frac = row["has-spiral-arms_yes_fraction"]

    # No volunteer info at all -> nothing to base a label on.
    if pd.isna(smooth_frac) and pd.isna(spiral_frac):
        return "unlabeled"

    # 1) "smooth" vote >= threshold -> elliptical.
    #    A single NaN fraction compares False here and simply falls through.
    if smooth_frac >= threshold:
        return "elliptical"
    # 2) Else if "spiral" vote >= threshold -> spiral.
    if spiral_frac >= threshold:
        return "spiral"
    # 3) Otherwise -> irregular.
    return "irregular"

# Classify every galaxy; axis=1 hands decide_label one row at a time.
row_labels = merged.apply(decide_label, axis=1)
merged["label"] = row_labels

# Report how many galaxies ended up in each category
# (dropna=False would also list missing labels, if there were any).
label_counts = merged["label"].value_counts(dropna=False)
print(label_counts)


label
elliptical    29209
irregular     28094
unlabeled     19598
spiral        14580
Name: count, dtype: int64


In [None]:
# Keep only galaxies that received a real class: rows tagged "unlabeled"
# are dropped and the index is renumbered from 0.
has_label = merged["label"] != "unlabeled"
merged = merged[has_label].reset_index(drop=True)

In [None]:
# Two-stage stratified split with a fixed seed.
# Stage 1: hold out 15% of all labelled galaxies as the test set.
df_temp, df_test = train_test_split(
    merged, test_size=0.15, stratify=merged["label"], random_state=42
)

# Stage 2: carve the validation set out of the remaining 85%.
# 0.1765 of the remainder is about 15% of the full data (0.15 / 0.85 ~= 0.1765).
df_train, df_val = train_test_split(
    df_temp, test_size=0.1765, stratify=df_temp["label"], random_state=42
)

print("Train size:", len(df_train), "Val size:", len(df_val), "Test size:", len(df_test))

# Stratification should leave the class proportions near-identical per split.
for header, split_df in (
    ("Label distribution in train:\n", df_train),
    ("Label distribution in val:\n", df_val),
    ("Label distribution in test:\n", df_test),
):
    print(header, split_df["label"].value_counts(normalize=True))


Train size: 50315 Val size: 10785 Test size: 10783
Label distribution in train:
 label
elliptical    0.406340
irregular     0.390838
spiral        0.202822
Name: proportion, dtype: float64
Label distribution in val:
 label
elliptical    0.406305
irregular     0.390821
spiral        0.202874
Name: proportion, dtype: float64
Label distribution in test:
 label
elliptical    0.406380
irregular     0.390800
spiral        0.202819
Name: proportion, dtype: float64


In [None]:
# Training images are rescaled by 1/255 and randomly augmented: rotation
# (range 360), width/height shifts and zoom of 0.1, and both flips.
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=360,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
)

# Validation / test images are only rescaled, never augmented.
val_datagen = ImageDataGenerator(rescale=1.0/255)

# Settings shared by all three generators.
flow_kwargs = dict(
    x_col="image_path",
    y_col="label",
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",   # one-hot targets: "elliptical", "spiral", "irregular"
)

train_gen = train_datagen.flow_from_dataframe(dataframe=df_train, **flow_kwargs)
val_gen = val_datagen.flow_from_dataframe(dataframe=df_val, **flow_kwargs)

# shuffle=False keeps test batches in dataframe order, so predictions can be
# matched back to filenames.
test_gen = val_datagen.flow_from_dataframe(
    dataframe=df_test, shuffle=False, **flow_kwargs
)


Found 50315 validated image filenames belonging to 3 classes.
Found 10785 validated image filenames belonging to 3 classes.
Found 10783 validated image filenames belonging to 3 classes.


In [None]:
# 1) Define a model 
def build_simple_cnn(input_shape=(224,224,3), num_classes=3):
    """Build an (uncompiled) small convolutional image classifier.

    Architecture: three stages of Conv2D(3x3, relu) -> BatchNormalization ->
    MaxPooling2D(2x2) -> Dropout(0.25) with 32, 64 and 128 filters, followed by
    Flatten -> Dense(256, relu) -> BatchNormalization -> Dropout(0.5) ->
    Dense(num_classes, softmax).

    Parameters
    ----------
    input_shape : tuple, default (224, 224, 3)
        Shape of one input image, passed to the model's Input layer.
    num_classes : int, default 3
        Number of units in the softmax output layer.

    Returns
    -------
    A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    m = models.Sequential()
    # Declare the input with an explicit Input layer instead of passing
    # input_shape= to the first Conv2D, which Keras discourages for
    # Sequential models (it emits a UserWarning).
    m.add(layers.Input(shape=input_shape))

    # Three identical convolutional stages that differ only in filter count.
    for n_filters in (32, 64, 128):
        m.add(layers.Conv2D(n_filters, (3,3), activation="relu"))
        m.add(layers.BatchNormalization())
        m.add(layers.MaxPooling2D((2,2)))
        m.add(layers.Dropout(0.25))

    # Classifier head.
    m.add(layers.Flatten())
    m.add(layers.Dense(256, activation="relu"))
    m.add(layers.BatchNormalization())
    m.add(layers.Dropout(0.5))
    m.add(layers.Dense(num_classes, activation="softmax"))
    return m

# Instantiate the 3-class network and print its layer table.
model = build_simple_cnn(input_shape=(224,224,3), num_classes=3)
model.summary()

# 2) Compile: Adam at 1e-4, categorical cross-entropy for one-hot targets.
adam = optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=adam, loss="categorical_crossentropy", metrics=["accuracy"])

# 3) Callbacks
# File written by the checkpoint callback and read back before evaluation.
best_weights_path = "best_galaxy_cnn.h5"

# Early stopping and LR reduction both watch val_loss; the checkpoint keeps
# the epoch with the best val_accuracy instead.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
lr_reduce = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-7)
checkpoint = ModelCheckpoint(best_weights_path, monitor="val_accuracy", save_best_only=True)

# 4) Train for at most 30 epochs (early stopping may end the run sooner).
history = model.fit(
    train_gen,
    epochs=30,
    validation_data=val_gen,
    callbacks=[early_stop, lr_reduce, checkpoint],
)

# 5) Evaluate on the test set using the checkpointed (best val_accuracy) weights.
model.load_weights(best_weights_path)
test_loss, test_acc = model.evaluate(test_gen)
print(f"Test accuracy: {test_acc:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  self._warn_if_super_not_called()


Epoch 1/30
[1m 651/1573[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m13:30[0m 879ms/step - accuracy: 0.4407 - loss: 1.4410

KeyboardInterrupt: 