In [61]:
import os
import pandas as pd
from pathlib import Path

def create_subset(input_csv, output_csv, image_dir, n=64):
    """Write a small random sample of a catalogue CSV, keeping only rows with an image.

    Rows missing ``iauname``, ``label`` or ``petroRad_r_psf`` are dropped. Each
    remaining row is matched to a ``*.png`` found anywhere under ``image_dir``
    by comparing file basenames; rows without a match are dropped too.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to write.
        image_dir: Directory searched recursively for ``*.png`` files.
        n: Maximum number of rows to sample (capped at the rows available).

    Returns:
        None. The sample is written to ``output_csv`` with the columns
        ``filepath``, ``label`` and ``petroRad_r_psf``, and a summary line
        is printed.
    """
    required = ["iauname", "label", "petroRad_r_psf"]
    table = pd.read_csv(input_csv).dropna(subset=required)

    # The basename of `iauname` is the key used to look the image up on disk.
    table["filename"] = table["iauname"].map(os.path.basename)

    # Index every PNG below image_dir by basename; if two files share a
    # basename, the one visited last by rglob wins.
    path_by_name = {}
    for png in Path(image_dir).rglob("*.png"):
        path_by_name[png.name] = str(png)

    # Rows whose image was not found get NaN here and are filtered out.
    table["filepath"] = table["filename"].map(path_by_name)
    has_image = table["filepath"].notna()
    table = table.loc[has_image].reset_index(drop=True)

    # Sanity check: every path that survived must exist on disk.
    assert table["filepath"].apply(os.path.exists).all(), \
        "One or more filepaths do not actually exist!"

    # Fixed seed so repeated runs pick the same rows.
    sample_size = min(n, len(table))
    subset = table.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Persist only the columns written by this cell (filepath included).
    wanted_columns = ["filepath", "label", "petroRad_r_psf"]
    subset.loc[:, wanted_columns].to_csv(output_csv, index=False)
    print(f"Subset saved to: {output_csv} with {len(subset)} rows")

# Usage: write 64-row train/validation subsets into the current working directory.
BASE_DIR = os.getcwd()
IMAGE_DIR = "/Users/apple/Desktop/Galaxy_research/image/"
TRAIN_CSV = "csv_files/df_train_plus_petroRad_r.csv"
VALID_CSV = "csv_files/df_valid_plus_petroRad_r.csv"

SMALL_TRAIN_CSV = os.path.join(BASE_DIR, "df_train_small.csv")
SMALL_VALID_CSV = os.path.join(BASE_DIR, "df_valid_small.csv")

# Same call for both splits: train first, then validation.
for source_csv, target_csv in (
    (TRAIN_CSV, SMALL_TRAIN_CSV),
    (VALID_CSV, SMALL_VALID_CSV),
):
    create_subset(source_csv, target_csv, IMAGE_DIR, 64)

Subset saved to: /Users/apple/Desktop/Galaxy_research/code/df_train_small.csv with 64 rows
Subset saved to: /Users/apple/Desktop/Galaxy_research/code/df_valid_small.csv with 64 rows


In [63]:
import pandas as pd

# Reload the saved training subset and preview the stored image paths.
df = pd.read_csv("df_train_small.csv")
n_samples = len(df)
print("Using", n_samples, "samples – first few filepaths:")
first_filepaths = df["filepath"].head()
print(first_filepaths)

Using 64 samples – first few filepaths:
0    /Users/apple/Desktop/Galaxy_research/image/J13...
1    /Users/apple/Desktop/Galaxy_research/image/J21...
2    /Users/apple/Desktop/Galaxy_research/image/J21...
3    /Users/apple/Desktop/Galaxy_research/image/J12...
4    /Users/apple/Desktop/Galaxy_research/image/J08...
Name: filepath, dtype: object


In [71]:
import os
import pandas as pd
from pathlib import Path

def create_subset(input_csv, output_csv, image_dir, n=64):
    """Write a small random sample of a catalogue CSV, keeping only rows with an image.

    Rows missing ``iauname``, ``label`` or ``petroRad_r_psf`` are dropped. Each
    remaining row is matched, by file basename, to a ``*.png`` found anywhere
    under ``image_dir``; rows without a match are dropped too.

    The saved ``iauname`` is the image path *relative to* ``image_dir``, so
    that ``os.path.join(image_dir, iauname)`` resolves to the file that was
    actually found. For images sitting directly in ``image_dir`` this is just
    the PNG basename; for images in a subdirectory it keeps the subdirectory
    prefix (a bare basename would not resolve for those).

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to write.
        image_dir: Directory searched recursively for ``*.png`` files.
        n: Maximum number of rows to sample (capped at the rows available).

    Returns:
        None. The sample is written to ``output_csv`` with the columns
        ``iauname``, ``label`` and ``petroRad_r_psf``, and a summary line
        is printed.

    Raises:
        AssertionError: If a matched image cannot be found again under
            ``image_dir`` (sanity check).
    """
    image_root = Path(image_dir)

    # 1) Read & filter
    df = pd.read_csv(input_csv)
    df = df.dropna(subset=["iauname", "label", "petroRad_r_psf"])

    # 2) Extract just the filename; it is the lookup key for the image on disk
    df["filename"] = df["iauname"].apply(lambda x: os.path.basename(x))

    # 3) Build mapping filename → path relative to image_dir.
    #    rglob is recursive, so the relative path may contain subdirectories.
    #    If two files share a basename, the one visited last wins.
    filename_to_relpath = {
        p.name: str(p.relative_to(image_root))
        for p in image_root.rglob("*.png")
    }

    # 4) Attach relative paths and drop any rows where the image is missing
    df["relpath"] = df["filename"].map(filename_to_relpath)
    df = df[df["relpath"].notnull()].reset_index(drop=True)

    # 5) Sanity check, using the same join the consumer of the CSV performs
    assert df["relpath"].apply(
        lambda rel: os.path.exists(os.path.join(image_dir, rel))
    ).all(), "One or more filepaths do not actually exist!"

    # 6) Sample (fixed seed so repeated runs pick the same rows)
    subset = df.sample(n=min(n, len(df)), random_state=42).reset_index(drop=True)

    # 7) Store the relative path as `iauname` so that
    #    os.path.join(image_dir, iauname) points at the real file
    subset["iauname"] = subset["relpath"]

    # 8) Save only the columns the training script needs:
    #    - iauname (image path relative to image_dir)
    #    - label
    #    - petroRad_r_psf
    cols_to_keep = ["iauname", "label", "petroRad_r_psf"]
    subset.to_csv(output_csv, index=False, columns=cols_to_keep)
    print(f"Subset saved to: {output_csv} with {len(subset)} rows")


# Usage: write 64-row train/validation subsets into the current working directory.
BASE_DIR = os.getcwd()
IMAGE_DIR = "/Users/apple/Desktop/Galaxy_research/image/"
TRAIN_CSV = "csv_files/df_train_plus_petroRad_r.csv"
VALID_CSV = "csv_files/df_valid_plus_petroRad_r.csv"

SMALL_TRAIN_CSV = os.path.join(BASE_DIR, "df_train_small.csv")
SMALL_VALID_CSV = os.path.join(BASE_DIR, "df_valid_small.csv")

# Same call for both splits: train first, then validation.
for source_csv, target_csv in (
    (TRAIN_CSV, SMALL_TRAIN_CSV),
    (VALID_CSV, SMALL_VALID_CSV),
):
    create_subset(source_csv, target_csv, IMAGE_DIR, 64)

Subset saved to: /Users/apple/Desktop/Galaxy_research/code/df_train_small.csv with 64 rows
Subset saved to: /Users/apple/Desktop/Galaxy_research/code/df_valid_small.csv with 64 rows
