In [1]:
import tensorflow as tf
import pandas as pd
from pathlib import Path

In [2]:
def make_dataset(split_dir, img_size=(1400, 1920), batch_size=32, shuffle=True, seed=None):
    """Build a batched ``tf.data.Dataset`` of (image, label-vector) pairs for one split.

    The split directory must contain a ``_classes.csv`` file with a
    ``filename`` column; every other column is treated as one label and must
    be convertible to float32. File names are resolved relative to
    ``split_dir``.

    Args:
        split_dir: Directory holding the images and ``_classes.csv``.
        img_size: ``(height, width)`` every image is resized to.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples (over the whole split).
        seed: Optional random seed forwarded to ``Dataset.shuffle``.

    Returns:
        A ``(dataset, num_classes)`` tuple. ``dataset`` yields batches of
        ``(images, labels)`` where images are float32 scaled to [0, 1] with
        shape ``(batch, height, width, 3)`` and labels are float32 with shape
        ``(batch, num_classes)``. ``num_classes`` is the number of label
        columns in the CSV.
    """
    split_dir = Path(split_dir)
    csv_path = split_dir / "_classes.csv"

    # 1) Read CSV: filename column + one column per label
    df = pd.read_csv(csv_path)

    filepaths = [str(split_dir / fname) for fname in df["filename"]]
    labels = df.drop(columns=["filename"]).values.astype("float32")  # shape: [N, num_labels]

    num_classes = labels.shape[1]

    # 2) Build base tf.data.Dataset from (path, label-vector)
    ds = tf.data.Dataset.from_tensor_slices((filepaths, labels))

    # 3) Shuffle BEFORE decoding. The buffer then holds only path strings and
    #    small label vectors. Shuffling after the map would keep up to
    #    len(filepaths) decoded float32 images in memory (about 32 MB each at
    #    the default 1400x1920x3 size). Shuffle needs a positive buffer size,
    #    so it is skipped when the CSV has no rows.
    if shuffle and len(filepaths) > 0:
        ds = ds.shuffle(buffer_size=len(filepaths), seed=seed)

    # 4) Map paths -> decoded & resized images
    def load_image(path, label):
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, img_size)
        img = tf.cast(img, tf.float32) / 255.0   # normalize to [0,1]
        return img, label

    ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)

    # 5) Batch / prefetch
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return ds, num_classes


In [3]:
# One batch size shared by every split, so it only has to be changed here.
BATCH_SIZE = 32

# Only the training split is shuffled; validation and test keep CSV order.
train_ds, num_classes = make_dataset("dataset/train", batch_size=BATCH_SIZE, shuffle=True)
valid_ds, valid_num_classes = make_dataset("dataset/valid", batch_size=BATCH_SIZE, shuffle=False)
test_ds, test_num_classes = make_dataset("dataset/test", batch_size=BATCH_SIZE, shuffle=False)

# Fail early if the splits disagree on the number of label columns; otherwise
# the mismatch would only surface later as a shape error during fit/evaluate.
assert valid_num_classes == num_classes and test_num_classes == num_classes, (
    f"label column count differs across splits: train={num_classes}, "
    f"valid={valid_num_classes}, test={test_num_classes}"
)