In [1]:
# 02_Data_Preprocessing.py
# Run in notebooks/02_Data_Preprocessing.ipynb (as cells) or as a script

import os
import json
import pathlib
import tensorflow as tf

# CONFIG
# All paths are relative to the current working directory: ".." only points at
# the project root when this is run from a directory one level below it (for
# example the notebooks/ folder mentioned in the header of this file).
PROJECT_ROOT = pathlib.Path("..")
DATA_DIR = PROJECT_ROOT / "data" / "raw" / "PlantVillage"
MODELS_DIR = PROJECT_ROOT / "models"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

BATCH_SIZE = 32        # passed as batch_size when the datasets are built
IMG_SIZE = (224, 224)  # passed as image_size when the datasets are built
SEED = 42              # shared seed for the train/validation split

# Output folders are created up front so later writes cannot fail on a
# missing parent directory.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("Data dir:", DATA_DIR)
# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, which would let a missing dataset slip through to a much less
# readable failure further down.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data dir {DATA_DIR} not found")

# Build the training and validation tf.data pipelines from the image folder.
# Per the Keras documentation, image_dataset_from_directory yields batches of
# float32 images whose pixel values are still in the 0-255 range (they are
# resized to IMG_SIZE but not normalized).
# Both subsets are requested with the same validation_split and seed so that
# they come from one consistent 80/20 split of the files.
_split_kwargs = dict(
    validation_split=0.2,
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    str(DATA_DIR), subset="training", **_split_kwargs
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    str(DATA_DIR), subset="validation", **_split_kwargs
)

# Persist the label-index -> class-name list next to the models so inference
# code can translate predicted indices back into names.
# NOTE(review): class_names is read from the dataset object returned by
# image_dataset_from_directory; presumably it is no longer available once the
# dataset is wrapped by further tf.data transformations, so keep this step
# before any such wrapping.
class_names = train_ds.class_names
class_names_path = MODELS_DIR / "class_names.json"
class_names_path.write_text(json.dumps(class_names, indent=2), encoding="utf-8")

num_classes = len(class_names)
print("Saved class_names.json with", num_classes, "classes.")

# Optional: save the processed datasets with the stable tf.data save API.
# image_dataset_from_directory has already resized and batched the images, so
# the pipelines are written out as they are.
#
# The snapshot is taken BEFORE cache/shuffle/prefetch are attached. Those three
# steps only tune in-process iteration; saving after them would force the save
# pass to fill the in-memory cache with the entire training set and would
# write the batches in one arbitrary shuffled order.
train_ds.save(str(PROCESSED_DIR / "train_ds"))
val_ds.save(str(PROCESSED_DIR / "val_ds"))
print("Saved processed datasets to", PROCESSED_DIR)

# Cache & prefetch for performance when the datasets are iterated later in
# this session. Only the training data is shuffled (buffer of 1000 batches);
# validation order is left untouched.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)


Data dir: ..\data\raw\PlantVillage
Found 20638 files belonging to 15 classes.
Using 16511 files for training.
Found 20638 files belonging to 15 classes.
Using 4127 files for validation.
Saved class_names.json with 15 classes.
Saved processed datasets to ..\data\processed
