# 02 - Preprocessing
Prepare resized/subsampled versions via `Model/training/preprocess.py`. Goal: speed up training and test different `IMAGE_SIZE` values.

## Explicit configuration
Pin the project root and the interpreter (venv) explicitly so the notebook behaves the same wherever it is moved. Adjust `IMAGE_SIZE` or `LIMIT_PER_CLASS` before running; `CLASSES` is not passed to the preprocessing command and is only used by the count check at the end.

In [1]:
# --- Standard imports ---
from pathlib import Path
import subprocess
import sys
# --- Tunable parameters ---
# IMAGE_SIZE is forwarded to the script as --size; LIMIT_PER_CLASS is only
# forwarded (as --limit-per-class) when it is not None.
IMAGE_SIZE = 224
LIMIT_PER_CLASS = None
CLASSES = ['Chao','Ervas','Milho','Milho_ervas']

# --- Project root (hard-coded absolute path) ---
ROOT = Path(r"C:\Users\lmanuelli\Projet\tp_bihar")

# --- Interpreter of the project's virtual environment ---
PYTHON = ROOT.joinpath(".venv", "Scripts", "python.exe")

# --- Absolute project paths derived from ROOT ---
SCRIPT = ROOT.joinpath("Model", "training", "preprocess.py")
_DATA_DIR = ROOT / "Data"
DATA_RAW = _DATA_DIR / "raw"
DATA_PROCESSED = _DATA_DIR / "processed"

# --- Make sure the output directory exists (no error if already present) ---
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

## Run the preprocessing script
Build the CLI command, print it, and surface stdout/stderr for traceability with clear error handling.

In [4]:
# --- Build command ---
# Run the preprocessing script with the configured interpreter. Arguments are
# passed as a list (no shell), so each path stays a single argument.
cmd = [
    str(PYTHON),
    str(SCRIPT),
    "--input-dir", str(DATA_RAW),
    "--output-dir", str(DATA_PROCESSED),
    "--size", str(IMAGE_SIZE),
]

# Only this optional flag depends on LIMIT_PER_CLASS. Everything below must
# run whether or not a limit is set (it used to be nested under this `if`,
# so the default LIMIT_PER_CLASS = None skipped the whole run).
if LIMIT_PER_CLASS is not None:
    cmd += ["--limit-per-class", str(LIMIT_PER_CLASS)]

# --- Readable debug ---
# For display only: arguments are joined with spaces, without quoting.
print("Command:")
print(" ".join(cmd))

# --- Execute ---
# Capture stdout/stderr as text so they can be reported below.
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
)

# --- Explicit error handling ---
# A non-zero exit code aborts the cell after showing the script's stderr.
if result.returncode != 0:
    print("[ERROR] Preprocessing failed")
    print("STDERR:", result.stderr)
    raise RuntimeError("Preprocess failed")

# --- Success path ---
# Previously placed after the `raise` (unreachable) and mislabelled [ERROR].
print("[OK] Preprocessing completed")
print("STDOUT:", result.stdout)

In [5]:
# Sanity check: report how many entries each class folder of the processed
# train split contains.
if 'CLASSES' not in globals():
    # Fallback when CLASSES has not been defined in this session.
    CLASSES = ['Chao','Ervas','Milho','Milho_ervas']

PROC_TRAIN = DATA_PROCESSED / f'train_{IMAGE_SIZE}'

if PROC_TRAIN.exists():
    # Count every entry matched by '*' directly inside each class folder.
    counts = {
        label: sum(1 for _ in PROC_TRAIN.joinpath(label).glob('*'))
        for label in CLASSES
    }
    print('processed train counts', counts)
else:
    print('processed train folder not found:', PROC_TRAIN)

processed train counts {'Chao': 6134, 'Ervas': 6015, 'Milho': 6255, 'Milho_ervas': 6040}
