
# ITFFC — Preprocessing Notebook (Images → PNG + Cleaned Folders)

This notebook performs **image preprocessing** on your dataset located at:

```
C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset
```

### What it does
- Scans all subfolders under `Dataset` (e.g., `logo`, `medical`, ...), and then each **class** subfolder (e.g., `Normal`, `Lung_Opacity`, `Viral Pneumonia`, `NonDemented`, `MildDemented`, etc.).  
- For **each class folder**, creates a sibling folder named `<class>_processed` (e.g., `Normal_processed`).  
- Loads images, applies minimal, safe preprocessing, and saves **PNG** versions with compression to reduce space.
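- Writes a **manifest CSV** (`preprocess_manifest.csv`, saved in the dataset root) with one row per image: original path, processed path, width, height, domain, and status (`ok` or `skipped_exists`).  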
- Skips files that are not images and handles errors gracefully (logged to a CSV).

> You can re-run cells safely; the code is **idempotent** (it won't reprocess already-existing PNGs unless `force_rewrite=True`).

---

### Default preprocessing
- Convert every image to 3-channel RGB (grayscale and other non-RGB inputs are converted).
- Optional downscale (aspect ratio preserved) so the **longest side is at most 512 px** (tweakable via `long_side`); images that are already smaller are not enlarged.
- Optional **CLAHE** (contrast-limited adaptive histogram equalization) on luminance — helpful for X-ray-like medical images; can be toggled per "domain". Requires OpenCV (`cv2`); if it is not installed, this step is skipped silently.
- Save to PNG with compression.


In [None]:

# =========================
# Configuration
# =========================
# Module-level settings read by the utility and processing cells below.

# IMPORTANT: Use a raw string for Windows paths (prefix r)
# so that backslashes are not interpreted as escape sequences.
DATASET_ROOT = r"C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset"

# Preprocessing parameters
# Images whose longer side already fits within `long_side` keep their size.
long_side = 512           # Max size (px) for the longer side (set None to disable resizing)
apply_clahe_medical = True  # Apply CLAHE to images whose domain resolves to "medical"
apply_clahe_logo = False    # Apply CLAHE to images whose domain resolves to "logo" (usually not needed)

# File handling
# Compared against the lower-cased file suffix, so entries must be lowercase
# and include the leading dot.
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
force_rewrite = False        # If True, overwrite even if target exists
png_compress_level = 6       # 0 (none) .. 9 (max)

# Parallelism
# NOTE(review): none of the cells visible here read `num_workers`; processing
# runs in a single process regardless of this value — confirm before relying on it.
num_workers = 0              # 0 or 1 = no multiprocessing; set to >1 for speed


In [None]:

# =========================
# Imports
# =========================
import os
import sys
import csv
import math
import traceback
from pathlib import Path
from functools import partial

from PIL import Image, ImageOps
import numpy as np

try:
    import cv2  # For CLAHE
    _HAS_CV2 = True
except Exception:
    _HAS_CV2 = False

from tqdm import tqdm

# Report the interpreter and imaging-library versions, and whether the
# optional OpenCV dependency was importable.
for _label, _value in (
    ("Python:", sys.version),
    ("PIL version:", Image.__version__),
    ("cv2 available:", _HAS_CV2),
):
    print(_label, _value)


In [None]:

# =========================
# Utilities
# =========================

def is_image_file(path: Path):
    """Tell whether *path* carries one of the configured image extensions.

    The decision is purely name-based: the suffix is lower-cased and looked
    up in the module-level ``valid_exts`` set; the file is never opened.
    """
    extension = path.suffix.lower()
    return extension in valid_exts

def ensure_rgb(img: Image.Image) -> Image.Image:
    """Return a 3-channel RGB version of *img*, flattening transparency onto white.

    A plain ``convert("RGB")`` drops the alpha channel and exposes whatever
    colour values sit underneath transparent pixels (frequently black), which
    ruins images with transparent backgrounds such as logos. Images carrying
    transparency are therefore composited over a white canvas instead.

    Args:
        img: Source image in any Pillow mode.

    Returns:
        A new ``RGB`` image; the input image is not modified.
    """
    has_alpha = img.mode in ("RGBA", "LA", "PA") or (
        img.mode == "P" and "transparency" in img.info
    )
    if not has_alpha:
        return img.convert("RGB")

    rgba = img.convert("RGBA")
    canvas = Image.new("RGB", rgba.size, (255, 255, 255))
    # The alpha band acts as the paste mask, so partially transparent pixels
    # are blended with the white background rather than copied verbatim.
    canvas.paste(rgba, mask=rgba.split()[-1])
    return canvas

def resize_keep_aspect(img: Image.Image, max_side: int) -> Image.Image:
    """Shrink *img* so its longer side equals *max_side*, preserving aspect ratio.

    The input is returned untouched when resizing is disabled (*max_side* is
    ``None`` or not positive) or when the image already fits. Each output
    dimension is rounded and never drops below one pixel. Resampling is
    bicubic.
    """
    resizing_enabled = max_side is not None and max_side > 0
    if not resizing_enabled:
        return img

    width, height = img.size
    longest = max(width, height)
    if longest <= max_side:
        return img

    ratio = max_side / float(longest)
    target_size = (
        max(1, int(round(width * ratio))),
        max(1, int(round(height * ratio))),
    )
    return img.resize(target_size, Image.BICUBIC)

def apply_clahe_rgb(img: Image.Image) -> Image.Image:
    """Equalize local contrast with CLAHE on the lightness channel only.

    The image is converted RGB -> LAB, CLAHE (clip limit 2.0, 8x8 tiles) is
    applied to the L channel, and the result is converted back to RGB.
    When OpenCV could not be imported the input is returned unchanged.
    """
    if not _HAS_CV2:
        return img

    rgb_pixels = np.array(img.convert("RGB"))
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))

    return Image.fromarray(cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2RGB))

def save_png(img: Image.Image, out_path: Path, compress_level: int = 6):
    """Write *img* to *out_path* as PNG at the requested zlib compression level.

    Missing parent directories are created first.

    Args:
        img: Image to save.
        out_path: Destination file path.
        compress_level: zlib level, 0 (no compression) to 9 (smallest file).
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Do not pass optimize=True: Pillow's PNG writer then ignores
    # compress_level and always uses level 9, so the caller's setting
    # would silently have no effect.
    img.save(out_path, format="PNG", compress_level=int(compress_level))

def decide_domain(path_in_dataset: Path) -> str:
    """Classify a dataset path as 'medical', 'logo' or 'generic'.

    Matching is a case-insensitive substring test on each path component.
    The 'med' check runs first over all components (so misspellings such as
    'medcial' still match), then 'logo'; if neither is found the result is
    'generic'.
    """
    lowered_parts = tuple(part.lower() for part in path_in_dataset.parts)
    for needle, label in (("med", "medical"), ("logo", "logo")):
        for part in lowered_parts:
            if needle in part:
                return label
    return "generic"

def processed_name(class_dir: Path) -> Path:
    """Build the output directory path: same parent, name + '_processed'."""
    sibling_name = class_dir.name + "_processed"
    return class_dir.with_name(sibling_name)

def gather_class_dirs(dataset_root: Path):
    """Collect the leaf 'class' directories under *dataset_root*.

    A directory qualifies when it directly contains at least one image file
    and has no subdirectories, e.g. ``.../Dataset/medical/covid19/Normal``.

    Directories whose name ends with ``_processed`` are skipped. Those are the
    output folders this notebook creates next to each class folder; without
    the filter a second run would treat them as new classes and produce
    ``<class>_processed_processed`` folders. As a consequence, a genuine
    input class folder must not use that suffix.

    Args:
        dataset_root: Root directory to search recursively.

    Returns:
        Sorted list of qualifying directory paths.
    """
    output_suffix = "_processed"
    class_dirs = []
    for candidate in dataset_root.rglob("*"):
        if not candidate.is_dir() or candidate.name.endswith(output_suffix):
            continue
        entries = list(candidate.iterdir())
        has_images = any(x.is_file() and is_image_file(x) for x in entries)
        has_subdirs = any(x.is_dir() for x in entries)
        if has_images and not has_subdirs:
            class_dirs.append(candidate)
    return sorted(class_dirs)

def rel_to_root(path: Path, root: Path) -> Path:
    """Express *path* relative to *root*; fall back to *path* itself on failure."""
    try:
        relative = path.relative_to(root)
    except Exception:
        # Typically: path does not live under root.
        relative = path
    return relative


In [None]:

# =========================
# Main processing
# =========================

root = Path(DATASET_ROOT)
# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, which would let a bad path slip through silently.
if not root.exists():
    raise FileNotFoundError(f"DATASET_ROOT not found: {root}")

class_dirs = gather_class_dirs(root)
print(f"Found {len(class_dirs)} class folders (leaf dirs with images):")
for d in class_dirs:
    print(" -", rel_to_root(d, root))

# Row accumulators, filled by the processing loop and written to CSV at the end.
manifest_rows = []  # [src, dst, width, height, domain, status]
error_rows = []     # [src, dst, error]

def process_one(src_path: Path, dst_path: Path, domain: str):
    """Preprocess a single image and write the result to *dst_path* as PNG.

    Steps: load and convert to RGB, optionally downscale to ``long_side``,
    optionally apply CLAHE depending on *domain* and the matching
    ``apply_clahe_*`` switch, then save with ``png_compress_level``.

    Args:
        src_path: Image file to read.
        dst_path: PNG file to write.
        domain: 'medical', 'logo' or anything else (no CLAHE).

    Returns:
        ``(width, height)`` of the saved image.
    """
    # Image.open is lazy and keeps the file handle open; the context manager
    # guarantees it is released even when decoding fails or the file has
    # several frames. The RGB conversion reads the pixel data into a new
    # image, so `img` remains usable after the handle is closed.
    with Image.open(src_path) as opened:
        img = ensure_rgb(opened)

    if long_side and long_side > 0:
        img = resize_keep_aspect(img, long_side)

    # Domain-specific contrast enhancement.
    wants_clahe = (domain == "medical" and apply_clahe_medical) or (
        domain == "logo" and apply_clahe_logo
    )
    if wants_clahe:
        img = apply_clahe_rgb(img)

    save_png(img, dst_path, png_compress_level)
    return img.size

# Process every class folder into its '<class>_processed' sibling, recording
# one manifest row per image and one error row per failure.
for class_dir in class_dirs:
    out_dir = processed_name(class_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    domain = decide_domain(rel_to_root(class_dir, root))
    files = [p for p in class_dir.iterdir() if p.is_file() and is_image_file(p)]
    # "\n" (not "\\n") so a blank line separates folders instead of printing
    # a literal backslash-n.
    print(f"\nProcessing {rel_to_root(class_dir, root)}  ->  {rel_to_root(out_dir, root)}  ({len(files)} files)")

    for src in tqdm(files):
        # NOTE: the target name is derived from the stem only, so two sources
        # that differ just by extension (a.jpg / a.png) map to the same PNG.
        dst = out_dir / (src.stem + ".png")
        if dst.exists() and not force_rewrite:
            # Already processed: read the size from the existing PNG header
            # only (best effort; -1 marks an unreadable file).
            try:
                with Image.open(dst) as _im:
                    w, h = _im.size
            except Exception:
                w = h = -1
            manifest_rows.append([str(src), str(dst), w, h, domain, "skipped_exists"])
            continue

        try:
            w, h = process_one(src, dst, domain)
            manifest_rows.append([str(src), str(dst), w, h, domain, "ok"])
        except Exception as e:
            # Keep going: one bad file must not abort the whole run.
            error_rows.append([str(src), str(dst), repr(e)])
            print("Error on:", src, "->", e)
            traceback.print_exc()

# Write manifests: both CSVs are created in the dataset root and overwritten
# on every run; the error file contains only its header when nothing failed.
manif_path = root / "preprocess_manifest.csv"
errs_path = root / "preprocess_errors.csv"
with open(manif_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["src", "dst", "width", "height", "domain", "status"])
    writer.writerows(manifest_rows)

with open(errs_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["src", "dst", "error"])
    writer.writerows(error_rows)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\nDone.")
print("Manifest:", manif_path)
print("Errors  :", errs_path, "(may be empty)")



## Tips

- If you see **file permission** errors on Windows, try running this notebook from a local Python environment with write access to your dataset folders.
- To **disable resizing**, set `long_side = None` in the config cell and rerun.
- To **reprocess** all images again, set `force_rewrite = True`.
- The script infers the domain ("medical" vs. "logo") from directory names that contain `med` or `logo` (case-insensitive; `med` is checked first, and anything else is treated as "generic"). You can change that logic in `decide_domain`.
