
# ITFFC — Preprocessing Notebook (Images → PNG + Cleaned Folders)

This notebook performs **image preprocessing** on the dataset located at:

```
C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset
```

### What it does
- Scans all subfolders under `Dataset` (e.g., `logo`, `medical`, ...), and then each **class** subfolder (e.g., `Normal`, `Lung_Opacity`, `Viral Pneumonia`, `NonDemented`, `MildDemented`, etc.).  
- For **each class folder**, creates a sibling folder named `<class>_processed` (e.g., `Normal_processed`).  
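- Loads images, applies minimal, safe preprocessing (conversion to RGB, downscaling so that the longer side is at most `long_side` pixels, and optional CLAHE contrast enhancement), and saves **PNG** versions with compression to reduce space.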
- Writes a **manifest CSV** with basic metadata (original path, processed path, width/height, detected domain, and processing status).  
- Skips files that are not images and handles errors gracefully (logged to a CSV).


In [1]:

# =========================
# Configuration
# =========================

# IMPORTANT: Use a raw string for Windows paths (prefix r)
# Root folder that is scanned recursively for class folders. The manifest and
# error CSV files are written directly into this folder as well.
DATASET_ROOT = r"C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset"

# Preprocessing parameters
# Maximum length in pixels of the longer image side. Larger images are scaled
# down keeping their aspect ratio; smaller images are left untouched. A value
# <= 0 disables resizing.
long_side = 512
# Apply CLAHE contrast enhancement to images whose path is classified as "medical".
apply_clahe_medical = True
# Apply CLAHE contrast enhancement to images whose path is classified as "logo".
apply_clahe_logo = False

# File handling
# Extensions (compared against the lower-cased file suffix) treated as images.
valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
# When False, an output PNG that already exists is kept and the source is
# recorded in the manifest with status "skipped_exists".
force_rewrite = False
# Compression level handed to save_png() as its compress_level argument.
png_compress_level = 6

# Parallelism
# NOTE(review): num_workers is not read anywhere in the visible code;
# processing runs sequentially.
num_workers = 0


In [2]:

# =========================
# Imports
# =========================
import os
import sys
import csv
import math
import traceback
from pathlib import Path
from functools import partial

from PIL import Image, ImageOps
import numpy as np

# OpenCV is an optional dependency that is only needed for CLAHE. If the import
# fails for any reason, _HAS_CV2 is set to False and apply_clahe_rgb() returns
# its input unchanged.
try:
    import cv2  # For CLAHE
    _HAS_CV2 = True
except Exception:
    _HAS_CV2 = False

from tqdm import tqdm

# Print the interpreter and library versions so that a run can be reproduced.
print("Python:", sys.version)
print("PIL version:", Image.__version__)
print("cv2 available:", _HAS_CV2)


Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
PIL version: 11.1.0
cv2 available: True


In [3]:

# =========================
# Utilities
# =========================

def is_image_file(path: Path):
    """Tell whether *path* carries one of the accepted image extensions.

    Only the suffix is inspected (case-insensitively) against ``valid_exts``;
    the file itself is never opened.
    """
    extension = path.suffix.lower()
    return extension in valid_exts

def ensure_rgb(img: Image.Image) -> Image.Image:
    """Return *img* converted to 3-channel RGB.

    The previous implementation branched on ``img.mode`` but executed the very
    same ``convert("RGB")`` call in both arms, so the check was dead code; a
    single unconditional conversion is equivalent.

    NOTE(review): Pillow's ``convert("RGB")`` is understood to discard an
    alpha channel rather than composite it onto a background - confirm that
    this is acceptable for logos with transparency.
    """
    return img.convert("RGB")

def resize_keep_aspect(img: Image.Image, max_side: int) -> Image.Image:
    """Shrink *img* so that its longer side is at most *max_side* pixels.

    The aspect ratio is preserved and bicubic resampling is used. The image is
    returned untouched when *max_side* is ``None`` or not positive, or when
    the image already fits. Images are never enlarged.
    """
    if max_side is None or max_side <= 0:
        return img

    width, height = img.size
    longest = max(width, height)
    if longest <= max_side:
        return img

    ratio = max_side / float(longest)
    # Clamp to 1 px so extremely elongated images never get a zero-sized side.
    target_size = tuple(max(1, int(round(side * ratio))) for side in (width, height))
    return img.resize(target_size, Image.BICUBIC)

def apply_clahe_rgb(img: Image.Image) -> Image.Image:
    """Enhance local contrast with CLAHE on the lightness channel only.

    The image is converted RGB -> LAB, CLAHE (clip limit 2.0, 8x8 tile grid)
    is applied to the L channel while A and B are left as they are, and the
    result is converted back to RGB. When OpenCV is not available the input
    image is returned unchanged.
    """
    if not _HAS_CV2:
        return img

    rgb_pixels = np.array(img.convert("RGB"))
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))

    return Image.fromarray(cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2RGB))

def save_png(img: Image.Image, out_path: Path, compress_level: int = 6, *, optimize: bool = False):
    """Write *img* to *out_path* as a PNG, creating parent folders as needed.

    Args:
        img: Image to save.
        out_path: Destination file path.
        compress_level: zlib compression level (0 = none, 9 = smallest/slowest).
        optimize: Ask Pillow to search for the smallest possible encoding.

    Pillow's PNG writer ignores ``compress_level`` (it forces level 9) whenever
    ``optimize`` is true. The flag used to be hard-coded to ``True``, which
    silently disabled the configured compression level; it is now off by
    default so that ``compress_level`` takes effect, and callers that want the
    smallest files can opt in explicitly with ``optimize=True``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(out_path, format="PNG", optimize=optimize, compress_level=int(compress_level))

def decide_domain(path_in_dataset: Path) -> str:
    """Classify a dataset-relative path as "medical", "logo" or "generic".

    Every path component is lower-cased and searched for a keyword. "med" is
    tested first (the short keyword also matches misspelt folder names such as
    "medcial"), then "logo"; if neither occurs the domain is "generic".
    """
    lowered = tuple(component.lower() for component in path_in_dataset.parts)
    # Order matters: a path matching both keywords is reported as "medical".
    for keyword, domain in (("med", "medical"), ("logo", "logo")):
        for component in lowered:
            if keyword in component:
                return domain
    return "generic"

def processed_name(class_dir: Path) -> Path:
    """Build the output folder path for *class_dir*.

    The result lives next to *class_dir* (same parent) and carries the same
    name with "_processed" appended.
    """
    suffixed_name = class_dir.name + "_processed"
    return class_dir.with_name(suffixed_name)

def gather_class_dirs(dataset_root: Path) -> list[Path]:
    """Return the sorted leaf "class" directories below *dataset_root*.

    A class directory is a directory that contains at least one image file and
    no sub-directories, e.g. ``.../Dataset/medical/covid19/Normal``.

    Directories whose name ends in "_processed" are skipped. They are the
    output folders of an earlier run, and since they are themselves leaf
    folders full of PNGs they would otherwise be picked up on a rerun and
    processed again into "<class>_processed_processed".
    """
    class_dirs = []
    for p in dataset_root.rglob("*"):
        if not p.is_dir():
            continue
        if p.name.endswith("_processed"):
            # Output of a previous run, never an input class folder.
            continue

        has_images = False
        has_subdirs = False
        for entry in p.iterdir():
            if entry.is_dir():
                # Not a leaf folder; no need to look at the remaining entries.
                has_subdirs = True
                break
            if not has_images and entry.is_file() and is_image_file(entry):
                has_images = True

        if has_images and not has_subdirs:
            class_dirs.append(p)
    return sorted(class_dirs)

def rel_to_root(path: Path, root: Path) -> Path:
    """Return *path* relative to *root*, or *path* itself if it is not under *root*.

    ``Path.relative_to`` signals "not a sub-path" with ``ValueError``; only
    that case is treated as a fallback so that unrelated programming errors
    are no longer hidden by a blanket ``except Exception``.
    """
    try:
        return path.relative_to(root)
    except ValueError:
        return path


In [4]:

# =========================
# Main processing
# =========================

root = Path(DATASET_ROOT)
# Raise explicitly instead of using assert: assertions are stripped under -O.
if not root.exists():
    raise FileNotFoundError(f"DATASET_ROOT not found: {root}")

class_dirs = gather_class_dirs(root)
print(f"Found {len(class_dirs)} class folders (leaf dirs with images):")
for d in class_dirs:
    print(" -", rel_to_root(d, root))

# Rows for the two CSV reports written at the end of the run.
manifest_rows = []
error_rows = []


def process_one(src_path: Path, dst_path: Path, domain: str):
    """Preprocess one image and save it as PNG.

    Steps: load and convert to RGB, downscale to ``long_side`` if enabled,
    apply CLAHE when enabled for *domain*, then write *dst_path*.

    Returns:
        The ``(width, height)`` of the saved image.
    """
    # Load. The context manager releases the source file handle as soon as the
    # pixel data has been converted, instead of leaving it to the garbage
    # collector (one leaked handle per image otherwise).
    with Image.open(src_path) as opened:
        img = ensure_rgb(opened)
    # Domain-specific steps
    if long_side and long_side > 0:
        img = resize_keep_aspect(img, long_side)
    if domain == "medical" and apply_clahe_medical:
        img = apply_clahe_rgb(img)
    elif domain == "logo" and apply_clahe_logo:
        img = apply_clahe_rgb(img)
    # Save
    save_png(img, dst_path, png_compress_level)
    return img.size


for class_dir in class_dirs:
    out_dir = processed_name(class_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    domain = decide_domain(rel_to_root(class_dir, root))
    # Sorted so that processing and manifest order are reproducible.
    files = sorted(p for p in class_dir.iterdir() if p.is_file() and is_image_file(p))
    print(f"\nProcessing {rel_to_root(class_dir, root)}  ->  {rel_to_root(out_dir, root)}  ({len(files)} files)")

    # Sources that share a stem (e.g. a.jpg and a.png) would all map to the
    # same "<stem>.png" and the later ones would be silently reported as
    # "skipped_exists". Collect such stems (case-insensitively, as on Windows)
    # so that their outputs can be given distinct names below.
    seen_stems = set()
    clashing_stems = set()
    for p in files:
        key = p.stem.lower()
        if key in seen_stems:
            clashing_stems.add(key)
        else:
            seen_stems.add(key)

    for src in tqdm(files):
        if src.stem.lower() in clashing_stems:
            # Keep the original extension in the name to stay unique.
            dst = out_dir / f"{src.stem}_{src.suffix.lstrip('.').lower()}.png"
        else:
            dst = out_dir / (src.stem + ".png")

        if dst.exists() and not force_rewrite:
            # Collect existing metadata quickly (only the header is read)
            try:
                with Image.open(dst) as _im:
                    w, h = _im.size
            except Exception:
                # Unreadable existing output: record it with an unknown size.
                w = h = -1
            manifest_rows.append([str(src), str(dst), w, h, domain, "skipped_exists"])
            continue

        try:
            w, h = process_one(src, dst, domain)
        except Exception as e:
            # Best effort: log the failure and carry on with the next file.
            error_rows.append([str(src), str(dst), repr(e)])
            print("Error on:", src, "->", e)
            traceback.print_exc()
        else:
            manifest_rows.append([str(src), str(dst), w, h, domain, "ok"])

# Write manifests
manif_path = root / "preprocess_manifest.csv"
errs_path = root / "preprocess_errors.csv"
with open(manif_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["src", "dst", "width", "height", "domain", "status"])
    writer.writerows(manifest_rows)

with open(errs_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["src", "dst", "error"])
    writer.writerows(error_rows)

print("\nDone.")
print("Manifest:", manif_path)
print("Errors  :", errs_path, "(may be empty)")


Found 8 class folders (leaf dirs with images):
 - logo\Logos
 - medcial\alzheimer\MildDemented
 - medcial\alzheimer\ModerateDemented
 - medcial\alzheimer\NonDemented
 - medcial\alzheimer\VeryMildDemented
 - medcial\covid19\Lung_Opacity\images
 - medcial\covid19\Normal\images
 - medcial\covid19\Viral Pneumonia\images
\nProcessing logo\Logos  ->  logo\Logos_processed  (1435 files)


100%|██████████| 1435/1435 [00:50<00:00, 28.33it/s]


\nProcessing medcial\alzheimer\MildDemented  ->  medcial\alzheimer\MildDemented_processed  (896 files)


100%|██████████| 896/896 [00:22<00:00, 39.55it/s]


\nProcessing medcial\alzheimer\ModerateDemented  ->  medcial\alzheimer\ModerateDemented_processed  (64 files)


100%|██████████| 64/64 [00:01<00:00, 38.98it/s]


\nProcessing medcial\alzheimer\NonDemented  ->  medcial\alzheimer\NonDemented_processed  (3200 files)


100%|██████████| 3200/3200 [01:24<00:00, 37.87it/s]


\nProcessing medcial\alzheimer\VeryMildDemented  ->  medcial\alzheimer\VeryMildDemented_processed  (2240 files)


100%|██████████| 2240/2240 [00:58<00:00, 38.38it/s]


\nProcessing medcial\covid19\Lung_Opacity\images  ->  medcial\covid19\Lung_Opacity\images_processed  (6012 files)


100%|██████████| 6012/6012 [11:06<00:00,  9.02it/s]


\nProcessing medcial\covid19\Normal\images  ->  medcial\covid19\Normal\images_processed  (10192 files)


100%|██████████| 10192/10192 [17:13<00:00,  9.86it/s]


\nProcessing medcial\covid19\Viral Pneumonia\images  ->  medcial\covid19\Viral Pneumonia\images_processed  (1345 files)


100%|██████████| 1345/1345 [02:07<00:00, 10.55it/s]

\nDone.
Manifest: C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset\preprocess_manifest.csv
Errors  : C:\Users\bacht\Desktop\Master2_S1\ITFFC\Dataset\preprocess_errors.csv (may be empty)



