In [31]:
import os
import sys
from pathlib import Path
import numpy as np

# Locate the project root from the working directory, since __file__ is not
# defined inside a notebook kernel.
cwd = Path.cwd().resolve()

# If src/ sits directly in the working directory we are already at the root.
# Otherwise (src/ found one level up, or not found at all) use the parent
# directory, i.e. assume the notebook is being run from notebooks/.
project_root = cwd if (cwd / "src").is_dir() else cwd.parent

# Register the root on sys.path once so the `src` package can be imported.
root_str = str(project_root)
if root_str not in sys.path:
    sys.path.append(root_str)

from src.data_preprocessing import preprocess_pipeline, IMAGE_EXTENSIONS

In [33]:
# Paths
from pathlib import Path

# Reuse project_root if an earlier cell already defined it; otherwise derive
# it here so this cell can also be run on its own.
try:
    project_root
except NameError:
    cwd = Path.cwd().resolve()
    # Use the working directory when it contains src/; in every other case
    # assume the notebook lives one level below the project root.
    project_root = cwd if (cwd / "src").is_dir() else cwd.parent

# Input: extracted raw dataset. Output: root folder for the processed arrays.
source_dir = project_root / "data" / "raw" / "breast-cancer-padded-interpolated-720p"
processed_dir = project_root / "data" / "processed"

# Validate the input before creating anything on disk, so a missing dataset
# (or a wrongly guessed project root) does not leave a stray output directory.
if not source_dir.is_dir():
    raise FileNotFoundError(
        f"Expected extracted dataset at {source_dir}. Please download/extract it before running this notebook."
    )

processed_dir.mkdir(parents=True, exist_ok=True)

In [34]:
# Process each class subfolder
# Size handed to preprocess_pipeline via its target_size argument.
target_size = (224, 224)

def is_image_file(path: Path) -> bool:
    """Return True if the lowercased suffix of ``path`` is in IMAGE_EXTENSIONS.

    Only the file name's suffix is inspected; the file itself is not opened.
    """
    extension = path.suffix.lower()
    return extension in IMAGE_EXTENSIONS

# Defensive re-check so this cell fails with a clear message when run without
# the path-setup cell having validated the dataset location.
if not source_dir.is_dir():
    raise FileNotFoundError(f"Source directory not found: {source_dir}")

# Each immediate subdirectory of source_dir is handled as one class folder;
# sorting keeps the processing order deterministic across runs.
for subfolder in sorted(p for p in source_dir.iterdir() if p.is_dir()):
    # Mirror the class folder name under the processed output directory.
    processed_subfolder = processed_dir / subfolder.name
    processed_subfolder.mkdir(parents=True, exist_ok=True)

    # Sorted so each saved array can be matched to its source file by position.
    image_paths = sorted(p for p in subfolder.iterdir() if p.is_file() and is_image_file(p))
    if not image_paths:
        print(f"No images found in {subfolder}; skipping.")
        continue

    # Apply pipeline to these images
    processed_images = preprocess_pipeline([str(p) for p in image_paths], target_size=target_size)

    # zip() stops at the shorter input without complaint, so a pipeline that
    # returns a different number of arrays than it was given would cause
    # arrays to be saved under the wrong file names or silently dropped.
    # Fail loudly instead of writing misaligned outputs.
    if len(processed_images) != len(image_paths):
        raise ValueError(
            f"preprocess_pipeline returned {len(processed_images)} arrays for "
            f"{len(image_paths)} images in folder {subfolder.name}; "
            "refusing to save misaligned outputs."
        )

    # Save each processed image as .npy (same ordering as image_paths)
    for img_array, img_path in zip(processed_images, image_paths):
        # NOTE(review): output names use only the stem, so two inputs that
        # share a stem but differ in extension would overwrite each other.
        save_path = processed_subfolder / (img_path.stem + ".npy")
        np.save(save_path, img_array)

    print(f"Processed {len(processed_images)} images in folder {subfolder.name}")

Processed 324 images in folder 0_N
Processed 590 images in folder 1_PB
Processed 590 images in folder 1_PB
Processed 374 images in folder 2_UDH
Processed 374 images in folder 2_UDH
Processed 521 images in folder 3_FEA
Processed 521 images in folder 3_FEA
Processed 367 images in folder 4_ADH
Processed 367 images in folder 4_ADH
Processed 529 images in folder 5_DCIS
Processed 529 images in folder 5_DCIS
Processed 473 images in folder 6_IC
Processed 473 images in folder 6_IC
