In [1]:
from PIL import Image
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import shutil

In [2]:
# Verifies whether resizing or padding is required

def count_image_sizes(folder):
    """Count how often each image size occurs among the PNG files in `folder`.

    Parameters
    ----------
    folder : str
        Directory whose direct entries are inspected (no recursion).

    Returns
    -------
    collections.Counter
        Mapping of (width, height) -> number of PNG files with that size.
        The extension check is case-insensitive.
    """
    image_sizes = []

    for filename in os.listdir(folder):     # os.listdir(folder) returns the names of all entries inside folder

        if filename.lower().endswith(".png"):

            file_path = os.path.join(folder, filename)
            with Image.open(file_path) as image:     # Ensures the image file is automatically closed after use
                image_sizes.append(image.size)     # (width, height)

    return Counter(image_sizes)     # (width, height) to frequency mapping for every unique size


# Label used in the printed report -> raw split directory
raw_split_folders = {
    "Train": "/Users/sasha/Documents/fetal-tumor-segmentation/data/raw/train",
    "Test": "/Users/sasha/Documents/fetal-tumor-segmentation/data/raw/test",
    "Valid": "/Users/sasha/Documents/fetal-tumor-segmentation/data/raw/valid",
}

# One pass per split instead of three copy-pasted loops
for label, folder in raw_split_folders.items():
    size_frequency = count_image_sizes(folder)
    print(f"{label} set ->", size_frequency)

Train set -> Counter({(800, 540): 1405, (800, 542): 8, (783, 541): 2, (738, 541): 2, (797, 541): 2, (780, 544): 2, (788, 545): 2, (796, 542): 2, (782, 542): 2, (790, 539): 2, (796, 544): 2, (780, 539): 2, (789, 540): 2, (798, 541): 1, (799, 563): 1, (789, 545): 1, (791, 544): 1, (794, 544): 1, (783, 543): 1, (786, 542): 1, (794, 543): 1})
Test set -> Counter({(800, 540): 295, (800, 542): 2, (799, 563): 1, (789, 545): 1, (791, 544): 1})
Valid set -> Counter({(800, 540): 250, (786, 542): 1, (798, 541): 1, (794, 543): 1, (794, 544): 1, (783, 543): 1})


In [3]:
root = "/Users/sasha/Documents/fetal-tumor-segmentation"     # Project directory

data_dir = os.path.join(root, "data")

# Raw split directories, derived from data_dir so that changing `root`
# relocates them together with every other path below
raw_dirs = [os.path.join(data_dir, "raw", split) for split in ("train", "test", "valid")]

# Output locations for the size-normalized images and masks
normalized_dir = os.path.join(data_dir, "normalized")
norm_img_dir = os.path.join(normalized_dir, "images")
norm_mask_dir = os.path.join(normalized_dir, "masks")

# Root of the final train/valid/test dataset
final_dir = os.path.join(data_dir, "dataset")

# Target image resolution
target_w = 800
target_h = 544
mask_suffix = "_Annotation.png"     # Suffix used to identify segmentation mask files

In [4]:
def make_dir(path):
    """Create the directory `path`, including missing parents; an existing directory is left as is."""
    os.makedirs(name=path, exist_ok=True)

def crop_or_pad(pil_img, target_w, target_h):
    """Bring an image to exactly (target_w, target_h) by center-cropping and/or zero-padding.

    Dimensions larger than the target are cropped around the center; dimensions
    smaller than the target are padded with zeros, the image being placed in the
    middle of the canvas. When padding happens the returned canvas is a new
    mode "L" image; when no padding is needed the (possibly cropped) input
    image is returned.
    """
    cur_w, cur_h = pil_img.size     # Current image width and height

    # Crop step: shrink every oversized dimension around its center
    keep_w = min(cur_w, target_w)
    keep_h = min(cur_h, target_h)
    if (keep_w, keep_h) != (cur_w, cur_h):
        x0 = (cur_w - keep_w) // 2
        y0 = (cur_h - keep_h) // 2
        pil_img = pil_img.crop((x0, y0, x0 + keep_w, y0 + keep_h))

    # Pad step: nothing left to do when both dimensions already match
    missing_w = target_w - keep_w
    missing_h = target_h - keep_h
    if missing_w <= 0 and missing_h <= 0:
        return pil_img

    # Paste onto a black canvas; integer halving puts any odd extra pixel on the right/bottom
    canvas = Image.new("L", (target_w, target_h), color=0)
    canvas.paste(pil_img, (missing_w // 2, missing_h // 2))
    return canvas

def normalize():

    """
    Normalize raw images and masks:
    - Match image-mask pairs using filename prefixes
    - Ensure size consistency
    - Crop or pad to a fixed resolution
    - Save processed outputs to normalized directories

    Pairs are matched across all directories in `raw_dirs`, so an image and
    its mask may live in different raw folders. Entries that do not end in
    ".png" are ignored. Prefixes that never get both an image and a mask are
    printed at the end.

    Raises
    ------
    ValueError
        If an image and its mask have different sizes.
    """

    # Create output directories
    make_dir(norm_img_dir)
    make_dir(norm_mask_dir)

    # Track prefixes that do not yet have both image and mask
    unresolved = set()

    # File-name (prefix) to path mapping
    seen_images = {}
    seen_masks = {}

    png_ext = ".png"

    for raw_dir in raw_dirs:

        for fname in os.listdir(raw_dir):

            # Skip non-PNG entries (e.g. hidden OS files) instead of treating them as images
            if not fname.endswith(png_ext):
                continue

            path = os.path.join(raw_dir, fname)

            # Categorize file as image or mask and strip only the trailing suffix
            # (str.replace would also remove the text from the middle of a name)
            if fname.endswith(mask_suffix):
                prefix = fname[:-len(mask_suffix)]
                seen_masks[prefix] = path
            else:
                prefix = fname[:-len(png_ext)]
                seen_images[prefix] = path

            # Process both image and mask (if both available)
            if prefix in seen_images and prefix in seen_masks:

                # Convert to grayscale; convert() returns an independent image,
                # so the underlying files can be closed right away
                with Image.open(seen_images[prefix]) as raw_img:
                    img = raw_img.convert("L")     # "L" = Luminance (1 channel, 8-bit)
                with Image.open(seen_masks[prefix]) as raw_mask:
                    mask = raw_mask.convert("L")

                if img.size != mask.size:     # Raise error if size doesn't match
                    raise ValueError(f"Shape mismatch: {prefix}")

                # Crop or pad
                img_final  = crop_or_pad(img, target_w, target_h)
                mask_final = crop_or_pad(mask, target_w, target_h)

                # Save
                img_final.save(os.path.join(norm_img_dir, prefix + ".png"))
                mask_final.save(os.path.join(norm_mask_dir, prefix + mask_suffix))

                # Pair resolved
                unresolved.discard(prefix)

            else:

                unresolved.add(prefix)

    # Print any unmatched image-mask pairs (should be empty)
    print("Unresolved prefixes:", unresolved)

In [5]:
# Run the normalization pass: pairs images with masks across raw_dirs and
# writes the cropped/padded results to norm_img_dir and norm_mask_dir
normalize()

Unresolved prefixes: set()


In [6]:
# Sanity check: verify that all normalized images and masks are 800x544

# Label used in the printed report -> normalized output directory
normalized_folders = {
    "Images (norm)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/normalized/images",
    "Masks (norm)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/normalized/masks",
}

# One loop over both folders instead of two copy-pasted blocks
for label, folder in normalized_folders.items():

    image_sizes = []

    for filename in os.listdir(folder):

        if filename.lower().endswith(".png"):

            file_path = os.path.join(folder, filename)
            with Image.open(file_path) as image:     # Closed automatically after reading the size
                image_sizes.append(image.size)     # (width, height)

    size_frequency = Counter(image_sizes)
    print(f"{label} ->", size_frequency)

Images (norm) -> Counter({(800, 544): 999})
Masks (norm) -> Counter({(800, 544): 999})


In [7]:
# Names of every entry in the normalized image and mask folders
imgs = set(os.listdir(norm_img_dir))
masks = set(os.listdir(norm_mask_dir))

# Reduce each file name to its sample prefix
# Images: drop ".png"
# Masks : drop the mask suffix (e.g., "_Annotation.png")
img_prefixes = set(map(lambda name: name.replace(".png", ""), imgs))
mask_prefixes = set(map(lambda name: name.replace(mask_suffix, ""), masks))

# Prefixes present on one side only (both sets should be empty)
print("Missing images:", mask_prefixes.difference(img_prefixes))
print("Missing masks:", img_prefixes.difference(mask_prefixes))

Missing images: set()
Missing masks: set()


In [8]:
random.seed(42)     # Fix the global RNG state for reproducibility

# Fraction of the samples assigned to each split
splits = dict(train=0.8, valid=0.1, test=0.1)

In [9]:
def split_dataset():
    """Shuffle the normalized samples and copy them into train/valid/test folders.

    Sample ids are taken from the ".png" files in `norm_img_dir`, shuffled with
    the module-level `random` generator, and cut according to the proportions
    in `splits` (the test split receives whatever remains after train and
    valid). For each split, images and masks are copied to
    `<final_dir>/<split>/images` and `<final_dir>/<split>/masks`.

    Files already present in the destination folders are left in place.
    """

    # Keep only PNG files: any other entry would yield a bogus sample id and
    # make the copy below fail.
    # Sort before shuffling: os.listdir returns entries in arbitrary order, so
    # without a fixed starting order the seeded shuffle is not reproducible.
    png_ext = ".png"
    prefixes = sorted(
        f[:-len(png_ext)] for f in os.listdir(norm_img_dir) if f.endswith(png_ext)
    )

    random.shuffle(prefixes)     # Randomly shuffle prefixes to ensure unbiased splitting

    n = len(prefixes)     # Number of samples

    t1 = int(n * splits["train"])     # End index of training set
    t2 = t1 + int(n * splits["valid"])     # End index of validation set

    # Split name to list of sample IDs mapping
    split_map = {
        "train": prefixes[:t1],
        "valid": prefixes[t1:t2],
        "test":  prefixes[t2:]
    }

    for split, sample_ids in split_map.items():

        # Output directories
        img_out  = os.path.join(final_dir, split, "images")
        mask_out = os.path.join(final_dir, split, "masks")

        make_dir(img_out)
        make_dir(mask_out)

        for sample_id in sample_ids:

            # Copy image
            shutil.copy(
                os.path.join(norm_img_dir, sample_id + ".png"),
                os.path.join(img_out, sample_id + ".png")
            )

            # Copy mask
            shutil.copy(
                os.path.join(norm_mask_dir, sample_id + mask_suffix),
                os.path.join(mask_out, sample_id + mask_suffix)
            )

    print("Split completed.")

In [10]:
# Shuffle the normalized samples and copy them into the train/valid/test folders under final_dir
split_dataset()

Split completed.


In [11]:
# Sanity check: report the image sizes found in every folder of the final dataset

dataset_dirs = {
    "Train (imgs)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/train/images",
    "Train (masks)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/train/masks",
    "Test (imgs)":  "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/test/images",
    "Test (masks)":  "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/test/masks",
    "Valid (imgs)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/valid/images",
    "Valid (masks)": "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset/valid/masks",
}

for split_name, folder in dataset_dirs.items():

    # PNG entries of this folder, in listing order (case-insensitive extension check)
    png_names = [entry for entry in os.listdir(folder) if entry.lower().endswith(".png")]

    # Tally (width, height) directly while each file is open
    size_frequency = Counter()
    for png_name in png_names:
        with Image.open(os.path.join(folder, png_name)) as image:
            size_frequency[image.size] += 1

    print(f"{split_name} set -> {size_frequency}")

Train (imgs) set -> Counter({(800, 544): 799})
Train (masks) set -> Counter({(800, 544): 799})
Test (imgs) set -> Counter({(800, 544): 101})
Test (masks) set -> Counter({(800, 544): 101})
Valid (imgs) set -> Counter({(800, 544): 99})
Valid (masks) set -> Counter({(800, 544): 99})


In [12]:
# NumPy arrays are indexed (height, width) whereas PIL reports (width, height),
# so an 800x544 PIL image is expected to become a (544, 800) array
expected_shape = (544, 800)

# File extensions of the images and of the masks
img_ext = ".png"
mask_ext = ".png"

# Split name -> {"images": folder, "masks": folder}, all below one dataset root
_dataset_root = "/Users/sasha/Documents/fetal-tumor-segmentation/data/dataset"

dataset_dirs = {
    split: {kind: f"{_dataset_root}/{split}/{kind}" for kind in ("images", "masks")}
    for split in ("train", "valid", "test")
}

In [13]:
def load_grayscale(path):
    """Load the image at `path` as a single-channel ("L" mode) NumPy array.

    The file is opened in a context manager so its handle is released as soon
    as the pixel data has been converted, instead of staying open until
    garbage collection.
    """
    with Image.open(path) as img:
        return np.array(img.convert("L"))     # Convert PIL image to a NumPy array

def normalize_image(img):
    """Return `img` as float32 with every value divided by 255 (8-bit intensities map to [0, 1])."""
    as_float = img.astype(np.float32)
    return as_float / 255.0

def binarize_mask(mask):
    """Return a uint8 array that is 1 where `mask` is greater than zero and 0 elsewhere."""
    foreground = np.greater(mask, 0)     # Boolean map of the foreground pixels
    return foreground.astype(np.uint8)     # uint8 = unsigned 8-bit integer

In [14]:
def _inspect_sample(img_name, img_dir, mask_dir, mask_files):
    """Load one image/mask pair and compute its quality indicators.

    Returns None when the sample is skipped (mask missing, or image/mask shape
    different from `expected_shape`; a message is printed in each case).
    Otherwise returns a tuple (is_empty, near_border, intensity_diff):
    - is_empty: True when the binarized mask has no foreground pixel
    - near_border: True when the foreground centroid lies in the outer 5% of
      the image height or width
    - intensity_diff: absolute difference between the mean foreground and mean
      background intensity, or None when it cannot be computed
    """
    base_name = img_name.replace(img_ext, "")
    expected_mask_name = base_name + "_Annotation" + mask_ext

    if expected_mask_name not in mask_files:
        print("Missing mask for image:", img_name)
        return None

    img = load_grayscale(os.path.join(img_dir, img_name))
    mask = load_grayscale(os.path.join(mask_dir, expected_mask_name))

    if img.shape != expected_shape:
        print("Image shape mismatch:", img_name, img.shape)
        return None

    if mask.shape != expected_shape:
        print("Mask shape mismatch:", expected_mask_name, mask.shape)
        return None

    img = normalize_image(img)
    mask = binarize_mask(mask)

    # Empty mask: no foreground pixels, nothing else can be measured
    if mask.sum() == 0:
        return True, False, None

    # Centroid (center of mass) of the foreground region
    ys, xs = np.where(mask == 1)     # Coordinates of all foreground pixels
    cy, cx = ys.mean(), xs.mean()

    # Flag centroids OUTSIDE the central band 0.05h <= cy <= 0.95h, 0.05w <= cx <= 0.95w
    h, w = mask.shape
    near_border = cy < 0.05 * h or cy > 0.95 * h or cx < 0.05 * w or cx > 0.95 * w

    # Use the mask to select foreground / background pixels of the image and
    # compare their mean intensities
    tumor_pixels = img[mask == 1]
    bg_pixels = img[mask == 0]

    intensity_diff = None
    if tumor_pixels.size > 0 and bg_pixels.size > 0:
        intensity_diff = abs(tumor_pixels.mean() - bg_pixels.mean())

    return False, near_border, intensity_diff


def verify_split(split_name, img_dir, mask_dir):
    """Check every image/mask pair of one split and print a summary.

    Parameters
    ----------
    split_name : str
        Label printed in the report.
    img_dir, mask_dir : str
        Folders holding the images and the masks of the split.

    Returns
    -------
    list of str
        Sorted image file names found in `img_dir` (including samples that
        were skipped or flagged during verification).
    """

    print("Verifying split:", split_name)

    # Collect image and mask filenames
    img_files = sorted(f for f in os.listdir(img_dir) if f.endswith(img_ext))
    mask_files = {f for f in os.listdir(mask_dir) if f.endswith(mask_ext)}

    print("Images:", len(img_files))
    print("Masks :", len(mask_files))

    # Containers to track data issues
    empty_masks = []     # Masks with no foreground pixels
    border_centroids = []     # Masks whose centroid lies near image borders
    intensity_diffs = []     # Intensity contrast between foreground and background

    for i, img_name in enumerate(img_files, start=1):

        result = _inspect_sample(img_name, img_dir, mask_dir, mask_files)

        if result is not None:
            is_empty, near_border, intensity_diff = result

            if is_empty:
                empty_masks.append(img_name)
            else:
                if near_border:
                    border_centroids.append(img_name)
                if intensity_diff is not None:
                    intensity_diffs.append(intensity_diff)

        # Progress is reported for every 100th sample, including skipped ones
        if i % 100 == 0:
            print("Checked", i, "samples")

    print("Summary:")
    print("Total samples     :", len(img_files))
    print("Empty masks       :", len(empty_masks))
    print("Border centroids  :", len(border_centroids))

    if intensity_diffs:
        print("Mean intensity diff:", round(np.mean(intensity_diffs), 4))
    else:
        print("Mean intensity diff: n/a")

    print()     # Blank line after every summary, whichever branch ran

    return img_files

In [15]:
# Split name -> list of image file names returned by verify_split
split_files = {}

for split_name, paths in dataset_dirs.items():
    split_files[split_name] = verify_split(split_name, paths["images"], paths["masks"])

Verifying split: train
Images: 799
Masks : 799
Checked 100 samples
Checked 200 samples
Checked 300 samples
Checked 400 samples
Checked 500 samples
Checked 600 samples
Checked 700 samples
Summary:
Total samples     : 799
Empty masks       : 0
Border centroids  : 0
Mean intensity diff: 0.2253

Verifying split: valid
Images: 99
Masks : 99
Summary:
Total samples     : 99
Empty masks       : 0
Border centroids  : 0
Mean intensity diff: 0.21

Verifying split: test
Images: 101
Masks : 101
Checked 100 samples
Summary:
Total samples     : 101
Empty masks       : 0
Border centroids  : 0
Mean intensity diff: 0.2219

