## Ureter Dataset Preprocessing

#### (For relative paths) This notebook lives in a folder that sits next to `Datasets_BeforePreprocessing`, the folder that holds the original dataset being preprocessed

Download dataset from -- "https://ieee-dataport.org/documents/ud-ureter-uterine-artery-nerve-dataset"

## Confirming whether each image has 3 or fewer masks (at most 1 per organ type)

In [None]:
import os
import re
import random
import shutil
import numpy as np
import pandas as pd
from PIL import Image
from math import ceil
from collections import defaultdict

# Dataset root, expressed relative to the folder this notebook is run from
base_dir = os.path.join("..", "Datasets_BeforePreprocessing", "UD Ureter-Uterine Artery-Nerve Dataset")

# Image and mask folders inside the dataset root
images_dir = os.path.join(base_dir, "images")
masks_dir = os.path.join(base_dir, "mask")


def _file_stems(folder):
    """Return the names of the regular files in `folder` with their last extension removed."""
    stems = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            stems.append(os.path.splitext(entry)[0])
    return stems


# File names (last extension dropped) of every image and every mask
image_files = _file_stems(images_dir)
mask_files = _file_stems(masks_dir)

# For each image stem, count the mask stems that begin with it
mask_counts = defaultdict(int)
for image_name in image_files:
    mask_counts[image_name] = sum(1 for mask in mask_files if mask.startswith(image_name))

# One row per image: its name and how many masks matched
df_mask_counts = pd.DataFrame(list(mask_counts.items()), columns=['Image Name', 'Mask Count'])

# Write the table as an Excel sheet inside the dataset root
output_path = os.path.join(base_dir, "image_mask_counts.xlsx")
df_mask_counts.to_excel(output_path, index=False)

# Display where the report was written
output_path

'..\\Datasets_BeforePreprocessing\\UD Ureter-Uterine Artery-Nerve Dataset\\image_mask_counts.xlsx'

## Removing masks that have empty labels

In [None]:
# Function to check if a mask has no points (all zeros)
def is_empty_mask(mask_path):
    """Return True when every pixel value of the image at `mask_path` equals 0.

    Args:
        mask_path: Path of the mask image to inspect.

    Returns:
        bool: True if the decoded image array contains only zeros, otherwise False.
    """
    # Decode inside a context manager so the file handle is released before
    # the caller acts on the file (the deletion loop removes empty masks).
    with Image.open(mask_path) as mask_image:
        mask = np.array(mask_image)
    return bool(np.all(mask == 0))

# Running total of masks deleted because they contain no labelled pixels
deleted_mask_count = 0

# Walk the mask folder once; anything that is not an image file is skipped
for mask_file in os.listdir(masks_dir):
    mask_path = os.path.join(masks_dir, mask_file)

    if not (mask_file.endswith(('.png', '.jpg', '.jpeg')) and os.path.isfile(mask_path)):
        continue
    if not is_empty_mask(mask_path):
        continue

    # The mask is all zeros: delete it and count the deletion
    os.remove(mask_path)
    deleted_mask_count += 1

# Output the result
print(f"Total number of empty masks removed: {deleted_mask_count}")


Total number of empty masks removed: 341


## Splitting masks into ureter/artery/nerve folders

In [None]:
# One sub-folder per class inside the dataset's mask folder.
# `masks_dir` is the mask folder defined in the first code cell; the previous
# version referenced `mask_dir`, which is not defined anywhere above this cell
# and therefore failed with a NameError on a fresh kernel.
output_dirs = {
    class_label: os.path.join(masks_dir, class_label)
    for class_label in ("artery", "ureter", "nerve")
}

# Ensure output directories exist
for path in output_dirs.values():
    os.makedirs(path, exist_ok=True)

# Substring looked for in a mask's file name -> class folder it belongs to.
# The code matches "arthery" (sic) for artery masks. Order matters: the first
# marker found wins, mirroring an if/elif chain.
name_markers = (
    ("arthery", "artery"),
    ("ureter", "ureter"),
    ("nerve", "nerve"),
)

# Move every mask file into the folder of the first class marker its name contains
for mask_file in os.listdir(masks_dir):
    mask_path = os.path.join(masks_dir, mask_file)

    # Sub-folders (including the class folders created above) are left alone
    if not os.path.isfile(mask_path):
        continue

    for marker, class_label in name_markers:
        if marker in mask_file:
            shutil.move(mask_path, output_dirs[class_label])
            break

print("Masks have been organized into their respective folders.")

Masks have been organized into their respective folders.


## Removing class identifiers/names of masks

In [None]:
# Class folders live under "<base_dir>/mask/<class>": that is where the
# organizing cell puts them and where the train/val/test split cell reads them
# from. The previous version listed "<base_dir>/<class>", which neither of
# those cells uses.
class_dirs = {
    class_label: os.path.join(base_dir, "mask", class_label)
    for class_label in ("artery", "ureter", "nerve")
}

# Per-class pattern of the identifier to cut out of each file name.
# Nerve names carry an extra "<digits>,<digits>" field, e.g. ".png_0,36_nerve".
class_suffix_patterns = {
    "artery": re.compile(r"\.png_arthery"),
    "ureter": re.compile(r"\.png_ureter"),
    "nerve": re.compile(r"\.png_\d{1,3},\d{1,3}_nerve"),
}

# Same procedure for every class: drop the identifier, rename if the name changed
for class_label, suffix_pattern in class_suffix_patterns.items():
    class_dir = class_dirs[class_label]
    for filename in os.listdir(class_dir):
        new_filename = suffix_pattern.sub("", filename)
        if new_filename == filename:
            # No class identifier in this name; nothing to rename
            continue
        old_path = os.path.join(class_dir, filename)
        new_path = os.path.join(class_dir, new_filename)
        os.rename(old_path, new_path)

print("Renaming completed for all classes.")

Renaming completed for all classes.


## Splitting the nerve, ureter, and uterine artery masks into train/val/test (70/15/15) - also ensuring patient-wise split

In [None]:
# Root of the preprocessed copy of the dataset
after_base_dir = os.path.join("..", "Datasets_AfterPreprocessing", "UD Ureter-Uterine Artery-Nerve Dataset")

# Where each class's masks are read from
classes = ["nerve", "ureter", "artery"]
source_dirs = {}
for class_label in classes:
    source_dirs[class_label] = os.path.join(base_dir, "mask", class_label)

# Where each class's masks go: <after_base_dir>/<split>/<class>
split_dirs = {
    class_label: {
        split_name: os.path.join(after_base_dir, split_name, class_label)
        for split_name in ("train", "val", "test")
    }
    for class_label in classes
}

# Create every split folder up front
for splits in split_dirs.values():
    for split_dir in splits.values():
        os.makedirs(split_dir, exist_ok=True)

# Function to split masks for a class
def split_masks_by_class(class_label, seed=None):
    """Copy one class's masks into train/val/test folders, one split per patient.

    Masks are read from ``source_dirs[class_label]``. The part of each file name
    before the first ``.`` (e.g. ``video_1``) is used as the patient ID, whole
    patients are assigned 70/15/15 to train/val/test, and each patient's masks
    are copied into ``split_dirs[class_label][<split>]``.

    Args:
        class_label: Key into ``source_dirs`` and ``split_dirs``.
        seed: Optional seed for the patient shuffle. With a seed the assignment
            is the same on every run, so re-running the cell copies the same
            files to the same places. With ``None`` (default) the module-level
            ``random`` generator is used and every run may differ; because
            existing copies are never removed, a re-run can then leave one
            patient's masks in more than one split folder.

    Returns:
        None. Progress and a per-split file count are printed.
    """
    print(f"\nProcessing class: {class_label.capitalize()}")

    # Group masks by patient: {patient_id: [mask_paths]}
    source_dir = source_dirs[class_label]
    mask_data = defaultdict(list)
    # Sorted so the result does not depend on the directory listing order
    for mask_file in sorted(os.listdir(source_dir)):
        if mask_file.endswith('.png'):
            patient_id = mask_file.split('.')[0]  # e.g. video_1
            mask_data[patient_id].append(os.path.join(source_dir, mask_file))

    # Shuffle patient IDs, starting from a fixed (sorted) order
    patients = sorted(mask_data)
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(patients)

    # Define split ratios
    train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

    # Cut points in the shuffled list; int() truncates, so test gets the remainder
    num_patients = len(patients)
    train_end = int(num_patients * train_ratio)
    val_end = int(num_patients * (train_ratio + val_ratio))
    train_patients = patients[:train_end]
    val_patients = patients[train_end:val_end]
    test_patients = patients[val_end:]

    # Print patient distribution
    print("Patient Distribution:")
    print(f"Train Patients: {len(train_patients)} -> {train_patients}")
    print(f"Val Patients: {len(val_patients)} -> {val_patients}")
    print(f"Test Patients: {len(test_patients)} -> {test_patients}")

    # Copy masks into respective split directories
    for split, patient_list in [("train", train_patients), ("val", val_patients), ("test", test_patients)]:
        destination = split_dirs[class_label][split]
        for patient in patient_list:
            for mask_path in mask_data[patient]:
                shutil.copy2(mask_path, destination)

    # Print split summary (counts everything currently in each split folder)
    print(f"\nSplit Summary for {class_label.capitalize()}:")
    for split, split_dir in split_dirs[class_label].items():
        num_files = len(os.listdir(split_dir))
        print(f"{split.capitalize()}: {num_files} masks")

# Fixed seed so Restart & Run All reproduces the same patient assignment
SPLIT_SEED = 42

# NOTE(review): patients are shuffled independently for each class, so the same
# video can land in different splits for different classes. Confirm this is
# acceptable, since frames are later gathered into one Frames folder per split.
for class_label in classes:
    split_masks_by_class(class_label, seed=SPLIT_SEED)


Processing class: Nerve
Patient Distribution:
Train Patients: 4 -> ['video_14', 'video_18', 'video_15', 'video_19']
Val Patients: 1 -> ['video_17']
Test Patients: 1 -> ['video_16']

Split Summary for Nerve:
Train: 115 masks
Val: 30 masks
Test: 38 masks

Processing class: Ureter
Patient Distribution:
Train Patients: 8 -> ['video_23', 'video_25', 'video_12', 'video_21', 'video_6', 'video_22', 'video_24', 'video_2']
Val Patients: 2 -> ['video_4', 'video_5']
Test Patients: 2 -> ['video_11', 'video_20']

Split Summary for Ureter:
Train: 165 masks
Val: 35 masks
Test: 54 masks

Processing class: Artery
Patient Distribution:
Train Patients: 15 -> ['video_20', 'video_12', 'video_29', 'video_32', 'video_8', 'video_25', 'video_38', 'video_36', 'video_27', 'video_3', 'video_9', 'video_7', 'video_6', 'video_35', 'video_1']
Val Patients: 3 -> ['video_13', 'video_31', 'video_26']
Test Patients: 4 -> ['video_33', 'video_30', 'video_37', 'video_34']

Split Summary for Artery:
Train: 139 masks
Val: 50 

## Renaming image filenames

In [None]:
# Clean up every file name in the images folder.
# NOTE: this is not idempotent. A name that ends in a single ".png" loses that
# extension too, so running this cell a second time strips the extension left
# by the first run.
for filename in os.listdir(images_dir):
    # Step 1: drop one trailing ".png" when present
    trimmed = filename[:-len('.png')] if filename.endswith('.png') else filename

    # Step 2: drop every ".png_<digits>,<digits>" fragment that remains
    new_filename = re.sub(r"\.png_\d+,\d+", "", trimmed)

    # Nothing to do when neither step changed the name
    if new_filename == filename:
        continue

    os.rename(os.path.join(images_dir, filename), os.path.join(images_dir, new_filename))
    print(f'Renamed: "{filename}" to "{new_filename}"')

print("All redundant extensions and patterns have been removed.")

Renamed: "video_1.mov_0.024000.png.png" to "video_1.mov_0.024000.png"
Renamed: "video_1.mov_0.320667.png.png" to "video_1.mov_0.320667.png"
Renamed: "video_1.mov_0.466000.png.png" to "video_1.mov_0.466000.png"
Renamed: "video_1.mov_0.483667.png.png" to "video_1.mov_0.483667.png"
Renamed: "video_1.mov_0.508000.png.png" to "video_1.mov_0.508000.png"
Renamed: "video_1.mov_0.697333.png.png" to "video_1.mov_0.697333.png"
Renamed: "video_1.mov_0.740333.png.png" to "video_1.mov_0.740333.png"
Renamed: "video_1.mov_0.848667.png.png" to "video_1.mov_0.848667.png"
Renamed: "video_1.mov_0.877667.png.png" to "video_1.mov_0.877667.png"
Renamed: "video_1.mov_1.020333.png.png" to "video_1.mov_1.020333.png"
Renamed: "video_1.mov_1.198333.png.png" to "video_1.mov_1.198333.png"
Renamed: "video_1.mov_1.300000.png.png" to "video_1.mov_1.300000.png"
Renamed: "video_1.mov_1.535000.png.png" to "video_1.mov_1.535000.png"
Renamed: "video_1.mov_1.759333.png.png" to "video_1.mov_1.759333.png"
Renamed: "video_1.mo

## Moving the mask class folders into "Masks" and copying the image files into "Frames" in each of the train/val/test split folders, following the mask splits

In [None]:
# Define train, val, and test folders
mask_splits = ["train", "val", "test"]

# Iterate through each split (train, val, test)
for split in mask_splits:
    split_dir = os.path.join(after_base_dir, split)  # Current split directory
    # Dedicated name on purpose: `masks_dir` already holds the raw dataset's
    # mask folder (first code cell) and rebinding it here would silently break
    # any earlier cell that is re-run afterwards.
    split_masks_dir = os.path.join(split_dir, "Masks")  # New Masks directory

    # Ensure the Masks directory exists
    os.makedirs(split_masks_dir, exist_ok=True)

    # Move class folders (artery, nerve, ureter) into Masks
    for class_folder in ["artery", "nerve", "ureter"]:
        class_path = os.path.join(split_dir, class_folder)
        if os.path.isdir(class_path):
            shutil.move(class_path, split_masks_dir)  # Move the class folder to Masks
            print(f"Moved: {class_folder} to {split_masks_dir}")
        else:
            # Missing, or already moved by an earlier run of this cell
            print(f"Warning: {class_folder} not found in {split_dir}")

print("Class folders have been moved into Masks in train/val/test splits.")

Moved: artery to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks
Moved: nerve to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks
Moved: ureter to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks
Moved: artery to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\val\Masks
Moved: nerve to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\val\Masks
Moved: ureter to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\val\Masks
Moved: artery to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\test\Masks
Moved: nerve to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\test\Masks
Moved: ureter to ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\test\Masks
Class folders have been moved into Masks in train/val/test splits.


In [None]:
# One Frames folder per split in the AfterPreprocessing tree
image_split_dirs = {}
for split in mask_splits:
    image_split_dirs[split] = os.path.join(after_base_dir, split, "Frames")

# Create the Frames folders
for frames_dir in image_split_dirs.values():
    os.makedirs(frames_dir, exist_ok=True)

# For every mask in a split, copy the image with the same base name into that split's Frames
for split in mask_splits:
    mask_dir = os.path.join(after_base_dir, split, "Masks")
    image_dest_dir = image_split_dirs[split]

    # Class sub-folders of Masks (artery, nerve, ureter); plain files are ignored
    for class_folder in os.listdir(mask_dir):
        class_path = os.path.join(mask_dir, class_folder)
        if not os.path.isdir(class_path):
            continue

        for mask_file in os.listdir(class_path):
            base_name = os.path.splitext(mask_file)[0]

            # First existing candidate wins: "<base>.png" is tried before "<base>.jpg"
            candidates = (base_name + ext for ext in (".png", ".jpg"))
            image_file = next(
                (name for name in candidates if os.path.exists(os.path.join(images_dir, name))),
                None,
            )

            if image_file is None:
                print(f"Warning: No matching image found for {mask_file} in {images_dir}")
                continue

            shutil.copy2(
                os.path.join(images_dir, image_file),
                os.path.join(image_dest_dir, image_file),
            )
            print(f"Copied: {image_file} to {split}/Frames")

print("Images have been split into train/val/test Frames based on mask splits.")

Copied: video_1.mov_0.024000.png to train/Frames
Copied: video_1.mov_0.320667.png to train/Frames
Copied: video_1.mov_0.466000.png to train/Frames
Copied: video_1.mov_0.483667.png to train/Frames
Copied: video_1.mov_0.508000.png to train/Frames
Copied: video_1.mov_0.697333.png to train/Frames
Copied: video_1.mov_0.740333.png to train/Frames
Copied: video_1.mov_0.848667.png to train/Frames
Copied: video_1.mov_0.877667.png to train/Frames
Copied: video_1.mov_1.020333.png to train/Frames
Copied: video_1.mov_1.198333.png to train/Frames
Copied: video_1.mov_1.300000.png to train/Frames
Copied: video_1.mov_1.535000.png to train/Frames
Copied: video_1.mov_1.759333.png to train/Frames
Copied: video_1.mov_1.788667.png to train/Frames
Copied: video_1.mov_10.017000.png to train/Frames
Copied: video_1.mov_10.127667.png to train/Frames
Copied: video_1.mov_10.177000.png to train/Frames
Copied: video_1.mov_10.220333.png to train/Frames
Copied: video_1.mov_10.310667.png to train/Frames
Copied: video_1

## Converting boolean masks (`False`/`True`) into `0`/`255` uint8 masks

In [None]:
# Function to convert masks to {0, 255} format
def convert_masks_to_uint8(mask_folder):
    """Rewrite every image file directly inside `mask_folder` as a {0, 255} uint8 mask.

    Files whose path ends in .png, .jpg or .jpeg are loaded, normalised and saved
    back to the same path. A message is printed per file, and any exception
    raised while handling a file is printed instead of propagated.

    Args:
        mask_folder: Folder whose files are processed (sub-folders are not entered).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    for filename in os.listdir(mask_folder):
        mask_path = os.path.join(mask_folder, filename)
        if not mask_path.endswith(image_suffixes):
            continue  # not an image file

        try:
            mask_array = np.array(Image.open(mask_path))

            # "Boolean-like": bool dtype, or exactly the two values 0/False and 1/True
            boolean_like = (
                mask_array.dtype == np.bool_
                or np.array_equal(np.unique(mask_array), [False, True])
            )
            already_binary_uint8 = (
                not boolean_like
                and mask_array.dtype == np.uint8
                and set(np.unique(mask_array)) <= {0, 255}
            )

            if boolean_like:
                mask_array = (mask_array * 255).astype(np.uint8)
            elif not already_binary_uint8:
                # Anything else: every positive value becomes 255, the rest 0
                mask_array = ((mask_array > 0) * 255).astype(np.uint8)

            # The file is written back (and reported) even when nothing changed
            Image.fromarray(mask_array).save(mask_path)
            print(f"Converted: {mask_path}")

        except Exception as e:
            print(f"Error processing {mask_path}: {e}")

# Standardize the masks of each split, one class folder at a time
for split in mask_splits:
    split_dir = os.path.join(after_base_dir, split, "Masks")

    if not os.path.exists(split_dir):
        print(f"'{split}' folder not found in the dataset.")
        continue

    print(f"Processing {split} Masks...")
    # Every directory below Masks is handed to the converter; files placed
    # directly in Masks itself are not processed.
    for root, dirs, _ in os.walk(split_dir):
        for dir_name in dirs:
            convert_masks_to_uint8(os.path.join(root, dir_name))

print("All masks have been standardized to {0, 255} in train, val, and test folders.")

Processing train Masks...
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.024000.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.320667.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.466000.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.483667.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.508000.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.697333.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery\video_1.mov_0.740333.png
Converted: ..\Datasets_AfterPreprocessing\UD Ureter-Uterine Artery-Nerve Dataset\train\Masks\artery

In [None]:
# Folders (relative to each split) whose files get the short names
sub_folders = ["Frames", os.path.join("Masks", "ureter"), os.path.join("Masks", "artery"), os.path.join("Masks", "nerve")]

# Rename files to the compact pattern "<video>_<seconds>_<fraction>.png"
def rename_files_detailed_pattern(images_dir):
    """Rename "video_<n>.<ext>_<s>.<f>..." files under `images_dir` to "<n>_<s>_<f>.png".

    The folder is walked recursively. Files whose name does not contain the
    pattern are left untouched. If the target name already exists in the same
    folder, the file is skipped with a warning instead of being renamed, so an
    existing file is never overwritten.

    Args:
        images_dir: Root folder to process.
    """
    name_pattern = re.compile(r"video_(\d+)\.\w+_(\d+)\.(\d+)")

    for root, _, files in os.walk(images_dir):
        for file_name in files:
            match = name_pattern.search(file_name)
            if not match:
                continue

            # Video number, whole-second part and fractional part of the timestamp
            video_number, main_timestamp, fractional_timestamp = match.groups()

            # NOTE(review): the new name always ends in ".png", whatever the
            # original extension was - confirm this is intended for non-PNG files.
            new_name = f"{video_number}_{main_timestamp}_{fractional_timestamp}.png"
            old_path = os.path.join(root, file_name)
            new_path = os.path.join(root, new_name)

            # os.rename would overwrite an existing file on POSIX and raise on
            # Windows; skip so the behaviour is the same on both.
            if os.path.exists(new_path):
                print(f'Warning: "{new_path}" already exists, skipped "{old_path}"')
                continue

            os.rename(old_path, new_path)
            print(f'Renamed: "{old_path}" to "{new_path}"')

# Apply the renaming to every listed sub-folder of every split.
# The loop variable is deliberately not called `images_dir`: that name holds
# the raw dataset's image folder (first code cell) and rebinding it here would
# break earlier cells if they are re-run afterwards.
for split in mask_splits:
    for sub_folder in sub_folders:
        target_dir = os.path.join(after_base_dir, split, sub_folder)
        if os.path.exists(target_dir):
            rename_files_detailed_pattern(target_dir)
        else:
            print(f"Folder not found: {target_dir}")

print("Renaming completed with detailed pattern.")

Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_1.mov_0.024000.png" to "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\1_0_024000.png"
Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_1.mov_0.320667.png" to "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\1_0_320667.png"
Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_1.mov_0.466000.png" to "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\1_0_466000.png"
Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_1.mov_0.483667.png" to "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\1_0_483667.png"
Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_1.mov_0.508000.png" to "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\1_0_508000.png"
Renamed: "..\Datasets\UD Ureter-Uterine Artery-Nerve Dataset\train\Frames\video_