#### Note on relative paths: this notebook lives in a subfolder that sits next to `Datasets_BeforePreprocessing` (the folder containing the original dataset), which is why every path below starts with `..`

Download dataset from -- "https://github.com/CAMMA-public/Endoscapes?tab=readme-ov-file"

## Endoscapes dataset preprocessing of masks

In [None]:
import glob
import os
import re
import shutil
from collections import defaultdict

import cv2
import numpy as np
import pandas as pd
from PIL import Image

## Removing unnecessary files and those not related to segmentation

In [None]:
# Define base directories with relative paths (reused by later cells)
base_before_folder = os.path.join("..", "Datasets_BeforePreprocessing", "endoscapes")
base_after_folder = os.path.join("..", "Datasets_AfterPreprocessing", "Endoscapes")

# Folders of the raw dataset that are not needed for mask preprocessing.
# "val_seg" and "test_seg" are intentionally NOT deleted: the final cell of this
# notebook lists their contents to decide which frames go to the val/test splits,
# so removing them here would make that cell fail on a fresh run.
folders_to_delete = ["12_5", "25", "semseg"]

# Loop through each folder and delete it
for folder_name in folders_to_delete:
    folder_path = os.path.join(base_before_folder, folder_name)

    # isdir() is False for missing paths, so no separate exists() check is needed
    if os.path.isdir(folder_path):
        shutil.rmtree(folder_path)  # Permanently delete the folder and its contents
        print(f"Deleted: {folder_path}")
    else:
        print(f"Folder does not exist: {folder_path}")

print("Specified folders have been permanently deleted.")


Deleted: ..\Datasets_BeforePreprocessing\endoscapes\12_5
Deleted: ..\Datasets_BeforePreprocessing\endoscapes\25
Deleted: ..\Datasets_BeforePreprocessing\endoscapes\val_seg
Deleted: ..\Datasets_BeforePreprocessing\endoscapes\test_seg
Deleted: ..\Datasets_BeforePreprocessing\endoscapes\semseg
Specified folders have been permanently deleted.


## Keeping only the jpg files (in folders train, val, and test) that have masks in "insseg"

In [None]:
# Raw image folders to scan (in this order), the folder holding the instance
# masks, and the output folder that receives the selected frames
source_folders = [
    os.path.join(base_before_folder, split_name)
    for split_name in ("val", "train", "test")
]
insseg_folder = os.path.join(base_before_folder, "insseg")
frames_folder = os.path.join(base_after_folder, "Frames")

# Make sure the destination exists before anything is copied
os.makedirs(frames_folder, exist_ok=True)

# Names (extension stripped) of every .npy mask file found in insseg
matching_image_names = {
    os.path.splitext(entry)[0]
    for entry in os.listdir(insseg_folder)
    if entry.endswith(".npy")
}

# Copy every file whose extension-less name has a mask counterpart
for source_folder in source_folders:
    for image_file in os.listdir(source_folder):
        stem, _extension = os.path.splitext(image_file)
        if stem not in matching_image_names:
            continue
        shutil.copy2(
            os.path.join(source_folder, image_file),
            os.path.join(frames_folder, image_file),
        )
        print(f"Copied: {image_file} from {source_folder} to Frames")

print("Matching images copied to Frames folder.")


Copied: 126_11550.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 126_12300.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 126_13050.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 126_13800.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_40375.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_41125.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_41875.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_42625.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_43375.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_44125.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_44875.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to Frames
Copied: 131_45625.jpg from ..\Datasets_BeforePreprocessing\endoscapes\val to

## Checking shapes of all .npy files

In [None]:
# Folder containing the .npy mask files (relative path)
base_folder = os.path.join("..", "Datasets_BeforePreprocessing", "endoscapes", "insseg")

# Report the size of the first axis of every .npy array in the folder
npy_names = [entry for entry in os.listdir(base_folder) if entry.endswith(".npy")]
for npy_name in npy_names:
    loaded = np.load(os.path.join(base_folder, npy_name))
    # Zero-dimensional arrays have no first axis to report
    if loaded.ndim > 0:
        leading_size = loaded.shape[0]
    else:
        leading_size = "N/A"
    print(f"{npy_name}: First dimension = {leading_size}")


100_27925.npy: First dimension = 6
100_28675.npy: First dimension = 6
100_29425.npy: First dimension = 6
100_30175.npy: First dimension = 6
100_30925.npy: First dimension = 6
100_31675.npy: First dimension = 6
100_32425.npy: First dimension = 7
100_33175.npy: First dimension = 2
100_33925.npy: First dimension = 5
100_34675.npy: First dimension = 7
100_35425.npy: First dimension = 6
100_36175.npy: First dimension = 4
100_36925.npy: First dimension = 5
102_32750.npy: First dimension = 5
102_33500.npy: First dimension = 5
102_34250.npy: First dimension = 6
102_35000.npy: First dimension = 7
102_35750.npy: First dimension = 6
105_15700.npy: First dimension = 7
105_16450.npy: First dimension = 6
105_17200.npy: First dimension = 7
105_17950.npy: First dimension = 6
105_18700.npy: First dimension = 7
105_19450.npy: First dimension = 4
105_20200.npy: First dimension = 5
105_20950.npy: First dimension = 7
105_21700.npy: First dimension = 7
105_22450.npy: First dimension = 4
105_23200.npy: First

## Splitting masks into individual organ/instrument masks from the numpy file (.npy)

In [None]:
# Define paths: per-frame mask stacks (.npy) with their label files (.csv) are read
# from insseg; one PNG per mask is written below the Masks output folder
base_before_folder_sub = os.path.join("..", "Datasets_BeforePreprocessing", "endoscapes", "insseg")
base_after_folder_sub = os.path.join("..", "Datasets_AfterPreprocessing", "Endoscapes", "Masks")

# Create the Masks directory if it does not exist
os.makedirs(base_after_folder_sub, exist_ok=True)

# Iterate over all .npy files in the insseg directory
for filename in os.listdir(base_before_folder_sub):
    if not filename.endswith(".npy"):
        continue

    base_name = os.path.splitext(filename)[0]  # Get the base name without extension
    npy_path = os.path.join(base_before_folder_sub, filename)
    csv_path = os.path.join(base_before_folder_sub, f"{base_name}.csv")

    # The CSV provides the identifier written into each mask's name; skip without it
    if not os.path.exists(csv_path):
        print(f"CSV file not found for {filename}. Skipping.")
        continue

    # Load the stack of masks; masks[i] is written out as one image below
    masks = np.load(npy_path)

    # Skip processing if the masks array is empty
    if masks.size == 0:
        print(f"No data in {filename}. Skipping.")
        continue

    # Scale 0/1 masks to 0/255. The cast to uint8 happens first because multiplying
    # a boolean array by 255 promotes it to a wide integer dtype, which is not a
    # standard 8-bit image for cv2.imwrite; uint8 input is unaffected by the cast.
    if masks.max() == 1:
        masks = masks.astype(np.uint8) * 255

    # Load the CSV data (no header row; column 0 holds the identifier)
    csv_data = pd.read_csv(csv_path, header=None)

    # Running count of how often each identifier has been seen in this file
    identifier_counts = defaultdict(int)

    # Masks and CSV rows are paired by position; the two lengths may differ
    num_csv_rows = len(csv_data)
    num_masks = masks.shape[0]

    # Iterate through each mask and save it based on the CSV guidance
    for i in range(num_masks):
        if i < num_csv_rows:
            # Use the identifier from the CSV if available
            identifier = str(csv_data.iloc[i, 0]).zfill(2)
            identifier_counts[identifier] += 1

            # Always append the per-identifier instance number (01, 02, ...)
            instance_suffix = f"_{str(identifier_counts[identifier]).zfill(2)}"
            identifier_with_suffix = f"{identifier}{instance_suffix}"
        else:
            # No CSV row for this mask: fall back to its 1-based position.
            # NOTE(review): this position is not a class label -- confirm that such
            # masks are really wanted in a folder named like a class.
            identifier_with_suffix = f"{i + 1}.0"

        # Create a separate directory for each unique identifier
        mask_folder = os.path.join(base_after_folder_sub, identifier_with_suffix)
        os.makedirs(mask_folder, exist_ok=True)

        # Save the mask with the specified naming convention
        mask_filename = f"{base_name}_{identifier_with_suffix}.png"
        mask_path = os.path.join(mask_folder, mask_filename)
        cv2.imwrite(mask_path, masks[i])
        print(f"Saved {mask_filename} in {mask_folder}")

print("Processing complete.")


Saved 100_27925_2.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\2.0_01
Saved 100_27925_3.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\3.0_01
Saved 100_27925_4.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\4.0_01
Saved 100_27925_5.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\5.0_01
Saved 100_27925_6.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_01
Saved 100_27925_6.0_02.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_02
Saved 100_28675_2.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\2.0_01
Saved 100_28675_3.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\3.0_01
Saved 100_28675_4.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\4.0_01
Saved 100_28675_5.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\5.0_01
Saved 100_28675_6.0_01.png in ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_01
Saved 100_28675_6.0_02.png in ..\Datasets_AfterPreprocessing\Endo

## Consolidating instrument masks

In [None]:
# Define the base folder for the Masks directory
base_folder = os.path.join("..", "Datasets_AfterPreprocessing", "Endoscapes", "Masks")


def group_instance_folders(folder_names):
    """Group per-instance folder names by their class identifier.

    A per-instance folder is named "<class>_<NN>" (for example "6.0_01").
    Names that do not follow this pattern (already combined folders such as
    "6.0", or renamed class folders) are ignored.

    Args:
        folder_names: iterable of folder names (not paths).

    Returns:
        dict mapping each class identifier to the sorted list of its
        instance folder names.
    """
    instance_pattern = re.compile(r"^(\d+(?:\.\d+)?)_\d+$")
    grouped = defaultdict(list)
    for name in sorted(folder_names):
        match = instance_pattern.match(name)
        if match:
            grouped[match.group(1)].append(name)
    return dict(grouped)


# Discover the instance folders of EVERY class instead of hard-coding class 6.0:
# the class-renaming cell further down expects one folder per class ("1.0" ... "6.0"),
# so each class needs its instance masks merged into a single per-frame mask.
if os.path.isdir(base_folder):
    existing_folders = [
        name for name in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, name))
    ]
else:
    existing_folders = []

for class_id, folders_to_combine in sorted(group_instance_folders(existing_folders).items()):
    target_folder = os.path.join(base_folder, class_id)
    os.makedirs(target_folder, exist_ok=True)

    # Collect the frame names present in any instance folder of this class.
    # Files are named "<frame>_<folder>.png", so the folder suffix is stripped.
    base_names = set()
    for folder in folders_to_combine:
        suffix = f"_{folder}.png"
        for filename in os.listdir(os.path.join(base_folder, folder)):
            if filename.endswith(suffix):
                base_names.add(filename[:-len(suffix)])

    # Merge all instance masks of one frame into a single class mask
    for base_name in sorted(base_names):
        combined_mask = None

        for folder in folders_to_combine:
            mask_path = os.path.join(base_folder, folder, f"{base_name}_{folder}.png")
            if not os.path.exists(mask_path):
                continue

            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            if mask is None:
                # cv2.imread signals an unreadable file by returning None
                print(f"Could not read {mask_path}. Skipping.")
                continue

            # Pixel-wise OR keeps every pixel that is set in any instance
            if combined_mask is None:
                combined_mask = mask
            else:
                combined_mask = cv2.bitwise_or(combined_mask, mask)

        # Save the combined mask if any part was found and combined
        if combined_mask is not None and combined_mask.size > 0:
            combined_mask_path = os.path.join(target_folder, f"{base_name}_{class_id}.png")
            cv2.imwrite(combined_mask_path, combined_mask)
            print(f"Saved combined mask: {combined_mask_path}")
        else:
            print(f"No valid masks found for {base_name}. Skipping save.")

print("Mask combination complete.")


Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\82_23350_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\116_33825_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\100_30175_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\119_86250_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\119_89250_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\43_10950_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\119_66000_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\159_58550_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\65_20800_6.0.png
Saved combined mask: ..\Datasets_AfterPreprocessing\Endoscapes\Masks\6.0_combined\119_74250_6.0.png
Sav

## Rename "Masks" .png filenames into their original "Frame" names (run the following cell twice)

In [None]:
# Define the base folder for Masks
masks_folder = os.path.join("..", "Datasets_AfterPreprocessing", "Endoscapes", "Masks")

# Matches "<kept>_<last>.png" where <kept> itself still contains an underscore,
# i.e. a name with at least one "_<part>" beyond the first two parts
mask_suffix_pattern = re.compile(r'^(.*?_[^_]+)_[^_]+\.png$')


def frame_name_from_mask(file_name):
    """Strip trailing "_<part>" suffixes until only "<a>_<b>.png" remains.

    One substitution removes a single trailing part, so
    "100_27925_6.0_01.png" needs two passes. Repeating until the name stops
    changing gives the final name in one call, so the cell no longer has to
    be executed twice. Names that do not match are returned unchanged.

    Args:
        file_name: mask file name (not a path).

    Returns:
        The file name with all extra suffix parts removed.
    """
    while True:
        stripped = mask_suffix_pattern.sub(r'\1.png', file_name)
        if stripped == file_name:
            return file_name
        file_name = stripped


# Walk through each subdirectory in the Masks directory
for root, dirs, files in os.walk(masks_folder):
    for file_name in files:
        if not file_name.endswith(".png"):
            continue

        new_name = frame_name_from_mask(file_name)
        if new_name == file_name:
            continue  # Already in its final form; re-running the cell is a no-op

        # Define full paths for renaming
        old_file_path = os.path.join(root, file_name)
        new_file_path = os.path.join(root, new_name)

        # Never replace an existing file: os.rename would overwrite it silently on
        # POSIX systems and raise on Windows
        if os.path.exists(new_file_path):
            print(f"Skipped '{file_name}': '{new_name}' already exists in {root}")
            continue

        os.rename(old_file_path, new_file_path)
        print(f"Renamed '{file_name}' to '{new_name}'")

print("Renaming complete.")

Renamed '100_32425_1.0.png' to '100_32425.png'
Renamed '100_33925_1.0.png' to '100_33925.png'
Renamed '100_34675_1.0.png' to '100_34675.png'
Renamed '100_35425_1.0.png' to '100_35425.png'
Renamed '102_34250_1.0.png' to '102_34250.png'
Renamed '102_35000_1.0.png' to '102_35000.png'
Renamed '102_35750_1.0.png' to '102_35750.png'
Renamed '105_15700_1.0.png' to '105_15700.png'
Renamed '105_16450_1.0.png' to '105_16450.png'
Renamed '105_17200_1.0.png' to '105_17200.png'
Renamed '105_17950_1.0.png' to '105_17950.png'
Renamed '105_18700_1.0.png' to '105_18700.png'
Renamed '105_19450_1.0.png' to '105_19450.png'
Renamed '105_20200_1.0.png' to '105_20200.png'
Renamed '105_20950_1.0.png' to '105_20950.png'
Renamed '105_21700_1.0.png' to '105_21700.png'
Renamed '105_24700_1.0.png' to '105_24700.png'
Renamed '110_38800_1.0.png' to '110_38800.png'
Renamed '110_39550_1.0.png' to '110_39550.png'
Renamed '110_40300_1.0.png' to '110_40300.png'
Renamed '110_41800_1.0.png' to '110_41800.png'
Renamed '116_

## Renaming the classes folders into organs/tissues

1.0 - Cystic Plate
2.0 - Hepatocystic triangle
3.0 - Cystic Artery
4.0 - Cystic Duct
5.0 - Gall Bladder
6.0 - Instruments

In [None]:
# Define the mapping from numeric class folder names to organ/tissue names
class_mapping = {
    "1.0": "Cystic Plate",
    "2.0": "Hepatocystic triangle",
    "3.0": "Cystic Artery",
    "4.0": "Cystic Duct",
    "5.0": "Gall Bladder",
    "6.0": "Instruments"
}

# Resolve the Masks directory here instead of relying on `base_folder`: that name
# is assigned by several earlier cells with different meanings (the insseg folder,
# then the Masks folder), so its value depends on which cell ran last.
masks_root = os.path.join("..", "Datasets_AfterPreprocessing", "Endoscapes", "Masks")

# Iterate over each folder in the Masks directory
for folder_name in os.listdir(masks_root):
    folder_path = os.path.join(masks_root, folder_name)

    # Only directories whose name is a key of the mapping are renamed
    if folder_name not in class_mapping or not os.path.isdir(folder_path):
        continue

    new_folder_name = class_mapping[folder_name]
    new_folder_path = os.path.join(masks_root, new_folder_name)

    # Leave both folders alone if the target name is already taken (for example
    # when the cell is run again) rather than failing or merging them
    if os.path.exists(new_folder_path):
        print(f"Target '{new_folder_name}' already exists; left '{folder_name}' untouched")
        continue

    # Rename the folder
    os.rename(folder_path, new_folder_path)
    print(f"Renamed '{folder_name}' to '{new_folder_name}'")

print("Renaming complete.")


Renamed '1.0' to 'Cystic Plate'
Renamed '2.0' to 'Hepatocystic triangle'
Renamed '3.0' to 'Cystic Artery'
Renamed '4.0' to 'Cystic Duct'
Renamed '5.0' to 'Gall Bladder'
Renamed '6.0' to 'Instruments'
Renaming complete.


## Splitting the whole Frames and Masks into train, val, and test (based on the val_seg and test_seg files)

In [None]:
# Folders whose file names define membership of the validation and test splits
val_seg_folder = os.path.join(base_before_folder, "val_seg")
test_seg_folder = os.path.join(base_before_folder, "test_seg")

# Build the output tree: <split>/Frames, <split>/Masks and one folder per class
# below each <split>/Masks
splits = ["train", "val", "test"]
for split in splits:
    frames_path = os.path.join(base_after_folder, split, "Frames")
    masks_path = os.path.join(base_after_folder, split, "Masks")

    # Same creation order as listed: Frames, Masks, then the class folders
    wanted_dirs = [frames_path, masks_path]
    for class_name in ["Cystic Plate", "Hepatocystic triangle", "Cystic Artery", "Cystic Duct", "Gall Bladder", "Instruments"]:
        wanted_dirs.append(os.path.join(masks_path, class_name))

    for directory in wanted_dirs:
        os.makedirs(directory, exist_ok=True)

# Helper: names of the .png/.jpg files in a folder, with the extension removed
def get_file_stems(folder_path):
    """Return the set of extension-less names of all .png and .jpg files in folder_path.

    Entries with any other extension are ignored. Only the last extension is
    removed, so "a.b.png" yields "a.b".
    """
    stems = set()
    for entry in os.listdir(folder_path):
        if entry.endswith(".png") or entry.endswith(".jpg"):
            stem, _extension = os.path.splitext(entry)
            stems.add(stem)
    return stems

# Frame names (without extension) assigned to the validation and test splits
val_filenames, test_filenames = (
    get_file_stems(seg_folder) for seg_folder in (val_seg_folder, test_seg_folder)
)

# Flat folders that currently hold every frame and every per-class mask
frames_folder, masks_folder = (
    os.path.join(base_after_folder, leaf) for leaf in ("Frames", "Masks")
)

# Move one frame and all of its class masks into the folders of a split
def move_files(file_stem, split):
    """Move the frame named file_stem and its per-class masks into split.

    The frame is looked up in the global frames_folder as .png first, then
    .jpg; only the first one found is moved. For every class folder below the
    global masks_folder, "<file_stem>.png" is moved when it exists. Missing
    files are skipped silently.
    """
    split_root = os.path.join(base_after_folder, split)
    target_frames_path = os.path.join(split_root, "Frames")
    target_masks_path = os.path.join(split_root, "Masks")

    # First existing candidate wins (.png is tried before .jpg)
    frame_name = next(
        (
            candidate
            for candidate in (f"{file_stem}.png", f"{file_stem}.jpg")
            if os.path.exists(os.path.join(frames_folder, candidate))
        ),
        None,
    )
    if frame_name is not None:
        shutil.move(
            os.path.join(frames_folder, frame_name),
            os.path.join(target_frames_path, frame_name),
        )

    # Masks share the frame's stem and are always stored as .png
    mask_name = f"{file_stem}.png"
    for class_name in ["Cystic Plate", "Hepatocystic triangle", "Cystic Artery", "Cystic Duct", "Gall Bladder", "Instruments"]:
        mask_file = os.path.join(masks_folder, class_name, mask_name)
        if not os.path.exists(mask_file):
            continue
        shutil.move(mask_file, os.path.join(target_masks_path, class_name, mask_name))

# Send every frame (with its masks) to its split: test membership is checked
# first, then val; everything else goes to train
for file in os.listdir(frames_folder):
    if not file.endswith((".png", ".jpg")):
        continue

    file_stem = os.path.splitext(file)[0]
    if file_stem in test_filenames:
        destination_split = "test"
    elif file_stem in val_filenames:
        destination_split = "val"
    else:
        destination_split = "train"
    move_files(file_stem, destination_split)

print("Files have been organized into train, val, and test splits with Frames and Masks subfolders.")


Files have been organized into train, val, and test splits with Frames and Masks subfolders.
