## m2caiSeg Dataset Preprocessing

#### (For relative paths) This notebook is located in a folder that sits next to the `Datasets_BeforePreprocessing` folder, which contains the original `m2caiSeg` dataset being preprocessed

Download the dataset from <https://www.kaggle.com/datasets/salmanmaq/m2caiseg>

## All unique colors in each of the ground truth masks

In [None]:
import os
import cv2
import shutil
import random
import numpy as np
from PIL import Image
from collections import defaultdict

# Root folders of the raw and of the preprocessed dataset, relative to this notebook's folder
base_before_folder = os.path.join("..", "Datasets_BeforePreprocessing", "m2caiSeg")
base_after_folder = os.path.join("..", "Datasets_AfterPreprocessing", "m2caiSeg")

# Ground-truth mask folder of every dataset split, keyed by split name
groundtruth_folders = {
    split_name: os.path.join(base_before_folder, split_name, "groundtruth")
    for split_name in ("trainval", "train", "test")
}

# Function to get unique colors in an image
def get_unique_colors(image_path):
    """Return the distinct RGB colors present in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A list of (R, G, B) tuples, one per distinct color. The list is empty
        when Pillow reports no colors.
    """
    # The context manager releases the file handle as soon as the pixels are converted
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    width, height = image.size

    # An image cannot contain more distinct colors than pixels, so this limit never truncates
    unique_colors = image.getcolors(maxcolors=width * height)

    # getcolors() returns None when the limit is exceeded; report "no colors" instead of crashing
    if not unique_colors:
        return []
    return [color for _count, color in unique_colors]

# Walk every ground-truth folder and report the colors found in each mask file
for dataset_split, folder_path in groundtruth_folders.items():
    print(f"Processing {dataset_split} groundtruth images...")
    for filename in os.listdir(folder_path):
        image_path = os.path.join(folder_path, filename)
        if not os.path.isfile(image_path):
            continue  # skip sub-folders and other non-file entries
        unique_colors = get_unique_colors(image_path)
        print(f"Unique colors in '{dataset_split}/{filename}': {unique_colors}")


Processing trainval groundtruth images...
Unique colors in 'trainval/00_gt.png': [(85, 170, 255), (85, 255, 170), (0, 255, 170), (170, 170, 170), (85, 170, 170), (0, 170, 170), (85, 255, 85), (170, 170, 85), (85, 170, 85), (0, 170, 85), (170, 85, 85), (85, 85, 85), (0, 85, 85), (170, 0, 85), (85, 255, 0), (85, 170, 0), (0, 170, 0), (85, 85, 0), (0, 0, 0)]
Unique colors in 'trainval/015975_gt.png': [(85, 170, 255), (85, 255, 170), (0, 255, 170), (85, 170, 170), (0, 170, 170), (0, 85, 170), (85, 255, 85), (0, 255, 85), (85, 170, 85), (0, 170, 85), (0, 85, 85), (85, 255, 0), (85, 170, 0), (0, 170, 0), (85, 85, 0), (0, 0, 0)]
Unique colors in 'trainval/030750_gt.png': [(85, 170, 255), (255, 0, 255), (170, 0, 255), (85, 255, 170), (170, 170, 170), (85, 170, 170), (0, 170, 170), (170, 85, 170), (0, 85, 170), (170, 0, 170), (85, 255, 85), (0, 255, 85), (170, 170, 85), (85, 170, 85), (0, 170, 85), (170, 85, 85), (85, 85, 85), (0, 85, 85), (170, 0, 85), (85, 0, 85), (85, 255, 0), (85, 170, 0), 

## Number of unique colors in each of the masks

In [None]:
# Function to get the count of unique colors in an image
def count_unique_colors(image_path):
    """Count the distinct RGB colors of the image stored at ``image_path``.

    Returns 0 when Pillow's ``getcolors`` yields nothing (``None`` or an empty list).
    """
    rgb_image = Image.open(image_path).convert('RGB')  # normalise to RGB before counting
    palette = rgb_image.getcolors(maxcolors=1000000)  # generous limit so every color is listed
    if not palette:
        return 0
    return len(palette)

# Report how many distinct colors each ground-truth mask contains
for dataset_split, folder_path in groundtruth_folders.items():
    print(f"Processing {dataset_split} groundtruth images...")
    for filename in os.listdir(folder_path):
        image_path = os.path.join(folder_path, filename)
        if not os.path.isfile(image_path):
            continue  # only regular files are inspected
        unique_color_count = count_unique_colors(image_path)
        print(f"Number of unique colors in '{dataset_split}/{filename}': {unique_color_count}")

Processing trainval groundtruth images...
Number of unique colors in 'trainval/00_gt.png': 19
Number of unique colors in 'trainval/015975_gt.png': 16
Number of unique colors in 'trainval/030750_gt.png': 26
Number of unique colors in 'trainval/03425_gt.png': 16
Number of unique colors in 'trainval/04175_gt.png': 28
Number of unique colors in 'trainval/05150_gt.png': 33
Number of unique colors in 'trainval/06625_gt.png': 16
Number of unique colors in 'trainval/0_gt.png': 22
Number of unique colors in 'trainval/10075_gt.png': 33
Number of unique colors in 'trainval/10275_gt.png': 17
Number of unique colors in 'trainval/1125_gt.png': 17
Number of unique colors in 'trainval/11400_gt.png': 17
Number of unique colors in 'trainval/1225_gt.png': 16
Number of unique colors in 'trainval/12550_gt.png': 15
Number of unique colors in 'trainval/13025_gt.png': 15
Number of unique colors in 'trainval/13700_gt.png': 27
Number of unique colors in 'trainval/14000_gt.png': 17
Number of unique colors in 'tr

## Editing masks to remove the unique mask colors covering less than a minimum pixel area, and copying the "images" into "Frames" folders

In [None]:
# Input (raw) and output (preprocessed) folders of every dataset split.
# Each split maps to its raw images, raw ground-truth masks, and the
# "Frames" / "Masks" folders the preprocessed files are written to.
dataset_folders = {
    split_name: {
        "input_images": os.path.join(base_before_folder, split_name, "images"),
        "input_masks": os.path.join(base_before_folder, split_name, "groundtruth"),
        "output_frames": os.path.join(base_after_folder, split_name, "Frames"),
        "output_masks": os.path.join(base_after_folder, split_name, "Masks"),
    }
    for split_name in ("trainval", "train", "test")
}

# Function to remove colors with pixel area < min_pixel_area and save the edited image
def remove_small_area_colors(image_path, output_path, min_pixel_area=75):
    """Blank out rarely used colors in a mask and save the result.

    Every color that covers fewer than ``min_pixel_area`` pixels in total
    (counted over the whole image, not per connected region) is replaced by
    white (255, 255, 255). All other pixels are kept unchanged.

    Args:
        image_path: Path of the mask image to read.
        output_path: Path the edited RGB image is written to.
        min_pixel_area: Minimum total pixel count a color needs to be kept.
    """
    with Image.open(image_path) as source:
        image = source.convert('RGB')  # Ensure the image is in RGB mode
    width, height = image.size

    # (count, color) pairs; the limit equals the pixel count so no color is dropped
    color_count = image.getcolors(maxcolors=width * height) or []
    colors_to_remove = [color for count, color in color_count if count < min_pixel_area]

    # Vectorised replacement of the former per-pixel Python loop
    rgb = np.array(image)  # writable copy of the pixel data
    remove_mask = np.zeros((height, width), dtype=bool)
    for color in colors_to_remove:
        remove_mask |= np.all(rgb == np.array(color, dtype=rgb.dtype), axis=-1)
    rgb[remove_mask] = 255  # removed colors become white, as on the original white canvas

    # Save the edited image
    Image.fromarray(rgb).save(output_path)

# Copy the frames and write the cleaned masks for every dataset split
for split, paths in dataset_folders.items():
    input_images_dir, input_masks_dir = paths["input_images"], paths["input_masks"]
    output_frames_dir, output_masks_dir = paths["output_frames"], paths["output_masks"]

    # Create both output folders up front (no error if they already exist)
    for output_dir in (output_frames_dir, output_masks_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Frames: plain copy (metadata preserved by copy2) into the Frames folder
    for filename in os.listdir(input_images_dir):
        source_path = os.path.join(input_images_dir, filename)
        destination_path = os.path.join(output_frames_dir, filename)
        if not os.path.isfile(source_path):
            continue
        shutil.copy2(source_path, destination_path)
        print(f"Copied '{filename}' to '{output_frames_dir}' as part of '{split}'.")

    # Masks: strip the small-area colors and store the result in the Masks folder
    for filename in os.listdir(input_masks_dir):
        mask_path = os.path.join(input_masks_dir, filename)
        if not os.path.isfile(mask_path):
            continue
        output_path = os.path.join(output_masks_dir, filename)
        remove_small_area_colors(mask_path, output_path)
        print(f"Processed '{filename}' in '{split}' and saved to '{output_masks_dir}' folder.")


Copied '0.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '00.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '015975.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '030750.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '03425.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '04175.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '05150.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '06625.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '10075.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames' as part of 'trainval'.
Copied '10275.jpg' to '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Frames

## Recalculating the number of unique mask colors in each of the masks after above preprocessing

In [None]:
# Preprocessed "Masks" folder of every dataset split, keyed by split name.
# This rebinds `dataset_folders`, which an earlier cell used for the nested input/output mapping.
dataset_folders = {
    split_name: os.path.join(base_after_folder, split_name, "Masks")
    for split_name in ("trainval", "train", "test")
}

# Function to get the count of unique colors in an image
# (an identical definition already exists in an earlier cell; this one shadows it)
def count_unique_colors(image_path):
    """Return how many distinct RGB colors the image at ``image_path`` contains.

    Yields 0 when Pillow's ``getcolors`` returns ``None`` or an empty list.
    """
    colors = Image.open(image_path).convert('RGB').getcolors(maxcolors=1000000)
    return len(colors or [])

# Count the distinct colors of every preprocessed mask, split by split
for split, masks_dir in dataset_folders.items():
    print(f"Processing unique colors in '{split}' Masks folder...")
    for filename in os.listdir(masks_dir):
        mask_path = os.path.join(masks_dir, filename)
        if not os.path.isfile(mask_path):
            continue  # ignore sub-folders
        unique_color_count = count_unique_colors(mask_path)
        print(f"Number of unique colors in '{split}/{filename}': {unique_color_count}")


Processing unique colors in 'trainval' Masks folder...
Number of unique colors in 'trainval/00_gt.png': 10
Number of unique colors in 'trainval/015975_gt.png': 9
Number of unique colors in 'trainval/030750_gt.png': 13
Number of unique colors in 'trainval/03425_gt.png': 10
Number of unique colors in 'trainval/04175_gt.png': 12
Number of unique colors in 'trainval/05150_gt.png': 15
Number of unique colors in 'trainval/06625_gt.png': 12
Number of unique colors in 'trainval/0_gt.png': 6
Number of unique colors in 'trainval/10075_gt.png': 14
Number of unique colors in 'trainval/10275_gt.png': 10
Number of unique colors in 'trainval/1125_gt.png': 10
Number of unique colors in 'trainval/11400_gt.png': 10
Number of unique colors in 'trainval/1225_gt.png': 10
Number of unique colors in 'trainval/12550_gt.png': 12
Number of unique colors in 'trainval/13025_gt.png': 10
Number of unique colors in 'trainval/13700_gt.png': 12
Number of unique colors in 'trainval/14000_gt.png': 10
Number of unique co

In [None]:
# Function to get unique colors in an image
# (shadows the earlier list-returning definition of the same name; this version returns a set)
def get_unique_colors(image_path):
    """Return the set of distinct (R, G, B) colors in the image at ``image_path``.

    An empty set is returned when Pillow's ``getcolors`` yields ``None`` or nothing.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    counted_colors = rgb_image.getcolors(maxcolors=1000000)  # (count, color) pairs
    if not counted_colors:
        return set()
    return {rgb for _count, rgb in counted_colors}

# One accumulator set per dataset split for the colors seen across all of its masks
overall_unique_colors = {split_name: set() for split_name in ("trainval", "train", "test")}

# Merge the colors of every mask file into its split's accumulator
for split, masks_dir in dataset_folders.items():
    print(f"Collecting unique colors in '{split}' Masks folder...")
    for filename in os.listdir(masks_dir):
        mask_path = os.path.join(masks_dir, filename)
        if not os.path.isfile(mask_path):
            continue  # sub-folders are not inspected
        unique_colors = get_unique_colors(mask_path)
        overall_unique_colors[split].update(unique_colors)

# Show the sorted color inventory of each split
for split, colors in overall_unique_colors.items():
    print(f"\nOverall unique colors in '{split}' Masks folder: {sorted(colors)}")

Collecting unique colors in 'trainval' Masks folder...
Collecting unique colors in 'train' Masks folder...
Collecting unique colors in 'test' Masks folder...

Overall unique colors in 'trainval' Masks folder: [(0, 0, 0), (0, 85, 170), (0, 85, 255), (0, 170, 0), (0, 170, 85), (0, 170, 170), (0, 255, 85), (0, 255, 170), (85, 0, 170), (85, 0, 255), (85, 85, 0), (85, 85, 85), (85, 85, 170), (85, 85, 255), (85, 170, 0), (85, 170, 85), (85, 170, 170), (85, 170, 255), (85, 255, 0), (85, 255, 85), (85, 255, 170), (170, 0, 0), (170, 0, 85), (170, 0, 170), (170, 0, 255), (170, 85, 0), (170, 85, 85), (170, 85, 170), (170, 170, 0), (170, 170, 85), (170, 170, 170), (255, 0, 0), (255, 0, 255), (255, 85, 0), (255, 255, 0), (255, 255, 255)]

Overall unique colors in 'train' Masks folder: [(0, 0, 0), (0, 85, 170), (0, 85, 255), (0, 170, 0), (0, 170, 85), (0, 170, 170), (0, 255, 85), (0, 255, 170), (85, 0, 170), (85, 0, 255), (85, 85, 0), (85, 85, 85), (85, 85, 170), (85, 85, 255), (85, 170, 0), (85, 17

## Splitting each mask into one binary grayscale mask per unique color

In [None]:
# Function to create a grayscale mask for each unique color in a mask image
def create_grayscale_masks(image_path, output_base_dir):
    """Split a color mask into one binary grayscale image per distinct color.

    For every color found in the image, a folder named after the color tuple
    (e.g. ``(0, 85, 170)``) is created under ``output_base_dir``. A grayscale
    image with the same file name as the input is saved inside it, with
    pixels of that color set to 255 and all others to 0.

    Args:
        image_path: Path of the RGB mask image to split.
        output_base_dir: Folder in which the per-color sub-folders are created.
    """
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    width, height = image.size

    # Get unique colors in the image
    unique_colors = image.getcolors(maxcolors=width * height)
    if unique_colors is None:
        print(f"Too many unique colors in '{image_path}' to process.")
        return

    # Loop-invariant work done once: pixel array and output file name
    rgb = np.array(image)
    base_filename = os.path.basename(image_path)

    # Create a grayscale mask for each unique color
    for _color_count, color in unique_colors:
        # Define directory for this unique color
        color_dir = os.path.join(output_base_dir, f"{color}")
        os.makedirs(color_dir, exist_ok=True)

        # Vectorised comparison replaces the former per-pixel loop:
        # True where all three channels equal this color
        matches = np.all(rgb == np.array(color, dtype=rgb.dtype), axis=-1)
        grayscale_image = Image.fromarray(matches.astype(np.uint8) * 255)

        # Save the grayscale mask image in the unique color's folder
        grayscale_image.save(os.path.join(color_dir, base_filename))

# Split every mask file of every split into its per-color grayscale masks.
# The per-color folders are created inside the same Masks folder that is being listed.
for split, masks_dir in dataset_folders.items():
    print(f"Processing grayscale masks for '{split}' Masks folder...")
    for filename in os.listdir(masks_dir):
        mask_path = os.path.join(masks_dir, filename)
        if not os.path.isfile(mask_path):
            continue  # skip per-color folders
        create_grayscale_masks(mask_path, masks_dir)
        print(f"Generated grayscale masks for unique colors in '{split}/{filename}'")


Processing grayscale masks for 'trainval' Masks folder...
Generated grayscale masks for unique colors in 'trainval/00_gt.png'
Generated grayscale masks for unique colors in 'trainval/015975_gt.png'
Generated grayscale masks for unique colors in 'trainval/030750_gt.png'
Generated grayscale masks for unique colors in 'trainval/03425_gt.png'
Generated grayscale masks for unique colors in 'trainval/04175_gt.png'
Generated grayscale masks for unique colors in 'trainval/05150_gt.png'
Generated grayscale masks for unique colors in 'trainval/06625_gt.png'
Generated grayscale masks for unique colors in 'trainval/0_gt.png'
Generated grayscale masks for unique colors in 'trainval/10075_gt.png'
Generated grayscale masks for unique colors in 'trainval/10275_gt.png'
Generated grayscale masks for unique colors in 'trainval/1125_gt.png'
Generated grayscale masks for unique colors in 'trainval/11400_gt.png'
Generated grayscale masks for unique colors in 'trainval/1225_gt.png'
Generated grayscale masks 

## Based on the classes in (https://github.com/salmanmaq/segmentationNetworks/blob/master/datasets/miccaiSegClasses.json), we kept only the mask folders whose colors correspond to those classes

In [None]:
# List of allowed colors from the class list (in RGB format)
allowed_colors = [
    (170, 0, 85), (0, 85, 170), (0, 85, 255), (0, 170, 85), (0, 255, 85),
    (0, 255, 170), (85, 0, 170), (85, 0, 255), (170, 85, 85), (170, 170, 170),
    (85, 170, 0), (85, 170, 255), (85, 255, 0), (85, 255, 170), (170, 0, 255),
    (255, 0, 255), (255, 255, 0), (255, 0, 0), (0, 0, 0)
]

# Function to delete folders not in allowed colors
def remove_unwanted_color_folders(masks_dir):
    """Delete the color-named sub-folders of ``masks_dir`` whose color is not allowed.

    A sub-folder is a candidate only if it is a directory and its name parses
    as an ``(R, G, B)`` triple such as ``(0, 85, 170)``. Candidates whose color
    is missing from ``allowed_colors`` are removed with all their contents.
    Plain files and folders with any other name are left untouched.

    Args:
        masks_dir: Folder containing the per-color sub-folders.
    """
    for folder_name in os.listdir(masks_dir):
        folder_path = os.path.join(masks_dir, folder_name)

        # rmtree only works on directories; never touch loose files
        if not os.path.isdir(folder_path):
            continue

        # Convert folder name to tuple for color comparison
        try:
            folder_color = tuple(int(part) for part in folder_name.strip("()").split(","))
        except ValueError:
            continue  # Skip if folder name is not in RGB format

        # A purely numeric name like "123" parses but is not an RGB triple
        if len(folder_color) != 3:
            continue

        # Check if color is not in allowed colors
        if folder_color not in allowed_colors:
            shutil.rmtree(folder_path)
            print(f"Removed folder with unlisted color: {folder_path}")

# Drop the unlisted color folders from the Masks folder of every split
for split in dataset_folders:
    masks_dir = dataset_folders[split]
    print(f"Cleaning up unwanted color folders in '{split}' Masks folder...")
    remove_unwanted_color_folders(masks_dir)

print("Unwanted color folders removed successfully.")


Cleaning up unwanted color folders in 'trainval' Masks folder...
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(0, 170, 0)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(0, 170, 170)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 0, 0)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 0, 170)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 170, 0)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 170, 85)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 85, 0)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(170, 85, 170)
Removed folder with unlisted color: ..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\(255, 2

## Rename the class folders from their RGB color values to their class names

In [None]:
# Folder name (the RGB tuple as text) -> class name used for the renamed folder
color_to_class_name = dict([
    ("(170, 0, 85)", "unknown"),
    ("(0, 85, 170)", "grasper"),
    ("(0, 85, 255)", "bipolar"),
    ("(0, 170, 85)", "hook"),
    ("(0, 255, 85)", "scissors"),
    ("(0, 255, 170)", "clipper"),
    ("(85, 0, 170)", "irrigator"),
    ("(85, 0, 255)", "specimen-bag"),
    ("(170, 85, 85)", "trocars"),
    ("(170, 170, 170)", "clip"),
    ("(85, 170, 0)", "liver"),
    ("(85, 170, 255)", "gall-bladder"),
    ("(85, 255, 0)", "fat"),
    ("(85, 255, 170)", "upperwall"),
    ("(170, 0, 255)", "artery"),
    ("(255, 0, 255)", "intestine"),
    ("(255, 255, 0)", "bile"),
    ("(255, 0, 0)", "blood"),
    ("(0, 0, 0)", "black"),
])

# Dataset splits whose Masks folders are processed
# (rebinds `dataset_folders` to a plain list of split names)
dataset_folders = ["trainval", "train", "test"]

# Rename every color-named folder that has a known class name
for split in dataset_folders:
    masks_dir = os.path.join(base_after_folder, split, "Masks")
    for folder_name in os.listdir(masks_dir):
        if folder_name not in color_to_class_name:
            continue  # not a color folder we know a class name for

        new_folder_name = color_to_class_name[folder_name]
        old_folder_path = os.path.join(masks_dir, folder_name)
        new_folder_path = os.path.join(masks_dir, new_folder_name)

        # Never rename onto an existing folder
        if os.path.exists(new_folder_path):
            print(f"Skipped renaming '{folder_name}' to '{new_folder_name}' as it already exists.")
        else:
            os.rename(old_folder_path, new_folder_path)
            print(f"Renamed '{folder_name}' to '{new_folder_name}' in '{split}' Masks folder.")


Renamed '(0, 0, 0)' to 'black' in 'trainval' Masks folder.
Renamed '(0, 170, 85)' to 'hook' in 'trainval' Masks folder.
Renamed '(0, 255, 170)' to 'clipper' in 'trainval' Masks folder.
Renamed '(0, 255, 85)' to 'scissors' in 'trainval' Masks folder.
Renamed '(0, 85, 170)' to 'grasper' in 'trainval' Masks folder.
Renamed '(0, 85, 255)' to 'bipolar' in 'trainval' Masks folder.
Renamed '(170, 0, 255)' to 'artery' in 'trainval' Masks folder.
Renamed '(170, 0, 85)' to 'unknown' in 'trainval' Masks folder.
Renamed '(170, 170, 170)' to 'clip' in 'trainval' Masks folder.
Renamed '(170, 85, 85)' to 'trocars' in 'trainval' Masks folder.
Renamed '(255, 0, 0)' to 'blood' in 'trainval' Masks folder.
Renamed '(255, 0, 255)' to 'intestine' in 'trainval' Masks folder.
Renamed '(255, 255, 0)' to 'bile' in 'trainval' Masks folder.
Renamed '(85, 0, 170)' to 'irrigator' in 'trainval' Masks folder.
Renamed '(85, 0, 255)' to 'specimen-bag' in 'trainval' Masks folder.
Renamed '(85, 170, 0)' to 'liver' in 'tr

## Removing the "_gt" from the mask file names for easy handling

In [None]:
# List of dataset splits
dataset_folders = ["trainval", "train", "test"]

# Loop through each dataset split and traverse all subfolders to remove the
# trailing "_gt" (directly before the extension) from mask file names
for split in dataset_folders:
    masks_dir = os.path.join(base_after_folder, split, "Masks")

    # Walk through all subdirectories and files in the Masks directory
    for root, _, files in os.walk(masks_dir):
        for file_name in files:
            # Only a "_gt" that ends the stem is a ground-truth marker; a "_gt"
            # elsewhere in the name is left alone
            stem, extension = os.path.splitext(file_name)
            if not stem.endswith("_gt"):
                continue

            new_name = stem[:-len("_gt")] + extension
            old_file_path = os.path.join(root, file_name)
            new_file_path = os.path.join(root, new_name)

            # os.rename silently overwrites on POSIX and raises on Windows; skip on both
            if os.path.exists(new_file_path):
                print(f"Skipped renaming '{file_name}' to '{new_name}' in folder '{root}' as it already exists.")
                continue

            # Rename the file
            os.rename(old_file_path, new_file_path)
            print(f"Renamed '{file_name}' to '{new_name}' in folder '{root}' for '{split}'")

print("Renaming complete.")


Renamed '04175_gt.png' to '04175.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '05150_gt.png' to '05150.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '14500_gt.png' to '14500.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '1700_gt.png' to '1700.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '1950_gt.png' to '1950.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '20400_gt.png' to '20400.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '20650_gt.png' to '20650.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for 'trainval'
Renamed '20900_gt.png' to '20900.png' in folder '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' for

## Removing masks whose largest connected component (isolated island) has an area of less than 50 pixels (due to errors in mask creation?)

In [None]:
# Threshold for the largest connected component area
min_area_threshold = 50

# Function to calculate the area of the largest connected component
def largest_connected_component_area(image_path):
    """Return the pixel area of the largest connected foreground region of a mask.

    The image is read as grayscale and every non-zero pixel counts as
    foreground. Regions are found with 8-connectivity.

    Args:
        image_path: Path of the mask image to analyse.

    Returns:
        The area in pixels of the largest foreground component as an ``int``,
        or 0 when the image has no foreground pixel.
    """
    # Open the image and convert to grayscale
    with Image.open(image_path) as source:
        image_np = np.array(source.convert("L"))

    # Binarize: every non-zero pixel is foreground. A cv2.threshold at 1 would
    # drop pixels whose value is exactly 1.
    binary_image = (image_np > 0).astype(np.uint8) * 255

    # Find connected components
    num_labels, _labels, stats, _centroids = cv2.connectedComponentsWithStats(binary_image, connectivity=8)

    # Label 0 is the background; without further labels there is no foreground
    if num_labels <= 1:
        return 0

    # Exclude the background component and find the largest area
    return int(stats[1:, cv2.CC_STAT_AREA].max())

# Delete every mask whose largest connected component is smaller than the threshold.
# `dataset_folders` is expected to be the list of split names set by an earlier cell.
for split in dataset_folders:
    masks_dir = os.path.join(base_after_folder, split, "Masks")

    # Visit the Masks folder itself and all of its sub-folders
    for root, _, files in os.walk(masks_dir):
        for file_name in files:
            mask_path = os.path.join(root, file_name)
            largest_area = largest_connected_component_area(mask_path)

            if largest_area >= min_area_threshold:
                continue  # large enough: keep the mask

            os.remove(mask_path)
            print(f"Removed '{file_name}' in '{root}' as largest component area was {largest_area} pixels.")

print("Mask cleanup complete.")


Removed '04175.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 49 pixels.
Removed '05150.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 36 pixels.
Removed '14500.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 7 pixels.
Removed '1700.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 34 pixels.
Removed '1950.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 27 pixels.
Removed '2275.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 16 pixels.
Removed '2700.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component area was 17 pixels.
Removed '2950.png' in '..\Datasets_AfterPreprocessing\m2caiSeg\trainval\Masks\artery' as largest component ar

## Additional manual removal of masks that did not have any organ masks was also done

## Reshuffling the mask split to have better distribution of classes of masks

In [None]:
# Root of the preprocessed dataset and the split names used from here on
base_after_dir = os.path.join("..", "Datasets_AfterPreprocessing", "m2caiSeg")
splits = ["train", "val", "test"]

# "Masks" folder of each split, keyed by split name
mask_split_dirs = {}
for split in splits:
    mask_split_dirs[split] = os.path.join(base_after_dir, split, "Masks")

# Scratch folder that temporarily holds one copy of every mask file
temp_dir = os.path.join(base_after_dir, "TempMasks")
os.makedirs(temp_dir, exist_ok=True)

# Class folder names that take part in the reshuffle
organ_classes = [
    "upperwall",
    "scissors",
    "bipolar",
    "intestine",
    "artery",
    "bile",
    "black",
    "blood",
    "clip",
    "clipper",
    "fat",
    "grasper",
    "hook",
    "irrigator",
    "liver",
    "specimen-bag",
    "trocars",
    "unknown",
]

# Step 1: Collect all unique mask files into the TempMasks folder
# (file names are recorded per class; a name seen in several splits is stored once)
unique_mask_files = defaultdict(set)

print("Collecting unique mask files into TempMasks...")
for split in splits:
    for organ in organ_classes:
        organ_path = os.path.join(mask_split_dirs[split], organ)
        if not os.path.exists(organ_path):
            continue  # this split has no folder for the class
        for mask_file in os.listdir(organ_path):
            mask_path = os.path.join(organ_path, mask_file)
            if not os.path.isfile(mask_path):
                continue
            unique_mask_files[organ].add(mask_file)
            temp_organ_dir = os.path.join(temp_dir, organ)
            os.makedirs(temp_organ_dir, exist_ok=True)
            shutil.copy2(mask_path, temp_organ_dir)

# Step 2: Remove masks from their original locations
print("Removing masks from original locations...")
for split in splits:
    for organ in organ_classes:
        organ_path = os.path.join(mask_split_dirs[split], organ)
        if not os.path.exists(organ_path):
            continue
        for mask_file in os.listdir(organ_path):
            mask_path = os.path.join(organ_path, mask_file)
            if not os.path.isfile(mask_path):
                continue
            os.remove(mask_path)

# Step 3: Distribute masks back to train/val/test in 80/10/10 ratio
print("Distributing masks to train/val/test in 80/10/10 ratio...")

# Dedicated, seeded generator: re-running the cell reproduces the same split
# and the global `random` state is left untouched.
split_rng = random.Random(42)

# NOTE(review): every class is shuffled and split on its own, so masks that share
# a file name under different class folders can land in different splits —
# confirm this is acceptable for the intended evaluation.
for organ, masks in unique_mask_files.items():
    # A set has no stable order; sort first so the seeded shuffle is deterministic
    masks = sorted(masks)
    split_rng.shuffle(masks)
    total_masks = len(masks)

    # Calculate split indices (test receives whatever remains after train and val)
    train_idx = int(total_masks * 0.8)
    val_idx = train_idx + int(total_masks * 0.1)

    # Split masks into train, val, and test
    split_masks = {
        "train": masks[:train_idx],
        "val": masks[train_idx:val_idx],
        "test": masks[val_idx:]
    }

    # Source folder of this class inside TempMasks (same for all three splits)
    temp_organ_dir = os.path.join(temp_dir, organ)

    # Copy masks to their respective directories
    for split, mask_list in split_masks.items():
        organ_split_dir = os.path.join(mask_split_dirs[split], organ)
        os.makedirs(organ_split_dir, exist_ok=True)
        for mask_file in mask_list:
            temp_mask_path = os.path.join(temp_organ_dir, mask_file)
            if os.path.exists(temp_mask_path):
                shutil.copy2(temp_mask_path, organ_split_dir)

# Step 4: Clean up temporary folder
print("Cleaning up TempMasks folder...")
shutil.rmtree(temp_dir)

# Step 5: Print final summary (number of entries per class folder, per split)
print("\nFinal Split Summary:")
for split in splits:
    print(f"\n{split.capitalize()} Split:")
    for organ in organ_classes:
        organ_split_dir = os.path.join(mask_split_dirs[split], organ)
        if not os.path.exists(organ_split_dir):
            continue  # class folder absent in this split: nothing to report
        print(f"  {organ}: {len(os.listdir(organ_split_dir))} masks")


Collecting unique mask files into TempMasks...
Removing masks from original locations...
Distributing masks to train/val/test in 80/10/10 ratio...
Cleaning up TempMasks folder...

Final Split Summary:

Train Split:
  upperwall: 178 masks
  scissors: 12 masks
  bipolar: 8 masks
  intestine: 64 masks
  artery: 90 masks
  bile: 5 masks
  blood: 17 masks
  clipper: 41 masks
  fat: 231 masks
  grasper: 205 masks
  hook: 75 masks
  irrigator: 8 masks
  liver: 237 masks
  specimen-bag: 40 masks
  trocars: 11 masks
  unknown: 31 masks

Val Split:
  upperwall: 22 masks
  scissors: 1 masks
  bipolar: 1 masks
  intestine: 8 masks
  artery: 11 masks
  bile: 0 masks
  blood: 2 masks
  clipper: 5 masks
  fat: 28 masks
  grasper: 25 masks
  hook: 9 masks
  irrigator: 1 masks
  liver: 29 masks
  specimen-bag: 5 masks
  trocars: 1 masks
  unknown: 3 masks

Test Split:
  upperwall: 23 masks
  scissors: 2 masks
  bipolar: 2 masks
  intestine: 9 masks
  artery: 12 masks
  bile: 2 masks
  blood: 3 masks
  

## Reshuffling Frames according to the mask split

In [None]:
# Paths for Masks and Frames in each split
mask_split_dirs = {split: os.path.join(base_after_dir, split, "Masks") for split in splits}
frame_split_dirs = {split: os.path.join(base_after_dir, split, "Frames") for split in splits}

# Temporary directory containing all unique frames
# NOTE(review): no visible cell creates TempFrames — presumably it is prepared by
# hand before this cell is run; confirm.
temp_frames_dir = os.path.join(base_after_dir, "TempFrames")

# Ensure TempFrames exists
if not os.path.exists(temp_frames_dir):
    # Raise instead of calling exit(): inside a notebook exit() terminates the
    # kernel (losing all state), whereas an exception only stops this cell.
    raise FileNotFoundError(f"Error: TempFrames directory does not exist: '{temp_frames_dir}'")

# Step 1: Distribute frames to train/val/test based on masks
print("Distributing frames to train/val/test based on masks...")

# split name -> frames that were expected (from a mask) but not found in TempFrames
missing_frames = defaultdict(list)

for split in splits:
    masks_dir, frames_dir = mask_split_dirs[split], frame_split_dirs[split]
    os.makedirs(frames_dir, exist_ok=True)

    # Every sub-folder of the Masks directory is treated as one class folder
    for organ_folder in os.listdir(masks_dir):
        organ_path = os.path.join(masks_dir, organ_folder)
        if not os.path.isdir(organ_path):
            continue
        for mask_file in os.listdir(organ_path):
            mask_name = os.path.splitext(mask_file)[0]  # mask file name without extension
            temp_frame_path = os.path.join(temp_frames_dir, f"{mask_name}.jpg")  # frames are looked up as `.jpg`
            dest_frame_path = os.path.join(frames_dir, f"{mask_name}.jpg")

            if not os.path.exists(temp_frame_path):
                # Remember the missing frame for the report
                missing_frames[split].append(os.path.join(organ_folder, f"{mask_name}.jpg"))
                continue

            # Copy the frame into the split its mask belongs to
            shutil.copy2(temp_frame_path, dest_frame_path)

# Step 2: Report missing frames
print("\nMissing Frame Report:")
total_missing = 0
for split, missing in missing_frames.items():
    print(f"\nSplit: {split}")
    print(f"Missing frames for {len(missing)} masks:")
    if missing:
        for frame_file in missing[:10]:  # Show up to 10 missing files per split for brevity
            print(f"  {frame_file}")
    total_missing += len(missing)

# Step 3: Final Split Summary
print("\nFinal Split Summary:")
for split in splits:
    frames_dir = frame_split_dirs[split]
    if os.path.exists(frames_dir):
        frame_count = len(os.listdir(frames_dir))
        print(f"{split.capitalize()} Split: {frame_count} frames")

# Step 4: Cleanup TempFrames if all frames were successfully distributed
if total_missing == 0:
    print("\nAll frames distributed successfully. Cleaning up TempFrames folder...")
    shutil.rmtree(temp_frames_dir)
else:
    print("\nSome frames were not distributed. Retaining TempFrames folder for debugging.")
    print(f"Total missing frames: {total_missing}")


Distributing frames to train/val/test based on masks...

Missing Frame Report:

Final Split Summary:
Train Split: 306 frames
Val Split: 277 frames
Test Split: 177 frames

All frames distributed successfully. Cleaning up TempFrames folder...


## Checking if every mask has a corresponding Frame in its Frames folder

In [None]:
# Verify that every mask has its frame inside the Frames folder of the SAME split.
# Comparing only the union of names across splits would hide a mask whose frame
# exists in a different split, so the comparison below is done split by split.

# Step 1: Collect the frame file names present in each split
frames_by_split = {}
unique_frame_files = set()  # union of frame names over all splits (used for the totals)

print("Collecting all unique frame file names across splits...")
for split in splits:
    frames_dir = frame_split_dirs[split]
    split_frames = set()
    if os.path.exists(frames_dir):
        for frame_file in os.listdir(frames_dir):
            if os.path.isfile(os.path.join(frames_dir, frame_file)):
                # Normalize file names
                split_frames.add(frame_file.strip().lower())
    frames_by_split[split] = split_frames
    unique_frame_files |= split_frames

print(f"Total unique frame files collected: {len(unique_frame_files)}")

# Step 2: Collect, per split, the frame names that the masks of that split require
expected_frames_by_split = {}
all_mask_files = set()  # union of required frame names over all splits

print("\nCollecting all mask file names across splits...")
for split in splits:
    masks_dir = mask_split_dirs[split]
    split_expected = set()
    if os.path.exists(masks_dir):
        for organ_folder in os.listdir(masks_dir):
            organ_path = os.path.join(masks_dir, organ_folder)
            if os.path.isdir(organ_path):
                for mask_file in os.listdir(organ_path):
                    # Normalize mask file names and convert to frame file names
                    frame_file_name = os.path.splitext(mask_file.strip().lower())[0] + ".jpg"
                    split_expected.add(frame_file_name)
    expected_frames_by_split[split] = split_expected
    all_mask_files |= split_expected

print(f"Total unique mask files collected: {len(all_mask_files)}")

# Step 3: Compare mask file names with frame file names within each split
missing_frames_by_split = {
    split: expected_frames_by_split[split] - frames_by_split[split] for split in splits
}
# Set of frame names missing in at least one split
missing_frames = set().union(*missing_frames_by_split.values())

# Step 4: Report missing frames, prefixed with the split they are missing from
missing_pair_count = sum(len(names) for names in missing_frames_by_split.values())
print("\nMissing Frames Report:")
print(f"Total missing frames: {missing_pair_count}")
for split in splits:
    for missing_frame in sorted(missing_frames_by_split[split]):
        print(f"  {split}/{missing_frame}")


Collecting all unique frame file names across splits...
Total unique frame files collected: 307

Collecting all mask file names across splits...
Total unique mask files collected: 307

Missing Frames Report:
Total missing frames: 0


## Checking shapes of images vs masks

In [None]:
# Compare the pixel dimensions of every mask with the frame of the same base name
# in the same split, and collect the pairs whose sizes differ.

# organ name -> number of masks whose size differs from their frame
mismatch_counts = {}
# Human-readable description of every mismatching mask/frame pair
not_equal_records = []

# Iterate through each split and check the shapes for equality
for split in splits:
    frames_path = frame_split_dirs[split]
    masks_path = mask_split_dirs[split]

    if not os.path.exists(frames_path) or not os.path.exists(masks_path):
        print(f"Missing directory for split: {split}")
        continue

    # Map frame base name -> frame file name for this split
    frame_files = {os.path.splitext(f)[0]: f for f in os.listdir(frames_path) if f.endswith(('.jpg', '.png'))}

    # Iterate through organ folders in Masks
    for organ in os.listdir(masks_path):
        organ_path = os.path.join(masks_path, organ)
        # Skip stray files sitting next to the organ folders; listing them would raise
        if not os.path.isdir(organ_path):
            continue

        if organ not in mismatch_counts:
            mismatch_counts[organ] = 0

        # Get list of mask images in this organ folder
        mask_files = [f for f in os.listdir(organ_path) if f.endswith('.png')]

        for mask_file in mask_files:
            mask_name = os.path.splitext(mask_file)[0]
            mask_path = os.path.join(organ_path, mask_file)

            # Check if the corresponding frame exists
            if mask_name not in frame_files:
                print(f"[{split}] Missing corresponding frame for mask: {mask_file} in organ: {organ}")
                continue

            frame_path = os.path.join(frames_path, frame_files[mask_name])

            try:
                # Open both images in a context manager so their file handles are
                # closed after each comparison instead of accumulating over the loop.
                with Image.open(frame_path) as frame_img, Image.open(mask_path) as mask_img:
                    frame_size = frame_img.size
                    mask_size = mask_img.size
            except Exception as e:
                print(f"Error processing {mask_name} in split {split}: {e}")
                continue

            # Compare dimensions and log results
            if frame_size == mask_size:
                print(f"[{split}] Shape of mask: {mask_size} == Shape of frame: {frame_size} (Equal)")
            else:
                not_equal_records.append(
                    f"[{split}] Shape of mask: {mask_size} != Shape of frame: {frame_size} (Not Equal) "
                    f"[Organ: {organ}, Mask: {mask_file}, Frame: {frame_files[mask_name]}]"
                )
                mismatch_counts[organ] += 1

# Print all mismatches at the end
print("\nNot Equal Records:")
for record in not_equal_records:
    print(record)

print("\nMismatch counts by organ:")
for organ, count in mismatch_counts.items():
    print(f"{organ}: {count}")


[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (774, 434) == Shape of frame: (774, 434) (Equal)
[train] Shape of mask: (596, 334) == Shape of frame: (596, 334) (Equal)
[train] Shape of mask: (596, 334) == Shape of frame: (596, 334) (Equal)
[train] Shape of mask: (596, 334) == Shape of frame: (596, 334) (Equal)
[train] Shape of mask: (596, 334) == Shape of frame: (596, 334) 

## Removing these mismatched files

In [None]:
# Remove every mask/frame pair that the shape-check cell flagged as mismatched.
#
# The records are read from `not_equal_records` as computed by the shape-check cell,
# not from a pasted copy of an earlier run's output: a pasted list goes stale as soon
# as the files or their split assignment change, and the stale entries are then only
# reported as "not found" while the real mismatches stay in the dataset.
try:
    records_to_remove = list(not_equal_records)
except NameError:
    raise RuntimeError(
        "not_equal_records is not defined; run the shape-check cell before this one."
    ) from None

# Frames deleted by this cell; one frame is shared by the masks of several organs
removed_frame_paths = set()

# Parse and remove files
for record in records_to_remove:
    # Each record looks like:
    # "[split] Shape of mask: ... [Organ: <organ>, Mask: <mask file>, Frame: <frame file>]"
    split = record.split("[")[1].split("]")[0]
    organ = record.split("Organ: ")[1].split(",")[0]
    mask_file = record.split("Mask: ")[1].split(",")[0].strip()
    frame_file = record.split("Frame: ")[1].split("]")[0].strip()

    # Construct paths for mask and frame
    mask_path = os.path.join(base_after_dir, split, "Masks", organ, mask_file)
    frame_path = os.path.join(base_after_dir, split, "Frames", frame_file)

    # Remove the mask file if it exists
    if os.path.exists(mask_path):
        os.remove(mask_path)
        print(f"Removed mask file: {mask_path}")
    else:
        print(f"Mask file not found: {mask_path}")

    # Remove the frame file if it exists; a frame already deleted for another organ's
    # mask is reported as such instead of as a missing file.
    if frame_path in removed_frame_paths:
        print(f"Frame file already removed: {frame_path}")
    elif os.path.exists(frame_path):
        os.remove(frame_path)
        removed_frame_paths.add(frame_path)
        print(f"Removed frame file: {frame_path}")
    else:
        print(f"Frame file not found: {frame_path}")


Removed mask file: ..\Datasets\m2caiSeg\train\Masks\fat\30750.png
Removed frame file: ..\Datasets\m2caiSeg\train\Frames\30750.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\fat\4175.png
Removed frame file: ..\Datasets\m2caiSeg\train\Frames\4175.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\fat\5150.png
Removed frame file: ..\Datasets\m2caiSeg\train\Frames\5150.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\grasper\4175.png
Frame file not found: ..\Datasets\m2caiSeg\train\Frames\4175.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\hook\30750.png
Frame file not found: ..\Datasets\m2caiSeg\train\Frames\30750.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\hook\4175.png
Frame file not found: ..\Datasets\m2caiSeg\train\Frames\4175.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\hook\5150.png
Frame file not found: ..\Datasets\m2caiSeg\train\Frames\5150.jpg
Removed mask file: ..\Datasets\m2caiSeg\train\Masks\intestine\5150.png
Frame file not