In [1]:
import numpy as np
import skimage.io as io
import imageio
import os
import shutil
from PIL import Image

### Restructuring the dataset

For MEDIAR to work with the provided images, we need to alter the folder-structure and transform some of the files. 

In [3]:
# Create the destination folders. os.makedirs also creates the parent
# "Datasets" folder, and exist_ok=True keeps this cell safe to re-run
# (a plain `mkdir` errors out once the folders exist).
os.makedirs("Datasets/images", exist_ok=True)
os.makedirs("Datasets/labels", exist_ok=True)

In [4]:
# Root folder of the original dataset (walked recursively further down).
data_dir = "YeaZ_universal_images_and_masks"

# Destination folders for the reorganized images and their labels.
img_dest_dir, label_dest_dir = "Datasets/images", "Datasets/labels"

### Validate 1-1 relationship

For each image (annotated '_im.tif') we should have exactly one corresponding mask/label (annotated '_mask.tif').

In [5]:
import os

def find_unmatched_files(directory):
    """Find images without a mask and masks without an image under ``directory``.

    The tree is walked recursively. A file ending in ``_im.tif`` is an image
    and a file ending in ``_mask.tif`` is a mask; the two belong together when
    they live in the same folder and share the name in front of that suffix.

    Args:
        directory: Root folder to scan.

    Returns:
        A tuple ``(unmatched_im, unmatched_mask)`` of sets. Each entry is the
        path of the file relative to ``directory`` with its ``_im.tif`` /
        ``_mask.tif`` suffix removed.
    """
    im_suffix = '_im.tif'
    mask_suffix = '_mask.tif'
    im_files = set()
    mask_files = set()

    for dirpath, _dirnames, filenames in os.walk(directory):
        for file in filenames:
            if file.endswith(im_suffix):
                bucket, suffix = im_files, im_suffix
            elif file.endswith(mask_suffix):
                bucket, suffix = mask_files, mask_suffix
            else:
                continue

            # Slice the suffix off the end only; str.replace would also drop
            # the same text if it occurred earlier in the file name.
            base_name = file[:-len(suffix)]
            # relpath strips exactly the leading root. A plain string replace
            # of "<directory>/" would also remove nested folders that happen
            # to contain the root's name, making distinct files look alike.
            bucket.add(os.path.relpath(os.path.join(dirpath, base_name), directory))

    # Whatever appears on one side only has no partner.
    return im_files - mask_files, mask_files - im_files


In [6]:
# Run the pairing check over the whole dataset tree and show both result sets.
unmatched_im, unmatched_mask = find_unmatched_files(data_dir)

for description, leftovers in (
    ("Images without corresponding masks:", unmatched_im),
    ("Masks without corresponding images:", unmatched_mask),
):
    print(description, leftovers)

Images without corresponding masks: set()
Masks without corresponding images: set()


**Comment:** we see that the set of images is exactly the same as the set of labels, meaning that we have a 1-1 relationship.

### Shapes of Images and Corresponding Masks

Since we're doing splitting on stacked images, we need to verify that for each stacked image, the corresponding mask has the same number of frames. Otherwise, we won't have a 1-1 relationship between the resulting image-frames and label-frames.

In [7]:
import os
from PIL import Image

def compare_frame_counts_original_dataset(data_dir):
    """List image/mask pairs whose frame counts differ.

    Walks ``data_dir`` recursively. For each ``*_im.tif`` file that has a
    ``*_mask.tif`` sibling in the same folder, both files are opened with PIL
    and their ``n_frames`` values are compared. Images without a mask file
    are skipped.

    Args:
        data_dir: Root folder of the dataset to scan.

    Returns:
        A list of ``(image_name, image_frames, mask_name, mask_frames)``
        tuples, one per pair whose frame counts disagree; empty when every
        pair agrees.
    """
    mismatches = []

    for folder, _subfolders, names in os.walk(data_dir):
        for image_name in (name for name in names if name.endswith('_im.tif')):
            mask_name = image_name.replace('_im.tif', '') + '_mask.tif'
            image_path = os.path.join(folder, image_name)
            mask_path = os.path.join(folder, mask_name)

            # No mask next to the image: nothing to compare.
            if not os.path.exists(mask_path):
                continue

            with Image.open(image_path) as image, Image.open(mask_path) as mask:
                image_frames = image.n_frames
                mask_frames = mask.n_frames

            if image_frames != mask_frames:
                mismatches.append((image_name, image_frames, mask_name, mask_frames))

    return mismatches

# Run the frame-count check on the original dataset; the returned list is
# shown as the cell output and is empty when every image/mask pair agrees.
compare_frame_counts_original_dataset(data_dir)

[]

**Comment:** we observe that unmatched_frame_counts is an empty list. Hence, we have the same number of frames for each image, and its corresponding label.

### Reorganizing Dataset and Splitting Frames

Some of the images in the dataset have multiple layers. MEDIAR does not work properly for such images, hence the need to split these images into separate frames. The model is also set up to work with a specific file structure, meaning that we have to reorganize our dataset for it to work with our files.

In [8]:
def split_tiff_stack(input_file, output_folder, base_filename, is_mask):
    """Save every frame of a TIFF stack as its own file.

    Frame ``i`` is written to ``<base_filename>_frame_<i>.tiff`` or, when
    ``is_mask`` is true, to ``<base_filename>_frame_<i>_label.tiff``.

    Args:
        input_file: Path of the TIFF file to split.
        output_folder: Folder the frame files are written to; created if it
            does not exist yet.
        base_filename: Name prefix shared by all frame files.
        is_mask: Whether ``input_file`` is a mask, i.e. whether the frame
            files get the ``_label`` marker.
    """
    # Build the ending directly instead of patching it in afterwards with
    # str.replace, which would also rewrite a ".tiff" inside base_filename.
    ending = "_label.tiff" if is_mask else ".tiff"

    # An empty folder name means the current directory; nothing to create then.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    with Image.open(input_file) as img:
        for i in range(img.n_frames):
            # Move to frame i so that save() writes that frame.
            img.seek(i)
            output_file = os.path.join(output_folder, f"{base_filename}_frame_{i}{ending}")
            img.save(output_file)

In [9]:
input_file_image = 'YeaZ_universal_images_and_masks/budding-PhC/KS-K-cropped/Schmoller_crop_1_im.tif'
input_file_label = 'YeaZ_universal_images_and_masks/budding-PhC/KS-K-cropped/Schmoller_crop_1_mask.tif'
output_folder = 'test'
base_filename = 'Schmoller_crop_1'

# Scratch folder for the smoke test; exist_ok=True keeps the cell re-runnable
# (a plain `mkdir` errors out once the folder exists).
os.makedirs(output_folder, exist_ok=True)

split_tiff_stack(input_file_image, output_folder, base_filename, False)
split_tiff_stack(input_file_label, output_folder, base_filename, True)

# Actually verify the result: one image file and one label file per frame.
with Image.open(input_file_image) as img:
    expected_frames = img.n_frames

for i in range(expected_frames):
    frame_path = os.path.join(output_folder, f"{base_filename}_frame_{i}.tiff")
    label_path = os.path.join(output_folder, f"{base_filename}_frame_{i}_label.tiff")
    assert os.path.exists(frame_path), f"missing image frame: {frame_path}"
    assert os.path.exists(label_path), f"missing label frame: {label_path}"

**Comment:** the test verifies that the function is working properly. Now, we can reorganize the dataset

In [10]:
# Copy the dataset into the flat images/labels folders, splitting multi-frame
# TIFFs into one file per frame.
#
# Resulting names for a pair "<stem>_im.tif" / "<stem>_mask.tif":
#   single frame : <stem>.tiff               and <stem>_label.tiff
#   multi frame  : <stem>_frame_<i>.tiff     and <stem>_frame_<i>_label.tiff
#
# The stem is obtained by slicing the known suffix off the end of the name.
# Chained str.replace calls would rewrite "_im" / "tif" anywhere in the name
# (e.g. "exp_image_1_im.tif" -> "expage_1.tiff") and break the image/label
# pairing.
#
# NOTE(review): the destination folders are flat, so files with the same name
# in different source folders end up at the same destination path - confirm
# that file names are unique across the dataset.
IMG_SUFFIX = '_im.tif'
MASK_SUFFIX = '_mask.tif'

for dirpath, dirnames, filenames in os.walk(data_dir):
    for f in filenames:
        if f.endswith(IMG_SUFFIX):
            stem = f[:-len(IMG_SUFFIX)]
            dest_dir, single_name, is_mask = img_dest_dir, stem + ".tiff", False
        elif f.endswith(MASK_SUFFIX):
            stem = f[:-len(MASK_SUFFIX)]
            dest_dir, single_name, is_mask = label_dest_dir, stem + "_label.tiff", True
        else:
            # Neither an image nor a mask: leave it alone.
            continue

        src_file = os.path.join(dirpath, f)

        # Only the frame count is needed here; close the file again before
        # it is copied or re-opened for splitting.
        with Image.open(src_file) as img:
            is_stack = img.n_frames > 1

        if is_stack:
            split_tiff_stack(src_file, dest_dir, stem, is_mask)
        else:
            shutil.copyfile(src_file, os.path.join(dest_dir, single_name))

In [11]:
# Number of files written to the image and label folders, in that order.
tuple(len(os.listdir(folder)) for folder in (img_dest_dir, label_dest_dir))

(2914, 2914)