# Load the necessary packages

In [6]:
import numpy as np
# import cv2
# import pandas as pd

import skimage.io as ski
import os
# from sklearn.datasets import load_sample_image
# from sklearn.feature_extraction import image
# from patchify import patchify
import tifffile as tiff
from pathlib import Path
from tqdm import tqdm
from Patches_lib import generate_patches, Image_augmentation

# import tensorflow as tf
# import keras
# from keras import layers
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt
# from PIL import Image


# Load images and create and save patches

The large images of size 2856×4096 with the same acquisition conditions are pre-processed and divided into smaller patches of $256 \times 256$ pixels.  Randomly selected patches were used for testing purposes later.

<img src="Figure/Patchings.svg"/>

## Load images

- `patch_size`  
  This defines the size (in pixels) of square patches that the input images are divided into.  
  For example, if `patch_size` is $256$, each image will be processed in blocks of $256 \times 256$ pixels.  
  This is commonly used in patch-based training for deep learning models, particularly in image segmentation tasks.

- `Number_testing_images`  
  This indicates the total number of images used for testing the model's performance.  
  These images are not used during training or validation and are reserved exclusively to evaluate generalization.

- `Number_augmented_images`  
  This specifies the number of augmented versions generated per original image.  
  Data augmentation helps improve model robustness by artificially increasing the training dataset through transformations such as rotations, flips, or brightness changes.

In [7]:
# Stock images and labels name:
Data_folder = 'Data'
patch_size = 256
Number_testing_images   = 20
Number_augmented_images = 10

All_Images_names = sorted(list(Path(Data_folder + '/SEM Image/').glob('*.tif')))
All_Labels_names = sorted(list(Path(Data_folder + '/Labels/').glob('*.tif')))

print(len(All_Images_names), 'Images have been detected and loaded.')

7 Images have been detected and loaded.


## Create and save patches

In [8]:
# Necessary functions to create and save patches
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory created: {path}")
    else:
        print(f"Directory already exists: {path}")

def save_patch(image, label, index, image_id, patch_type, patch_size, base_dir):
    image_path = os.path.join(base_dir, f"{patch_type}_Images_{patch_size}", f"Image_{image_id}_{index}.tif")
    label_path = os.path.join(base_dir, f"{patch_type}_Labels_{patch_size}", f"Image_{image_id}_{index}.tif")
    tiff.imwrite(image_path, image)
    tiff.imwrite(label_path, label)

def prepare_data(images, labels, patch_size, base_dir, save_it, num_test=20):
    for image_id, (img_path, lbl_path) in enumerate(zip(images, labels)):

        print(f"Processing Image {image_id + 1}: {img_path}")

        # Read image and label
        image = ski.imread(img_path)
        label = ski.imread(lbl_path)

        # Generate patches (assumes this function is defined elsewhere)
        patches_img, patches_lbl = generate_patches(image, label, patch_size, Type='Seq')

        # Normalize label
        patches_lbl = (patches_lbl / 85).astype(np.uint8)

        # Define output directories
        for folder in ['Training_Images_', 'Training_Labels_', 'Testing_Images_', 'Testing_Labels_']:
            create_directory(os.path.join(base_dir, f"{folder}{patch_size}"))

        num_patches = patches_img.shape[0]
        test_indices = np.random.choice(num_patches, num_test, replace=False)

        for idx in range(num_patches):
            patch_type = "Testing" if idx in test_indices else "Training"
            if save_it != False:
                save_patch(patches_img[idx], patches_lbl[idx], idx, image_id, patch_type, patch_size, base_dir)
        if save_it != False: 
            print('Patches correctly saved')
        else:
            print('Patches are not saved')

# Load the functions
prepare_data(All_Images_names, All_Labels_names, patch_size=patch_size, base_dir= Data_folder, save_it= True, num_test=Number_testing_images)

Processing Image 1: Data/SEM Image/Image_0.tif
Directory created: Data/Training_Images_256
Directory created: Data/Training_Labels_256
Directory created: Data/Testing_Images_256
Directory created: Data/Testing_Labels_256
Patches correctly saved
Processing Image 2: Data/SEM Image/Image_1.tif
Directory already exists: Data/Training_Images_256
Directory already exists: Data/Training_Labels_256
Directory already exists: Data/Testing_Images_256
Directory already exists: Data/Testing_Labels_256
Patches correctly saved
Processing Image 3: Data/SEM Image/Image_2.tif
Directory already exists: Data/Training_Images_256
Directory already exists: Data/Training_Labels_256
Directory already exists: Data/Testing_Images_256
Directory already exists: Data/Testing_Labels_256
Patches correctly saved
Processing Image 4: Data/SEM Image/Image_3.tif
Directory already exists: Data/Training_Images_256
Directory already exists: Data/Training_Labels_256
Directory already exists: Data/Testing_Images_256
Directory 

# Data Augmentation

<img src="Figure/Augmentation.svg" />


In [9]:
def save_image_and_label(image, label, index, aug_index, image_dir, label_dir):
    """Save image and label pair with a specific naming format."""
    image_path = os.path.join(image_dir, f"Image_{index}_{aug_index}.tif")
    label_path = os.path.join(label_dir, f"Image_{index}_{aug_index}.tif")
    tiff.imwrite(image_path, image)
    tiff.imwrite(label_path, label)

def augment_dataset(data_dir, patch_size, augment_fn, total_images_per_sample=10):
    image_dir = Path(data_dir) / f'Training_Images_{patch_size}'
    label_dir = Path(data_dir) / f'Training_Labels_{patch_size}'
    
    all_images = sorted(image_dir.glob('*.tif'))
    all_labels = sorted(label_dir.glob('*.tif'))

    # Only generate (total_images_per_sample - 1) augmentations since original is included
    num_augmentations = total_images_per_sample - 1
    total_samples = len(all_images) * total_images_per_sample

    aug_image_dir = Path(data_dir) / f'Augmented_Images_{patch_size}'
    aug_label_dir = Path(data_dir) / f'Augmented_Labels_{patch_size}'

    create_directory(aug_image_dir)
    create_directory(aug_label_dir)

    with tqdm(total=total_samples, desc='Data Augmentation') as pbar:
        for idx, (img_path, lbl_path) in enumerate(zip(all_images, all_labels)):
            image = ski.imread(img_path)
            label = ski.imread(lbl_path)

            # Save original image and label
            save_image_and_label(image, label, idx, 0, aug_image_dir, aug_label_dir)
            pbar.update(1)

            # Generate n-1 augmentations
            aug_images, aug_labels = augment_fn(image, [image, label], num_transformations=num_augmentations)

            for aug_idx in range(num_augmentations):
                save_image_and_label(aug_images[aug_idx], aug_labels[aug_idx], idx, aug_idx + 1, aug_image_dir, aug_label_dir)
                pbar.update(1)
                

augment_dataset(data_dir=Data_folder, patch_size=patch_size, augment_fn=Image_augmentation, total_images_per_sample=Number_augmented_images)

Directory created: Data/Augmented_Images_256
Directory created: Data/Augmented_Labels_256


  validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
  result = func(img, *args, **kwargs)
Data Augmentation: 100%|██████████████████████████████████████████████████████████| 10920/10920 [04:13<00:00, 43.08it/s]
