For dataset generation, in particular, use numpy-1.24.3, then uninstall/update numpy to latest model possible for image registration

Extracting random sample from class samples to form our dataset


In [6]:
import os
import shutil
import random
from pathlib import Path

import cv2
import numpy as np
from imgaug import augmenters as iaa
from PIL import Image

output and input paths

In [7]:
# Root of the raw CRC-VAL-HE-7K download; it is searched recursively for .tif tiles.
source_root = Path("D:\\collegeCode\\sem4\\csd212_Project\\data\\raw\\CRC-VAL-HE-7K")

# Flat folder that receives the randomly sampled H&E tiles.
output_dir = Path("D:\\collegeCode\\sem4\\csd212_Project\\data\\sample_HE")

# Create the destination (and any missing parents); a no-op if it already exists.
os.makedirs(output_dir, exist_ok=True)

Getting all .tif images from source subfolders and sampling 1000 images randomly, then copying to output folder

In [8]:
# Collect every .tif below the source tree. Sorting makes the candidate order
# independent of filesystem enumeration order, so the seeded draw below is repeatable.
all_images = sorted(source_root.rglob("*.tif"))
print(f"Found {len(all_images)} images.")
if not all_images:
    raise FileNotFoundError(f"No .tif images found under '{source_root}'")

sample_size = 1000
# random.sample raises ValueError when asked for more items than the population
# holds, so clamp the request to what is actually available.
n_to_copy = min(sample_size, len(all_images))
if n_to_copy < sample_size:
    print(f"Only {len(all_images)} images available; sampling {n_to_copy} instead of {sample_size}.")

# A dedicated, seeded generator makes Restart & Run All pick the same subset
# every time without touching the global `random` state.
sampled_images = random.Random(42).sample(all_images, n_to_copy)

# Copy the chosen tiles under sequential, zero-padded names (0001.tif, 0002.tif, ...).
for i, img_path in enumerate(sampled_images, start=1):
    dest_path = output_dir / f"{i:04d}.tif"
    shutil.copy(img_path, dest_path)

# Report the number actually copied (may be lower than sample_size after clamping).
print(f"Copied {len(sampled_images)} images to '{output_dir.resolve()}'")

Found 7180 images.
Copied 1000 images to 'D:\collegeCode\sem4\csd212_Project\data\sample_HE


Data-PreProcessing
As synthetic data has been generated and is being used, the below pre-processing steps are being skipped as inapplicable
1. Rotation
2. Scaling
3. Translation
4. Patch-based

The following transformations have been applied
1. Grayscale conversion
2. Gaussian smoothing
3. Elastic deformation

In [11]:
#paths: sampled H&E tiles in, one folder per (target, snapshot) pair out
input_dir = Path("D:\\collegeCode\\sem4\\csd212_Project\\data\\sample_HE")
output_root = Path("D:\\collegeCode\\sem4\\csd212_Project\\data\\synthetic_dataset\\train")
output_root.mkdir(parents=True, exist_ok=True)

#elastic deformation operator
elastic = iaa.ElasticTransformation(alpha=40, sigma=6)

#CLAHE operator; it does not depend on the image, so build it once outside the loop
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

#looping through each HE image and processing it
for idx, image_path in enumerate(sorted(input_dir.glob("*.tif"))):
    img = cv2.imread(str(image_path))
    if img is None:
        #cv2.imread returns None instead of raising; report the skip so gaps in the
        #sample_XXXX numbering can be traced back to the offending file
        print(f"Skipping unreadable image: {image_path}")
        continue

    img_resized = cv2.resize(img, (512, 384)) #as specified in paper

    #colour shift in HSV space; only S and V are scaled, hue is left untouched
    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV).astype(np.float32)
    hsv[:, :, 1] *= 1.25 #boosts saturation
    hsv[:, :, 2] *= 0.9  #reduce brightness
    hsv = np.clip(hsv, 0, 255).astype(np.uint8)
    snapshot = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

    #gaussian blur
    snapshot = cv2.GaussianBlur(snapshot, (3, 3), 0)

    #additive zero-mean gaussian noise. The sum is formed in float and clipped
    #before converting back to uint8: casting the signed noise itself to uint8
    #would turn negative samples into large positive values (e.g. -1 -> 255) and
    #saturate roughly half of the pixels to white.
    noise = np.random.normal(0, 5, snapshot.shape)
    snapshot = np.clip(np.rint(snapshot.astype(np.float32) + noise), 0, 255).astype(np.uint8)

    #elastic deformation (applied to the snapshot only; the HE target stays undeformed)
    snapshot = elastic.augment_image(snapshot)

    #convert both images to grayscale
    snapshot_gray = cv2.cvtColor(snapshot, cv2.COLOR_BGR2GRAY)
    he_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)

    #applying CLAHE
    snapshot_gray = clahe.apply(snapshot_gray)
    he_gray = clahe.apply(he_gray)

    #saving HE and snapshots as PNGs
    sample_dir = output_root / f"sample_{idx:04d}"
    sample_dir.mkdir(parents=True, exist_ok=True)

    cv2.imwrite(str(sample_dir / "he_target.png"), he_gray)
    cv2.imwrite(str(sample_dir / "snapshot.png"), snapshot_gray)

print("Synthetic dataset generated. ")



Synthetic dataset generated. 
