In [None]:
# from PIL import Image
# import os

# input_base_folder = 'CRC-Dataset'
# output_base_folder = 'CRC-Dataset_resized'

# # List of class folders
# class_folders = os.listdir(input_base_folder)

# # Iterate through class folders
# for class_folder in class_folders:
#     input_folder = os.path.join(input_base_folder, class_folder)
#     output_folder = os.path.join(output_base_folder, class_folder)

#     # Create the output folder if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)

#     # Resize images and save them to the output folder
#     for filename in os.listdir(input_folder):
#         image_path = os.path.join(input_folder, filename)

#         # Load the image using PIL
#         image = Image.open(image_path)

#         # Resize the image to your desired size (e.g., 224x224)
#         target_size = (224, 224)
#         image = image.resize(target_size)

#         # Save the resized image to the output folder
#         save_path = os.path.join(output_folder, filename)
#         image.save(save_path)

# print("Resized images done")


Resized images done


In [11]:
from PIL import Image
import numpy as np
import os
from skimage.restoration import denoise_nl_means, estimate_sigma
from skimage import img_as_ubyte

# Root folder of the raw dataset; every immediate sub-directory is one class.
dataset_dir = "CRC-Dataset"

# Class names = entries of dataset_dir that are directories (plain files are ignored).
classes = list(filter(
    lambda entry: os.path.isdir(os.path.join(dataset_dir, entry)),
    os.listdir(dataset_dir),
))

# Define NLM filter function (using scikit-image)
def nlm_filter(image):
    """Denoise an image with scikit-image's non-local means (NLM) filter.

    Parameters
    ----------
    image : numpy.ndarray
        Image array whose last axis is the channel axis. Values are divided
        by 255, so 8-bit intensities (0-255) are assumed.

    Returns
    -------
    numpy.ndarray
        The denoised image converted back to 8-bit via ``img_as_ubyte``.
        If no usable noise level can be estimated, the input is returned
        (as 8-bit) without filtering.
    """
    # Convert to float [0,1]
    scaled = image.astype(np.float32) / 255.0

    # estimate_sigma yields one estimate per channel here; average them into one value.
    sigma_est = np.mean(estimate_sigma(scaled, channel_axis=-1))

    # A zero or non-finite estimate (presumably possible for a perfectly uniform
    # tile - TODO confirm) would make h = 1.15 * sigma_est zero/NaN, which is not a
    # valid filter strength. There is nothing to denoise in that case, so pass the
    # image through unchanged instead of producing NaNs or an error.
    if not np.isfinite(sigma_est) or sigma_est <= 0:
        return img_as_ubyte(np.clip(scaled, 0.0, 1.0))

    patch_kw = dict(patch_size=5, patch_distance=6, channel_axis=-1)
    denoised = denoise_nl_means(scaled, h=1.15 * sigma_est, fast_mode=True, **patch_kw)

    # Clip before the 8-bit conversion so tiny floating-point overshoot outside
    # [0, 1] cannot make img_as_ubyte fail or wrap.
    return img_as_ubyte(np.clip(denoised, 0.0, 1.0))  # back to 8-bit

# Denoise every TIFF of every class and store the result as PNG.
for class_name in classes:
    class_dir = os.path.join(dataset_dir, class_name)

    # Write to CRC-Dataset_NLM/<class>_nlm so the PNGs sit exactly one level below
    # the output root. The augmentation step lists the class folders directly under
    # CRC-Dataset_NLM and reads PNGs straight from them; the former nested layout
    # (<class>/<class>_nlm) left those class folders without any PNG files.
    output_dir = os.path.join("CRC-Dataset_NLM", f"{class_name}_nlm")
    os.makedirs(output_dir, exist_ok=True)

    # Accept .tif / .tiff sources (case-insensitive). Sorted so the processing
    # order does not depend on the file system's listing order.
    image_files = sorted(
        os.path.join(class_dir, filename)
        for filename in os.listdir(class_dir)
        if filename.lower().endswith((".tif", ".tiff"))
    )

    for image_file in image_files:
        # Load with Pillow; the context manager closes the file handle once the
        # pixel data has been converted, instead of leaving it open until GC.
        with Image.open(image_file) as source:
            image = source.convert("RGB")
        image_np = np.array(image)

        # Apply NLM filter
        filtered_image = nlm_filter(image_np)

        # Save as PNG under the source file's base name
        filename = os.path.splitext(os.path.basename(image_file))[0]
        output_file = os.path.join(output_dir, f"{filename}.png")
        Image.fromarray(filtered_image).save(output_file)

print("Noise removal and PNG conversion completed.")


Noise removal and PNG conversion completed.


In [1]:
import os
import numpy as np
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# ---- Config ----
# Input root: one sub-folder per class, holding the images to augment.
base_input_folder = 'CRC-Dataset_NLM'
# Output root: augmented images are written to a same-named class sub-folder.
base_output_folder = 'CRC-NLM_Augmentation'
IMG_SIZE = (224, 224)      # target_size handed to load_img
AUG_PER_IMAGE = 4          # augmented copies generated per source image
USE_VERTICAL_FLIP = True   # forwarded to ImageDataGenerator(vertical_flip=...)

# File-name suffixes treated as input images (compared against the lower-cased name).
ALLOWED_EXTS = ('.png',)

# Every directory directly below the input root is treated as a class.
class_folders = list(filter(
    lambda entry: os.path.isdir(os.path.join(base_input_folder, entry)),
    os.listdir(base_input_folder),
))

# Random geometric transforms; rescale=1./255 means generated batches are scaled
# by 1/255, so the saving code multiplies by 255 again before writing.
augmentation_settings = dict(
    rotation_range=15,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    zoom_range=0.1,
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=USE_VERTICAL_FLIP,
    fill_mode='reflect',
)
datagen = ImageDataGenerator(**augmentation_settings)

os.makedirs(base_output_folder, exist_ok=True)

# Base seed for the random augmentations, so a re-run can regenerate the same files.
AUG_SEED = 42

total_in = 0   # source images attempted (including ones skipped on error)
total_out = 0  # augmented images actually written

for class_folder in class_folders:
    input_folder = os.path.join(base_input_folder, class_folder)
    output_folder = os.path.join(base_output_folder, class_folder)
    os.makedirs(output_folder, exist_ok=True)

    # Collect valid input files; sorted so the per-image seeds below are assigned
    # in a stable order regardless of the file system's listing order.
    files = sorted(fn for fn in os.listdir(input_folder)
                   if fn.lower().endswith(ALLOWED_EXTS))

    if not files:
        print(f"[WARN] No images found for class '{class_folder}' in {input_folder} "
              f"(allowed: {ALLOWED_EXTS}).")
        continue

    print(f"[INFO] Class '{class_folder}': {len(files)} source images.")

    for filename in files:
        in_path = os.path.join(input_folder, filename)
        total_in += 1
        try:
            # Load with consistent mode/size
            img = load_img(in_path, color_mode='rgb', target_size=IMG_SIZE)
            arr = img_to_array(img)
            arr = np.expand_dims(arr, axis=0)  # (1, H, W, C)

            # One generator per source image. Each image gets its own seed range
            # (offset by AUG_PER_IMAGE) so different sources do not share a seed.
            # NOTE(review): confirm the installed Keras version applies `seed` to
            # the random transforms and not only to batch shuffling.
            flow = datagen.flow(
                arr,
                batch_size=1,
                seed=AUG_SEED + total_in * AUG_PER_IMAGE,
            )

            stem = os.path.splitext(filename)[0]
            for i in range(AUG_PER_IMAGE):
                batch = next(flow)  # (1, H, W, C), scaled by the generator's rescale
                # Round before casting: astype('uint8') truncates toward zero, so
                # the /255 -> *255 float round trip (e.g. 199.99998) would otherwise
                # darken pixels by one level. Clip keeps values inside uint8 range.
                aug = np.clip(np.rint(batch[0] * 255.0), 0, 255).astype('uint8')
                out_img = Image.fromarray(aug)
                out_path = os.path.join(output_folder, f'{stem}_aug_{i}.png')
                out_img.save(out_path)
                total_out += 1

        except Exception as e:
            # Best effort: report the unreadable/failed image and continue.
            print(f'[WARN] Skipping {in_path}: {e}')

print(f"Done. Inputs processed: {total_in}, augmented images saved: {total_out}.")


[INFO] Class 'ADI_nlm': 1338 source images.
[INFO] Class 'BACK_nlm': 847 source images.
[INFO] Class 'DEB_nlm': 339 source images.
[INFO] Class 'LYM_nlm': 634 source images.
[INFO] Class 'MUC_nlm': 1035 source images.
[INFO] Class 'MUS_nlm': 592 source images.
[INFO] Class 'NORM_nlm': 741 source images.
[INFO] Class 'STR_nlm': 421 source images.
[INFO] Class 'TUM_nlm': 1233 source images.
Done. Inputs processed: 7180, augmented images saved: 28720.


In [8]:
import os
import random

def reduce_images(folder_path, target_count, seed=None):
    """Randomly delete files from a folder until ``target_count`` files remain.

    Only regular files directly inside ``folder_path`` are counted and
    considered for deletion; sub-directories are left alone. Deletion is
    permanent (``os.remove``).

    Parameters
    ----------
    folder_path : str
        Folder whose files should be thinned out.
    target_count : int
        Number of files to keep. If the folder already holds this many files
        or fewer, nothing is deleted.
    seed : int or None, optional
        When given, the selection is drawn from a private generator seeded
        with this value, so the same folder contents always lose the same
        files. When None (default), the module-level ``random`` state is used.

    Raises
    ------
    ValueError
        If ``target_count`` is negative.
    """
    if target_count < 0:
        raise ValueError(f"target_count must be non-negative, got {target_count}")

    # Sorted listing: os.listdir order is arbitrary, and a seed is only
    # reproducible if the population is presented in a fixed order.
    all_files = sorted(
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    )

    current_count = len(all_files)

    if current_count <= target_count:
        print(f"No deletion needed. Current images: {current_count}")
        return

    # Calculate how many to delete
    delete_count = current_count - target_count

    # Randomly select files to delete
    rng = random if seed is None else random.Random(seed)
    files_to_delete = rng.sample(all_files, delete_count)

    for f in files_to_delete:
        os.remove(os.path.join(folder_path, f))

    print(f"Deleted {delete_count} images. Remaining: {target_count}")


# Thin the TUM_nlm class folder down to 2200 files. The surplus files are chosen
# at random and removed from disk permanently; once the folder holds 2200 files
# or fewer, re-running this cell deletes nothing.
reduce_images("CRC-NLM_Augmentation/TUM_nlm", 2200)


Deleted 2732 images. Remaining: 2200


In [None]:
import os
import shutil

# Define the directories for train and test datasets
train_dir = 'New NLM Augmentation/train'  # Replace with your train dataset directory
test_dir = 'New NLM Augmentation/test'    # Replace with your test dataset directory
output_dir = 'NEW_DATASET_SIZE'  # Replace with the directory where you want to store the merged dataset

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Classes are the sub-directories of the train dataset. Stray files (e.g. OS
# metadata files) are ignored so they are not mistaken for class folders.
classes = sorted(
    name for name in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, name))
)

# Merge train and test datasets by copying files to the output directory
for class_name in classes:
    output_class_dir = os.path.join(output_dir, class_name)

    # Create the class directory in the merged dataset if it doesn't exist
    os.makedirs(output_class_dir, exist_ok=True)

    # File names already copied for this class during this run; used to report
    # train/test name collisions, which would otherwise overwrite silently.
    copied_names = set()

    # Train first, then test - same order as before, so on a name collision the
    # test copy is the one that ends up in the merged folder.
    for split_dir in (train_dir, test_dir):
        source_class_dir = os.path.join(split_dir, class_name)
        if not os.path.isdir(source_class_dir):
            # A class may exist in only one split; keep merging the rest.
            print(f"[WARN] Class folder not found, skipped: {source_class_dir}")
            continue

        for file in sorted(os.listdir(source_class_dir)):
            src_path = os.path.join(source_class_dir, file)
            if not os.path.isfile(src_path):
                continue  # shutil.copy cannot copy directories
            if file in copied_names:
                print(f"[WARN] Duplicate file name, overwriting: "
                      f"{os.path.join(output_class_dir, file)}")
            dst_path = os.path.join(output_class_dir, file)
            shutil.copy(src_path, dst_path)
            copied_names.add(file)

print("Merging complete.")


In [12]:
import os
import shutil
import random

# Set your data directory
data_dir = 'NEW_DATASET_SIZE'  # Merged dataset: one sub-folder per class

# Define the directory names for the splits
train_dir = 'NEW_DATASET/train'
val_dir = 'NEW_DATASET/validation'
test_dir = 'NEW_DATASET/test'

# Create the directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Split ratios: 70% train, 18% validation, 12% test
train_ratio = 0.7
val_ratio = 0.18
test_ratio = 0.12

# The test split receives whatever is left after train and validation, so the
# three ratios only describe the real split if they add up to 1.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        f"Split ratios must sum to 1, got {train_ratio + val_ratio + test_ratio}"
    )

# Private, seeded generator: together with the sorted listings below, re-running
# on unchanged input reproduces the same assignment, so a re-run overwrites the
# existing copies instead of scattering one file across several splits.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# NOTE(review): the augmentation step names its outputs '<stem>_aug_<i>.png'. If
# data_dir holds such variants, a per-file shuffle can place variants of the same
# source image in different splits (train/test leakage). Consider splitting by
# source stem instead - confirm against the actual folder contents.

# Loop through each class in the dataset
for class_name in sorted(os.listdir(data_dir)):
    class_dir = os.path.join(data_dir, class_name)
    if not os.path.isdir(class_dir):
        continue

    # Regular files only (shutil.copy cannot copy directories), in a fixed
    # order before shuffling.
    files = sorted(
        f for f in os.listdir(class_dir)
        if os.path.isfile(os.path.join(class_dir, f))
    )
    split_rng.shuffle(files)

    num_files = len(files)

    # Calculate the number of samples for each split; the remainder after
    # truncation goes to the test split.
    num_train = int(train_ratio * num_files)
    num_val = int(val_ratio * num_files)
    num_test = num_files - num_train - num_val

    # Create subdirectories for each split
    class_train_dir = os.path.join(train_dir, class_name)
    class_val_dir = os.path.join(val_dir, class_name)
    class_test_dir = os.path.join(test_dir, class_name)

    os.makedirs(class_train_dir, exist_ok=True)
    os.makedirs(class_val_dir, exist_ok=True)
    os.makedirs(class_test_dir, exist_ok=True)

    # Copy files to respective splits
    for i, file in enumerate(files):
        src_path = os.path.join(class_dir, file)
        if i < num_train:
            dst_path = os.path.join(class_train_dir, file)
        elif i < num_train + num_val:
            dst_path = os.path.join(class_val_dir, file)
        else:
            dst_path = os.path.join(class_test_dir, file)

        shutil.copy(src_path, dst_path)

    print(f"[INFO] Class '{class_name}': {num_train} train, "
          f"{num_val} validation, {num_test} test.")

print("Dataset split into train, validation, and test sets.")


Herlev dataset split into train, validation, and test sets.
