# Dataset Creation
This notebook will create the dataset used to train, validate and test all the models.

### Starting Data
1. Stripy Fruits/Vegetables (22)
2. Stripy Phantom and Random Objects (12)
3. Data from Previous Project (66)

### Pipeline
1. Use `interactive_cropper.py` to generate a json file with the approximate coordinates of the object inside the entire DCM picture for all images
2. Create a `Cropped_Dataset` with all the images at their native resolution
3. De-stripe the stripy fruits/vegetables images from the `Cropped_Dataset` using the method from the paper "Removing Stripes, Scratches, and Curtaining with Non-Recoverable Compressed Sensing". This is done in a separate Notebook: `Removing_Artifacts.ipynb`.
4. Create a single unaugmented dataset, `Unaugmented_Full_Dataset_2048x2048`, with every image at `2048x2048`, from the following 100 images:
   1. The 22 cropped destriped fruits/vegetables
   2. The 12 cropped stripy phantoms and random objects
   3. The 66 cropped data from the previous project
5. Create the final train/val/test splits for the following resolutions: `Res = [512, 1024, 2048]`:
   1. Three 15-image folders for test data, one per resolution (`Final/Test_<Res>x<Res>`)
   2. Three 340-image folders for training/validation, one per resolution (`Final/Train_Val_<Res>x<Res>`), generated with the following augmentations:
      - Resized Cropping
      - Horizontal/Vertical Flipping
      - Rotations
      - Translations
      - Brightness/Contrast Jitter
      

In [2]:
import os
import shutil
import random
from PIL import Image
from tqdm import tqdm
import numpy as np
import time
import glob
import matplotlib.pyplot as plt
import json
import pydicom
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
import torch.nn.functional as F
from torch.autograd import Variable
import itertools
import cv2

In [4]:
# --- Configuration for the square-cropping cell ---

# Folder holding the ORIGINAL, full-size scans. Only files whose name ends in
# '.dcm' (case-insensitive) are picked up by the processing loop below.
DICOM_FOLDER = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/LODOX_Scans' 

# JSON file created by interactive_cropper.py. It is looked up by DICOM file
# name and each entry supplies 'start_x', 'start_y', 'end_x' and 'end_y'.
# The path is relative, so it is resolved against the notebook's working directory.
COORDINATES_FILE = 'cropping_coordinates.json'

# Folder where the square crops are written as '<dicom base name>.png'.
# NOTE(review): the folder name hard-codes "2048x2048"; keep it in sync with SIZE.
DESTINATION_FOLDER = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Cropped_Dataset_2048x2048'

# Side length, in pixels, that every square crop is resized to before saving.
SIZE = 2048
# --------------------

# Create the output folder up front; exist_ok makes re-running the cell safe.
os.makedirs(DESTINATION_FOLDER, exist_ok=True)

def crop_to_square(image, start_x, start_y, end_x, end_y):
    """
    Crops an image to a square region around a bounding box, without adding padding.

    The square is centred on the bounding box and its side equals the longest
    side of the box. If that square would extend past the image border it is
    shifted back inside the image (and, if it is larger than the image itself,
    shrunk to the shorter image dimension). This matters because ``image.crop``
    fills any out-of-bounds area with black pixels, which would silently pad
    the result.

    Args:
        image (PIL.Image): The original image. Only ``image.size`` (width, height)
            and ``image.crop(box)`` are used.
        start_x, start_y, end_x, end_y (int): Bounding box corners. The two
            corners may be given in either order.

    Returns:
        PIL.Image: The squarely cropped image, lying entirely inside ``image``.
    """
    # Accept boxes dragged in any direction by normalising the corners.
    min_x, max_x = sorted((start_x, end_x))
    min_y, max_y = sorted((start_y, end_y))

    # Calculate the width and height of the bounding box.
    box_width = max_x - min_x
    box_height = max_y - min_y

    # Find the center of the bounding box.
    center_x = min_x + box_width // 2
    center_y = min_y + box_height // 2

    image_width, image_height = image.size

    # Use the longest box side as the square's side (not 2 * (side // 2), which
    # drops a pixel for odd sizes), capped so the square can fit in the image.
    side = min(max(box_width, box_height), image_width, image_height)

    # Centre the square on the box, then slide it back inside the image bounds.
    left = min(max(center_x - side // 2, 0), image_width - side)
    top = min(max(center_y - side // 2, 0), image_height - side)

    # Crop the image to the new square dimensions.
    return image.crop((left, top, left + side, top + side))

# Load the cropping coordinates produced by interactive_cropper.py.
# The JSON maps each DICOM file name to its bounding box.
with open(COORDINATES_FILE, 'r') as f:
    cropping_coordinates = json.load(f)

all_files = [f for f in os.listdir(DICOM_FOLDER) if f.lower().endswith('.dcm')]
print(f"Found {len(all_files)} .dcm files to process.")
print(f"Saving new square crops to: {DESTINATION_FOLDER}")

# Process each image: normalise to 8-bit, crop to a square, resize, save as PNG.
for filename in tqdm(all_files, desc="Creating Square Crops"):
    # Files without a bounding box cannot be cropped; report them and move on.
    if filename not in cropping_coordinates:
        print(f"Warning: No coordinates found for {filename} in {COORDINATES_FILE}. Skipping file.")
        continue

    coords = cropping_coordinates[filename]
    source_path = os.path.join(DICOM_FOLDER, filename)
    dicom_file = pydicom.dcmread(source_path)
    pixel_array = dicom_file.pixel_array.astype(np.float32)

    # Min-max normalise to the 0-255 range. A constant image has zero dynamic
    # range; dividing by it would produce NaNs (and an undefined uint8 cast),
    # so such an image is mapped to all zeros instead.
    pixel_min = pixel_array.min()
    pixel_range = pixel_array.max() - pixel_min
    if pixel_range > 0:
        image_255 = ((pixel_array - pixel_min) / pixel_range * 255).astype(np.uint8)
    else:
        image_255 = np.zeros(pixel_array.shape, dtype=np.uint8)
    img = Image.fromarray(image_255)

    square_crop = crop_to_square(img,
                                 coords['start_x'], coords['start_y'],
                                 coords['end_x'], coords['end_y'])
    final_image = square_crop.resize((SIZE, SIZE), Image.Resampling.LANCZOS)

    base_name = os.path.splitext(filename)[0]
    save_path = os.path.join(DESTINATION_FOLDER, f"{base_name}.png")
    # To keep the native resolution instead, save `square_crop` rather than `final_image`.
    final_image.save(save_path)

print("\nFolder with cropped images created")

Found 100 .dcm files to process.
Saving new square crops to: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Cropped_Dataset_2048x2048


Creating Square Crops: 100%|█████████████████████████████████████████████████████████| 100/100 [00:42<00:00,  2.37it/s]


Folder with cropped images created





In [4]:
# --- Configuration ---

# 1. Folder containing the clean, square, unaugmented .png source images.
SOURCE_IMAGES_DIR = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Unaugmented_Full_Dataset_2048x2048'

# 2. All the resolutions to generate.
TARGET_SIZES = [512, 1024, 2048]

# 3. Destination prefixes; "_<size>x<size>" is appended for each resolution.
BASE_TRAIN_VAL_DIR = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Train_Val'
BASE_TEST_DIR = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Test'

# 4. Parameters for the split and augmentation.
TEST_SPLIT_RATIO = 0.15
# Approximate size of the final train/val set. The number of augmented versions
# per image is rounded up, so the real total can be larger than this.
TARGET_TRAIN_VAL_SIZE_PER_SET = 300

# 5. Seed for the split and the augmentations. Without it every re-run draws a
#    different test set, and because the folders are reused, images that were
#    test images in one run could end up augmented in train/val in the next.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
torch.manual_seed(SPLIT_SEED)  # torchvision transforms draw from torch's global RNG

# Augmentations are produced once at the largest resolution and then downscaled.
AUGMENTATION_SIZE = max(TARGET_SIZES)

# --------------------

# --- Create destination directories if they don't exist ---
train_val_folders = {}
test_folders = {}
for size in TARGET_SIZES:
    # Create and store paths for train/val folders
    train_val_path = f"{BASE_TRAIN_VAL_DIR}_{size}x{size}"
    os.makedirs(train_val_path, exist_ok=True)
    train_val_folders[size] = train_val_path

    # Create and store paths for test folders
    test_path = f"{BASE_TEST_DIR}_{size}x{size}"
    os.makedirs(test_path, exist_ok=True)
    test_folders[size] = test_path

    print(f"Train/Val folder for {size}x{size}: {train_val_path}")
    print(f"Test folder for {size}x{size}: {test_path}")

    # Files left over from an earlier (differently split) run are not removed
    # and would be mixed into the new sets, so make that visible.
    if os.listdir(train_val_path) or os.listdir(test_path):
        print(f"Warning: destination folders for {size}x{size} are not empty; "
              f"files from a previous split may leak between train/val and test.")
# ------------------------------------


# --- 1. Split the source files ---
# os.listdir order is arbitrary, so sort first: the seeded shuffle then yields
# the same split on every machine and every run.
all_files = sorted(f for f in os.listdir(SOURCE_IMAGES_DIR) if f.lower().endswith('.png'))
random.shuffle(all_files)

test_set_size = int(len(all_files) * TEST_SPLIT_RATIO)
test_files = all_files[:test_set_size]
train_val_files = all_files[test_set_size:]

print(f"Total source images: {len(all_files)}")
print(f"Splitting into:")
print(f"  - Test set: {len(test_files)} images")
print(f"  - Train/Val set for augmentation: {len(train_val_files)} images")


# --- 2. Create the Multi-Resolution Test Sets ---
print(f"\nCreating multi-resolution test sets...")
for filename in tqdm(test_files, desc="Creating Test Sets"):
    source_path = os.path.join(SOURCE_IMAGES_DIR, filename)
    # Convert to single-channel greyscale exactly like the train/val images, and
    # release the file handle as soon as the pixel data has been copied.
    with Image.open(source_path) as source_image:
        image = source_image.convert('L')

    # Loop through target sizes, resize, and save to the correct test folder.
    for size in TARGET_SIZES:
        resized_image = image.resize((size, size), Image.Resampling.LANCZOS)
        destination_path = os.path.join(test_folders[size], filename)
        resized_image.save(destination_path)
print("Test sets created successfully.")

# --- 3. Create the Multi-Resolution Augmented Train/Val Sets ---

# Define the set of augmentations (applied to PIL images).
augmentation_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=(AUGMENTATION_SIZE, AUGMENTATION_SIZE), scale=(0.3, 1.0), ratio=(1.0, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomApply([transforms.RandomRotation(degrees=20)], p=0.7),
    transforms.RandomApply([transforms.RandomAffine(degrees=0, translate=(0.15, 0.15))], p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
])

if len(train_val_files) > 0:
    num_versions_per_image = int(np.ceil(TARGET_TRAIN_VAL_SIZE_PER_SET / len(train_val_files)))
    print(f"\nGenerating {num_versions_per_image} augmented versions per image for each resolution set...")

    for filename in tqdm(train_val_files, desc="Augmenting Train/Val Sets"):
        source_path = os.path.join(SOURCE_IMAGES_DIR, filename)
        with Image.open(source_path) as source_image:
            original_image = source_image.convert('L')
        base_name, extension = os.path.splitext(filename)

        for i in range(num_versions_per_image):
            # Apply the random transformations to the original image.
            augmented_image = augmentation_transform(original_image)

            new_filename = f"{base_name}_aug_{i}{extension}"

            # Resize and save this one augmented image to all target resolution
            # folders, so every resolution holds the same augmented views.
            for size in TARGET_SIZES:
                final_image = augmented_image.resize((size, size), Image.Resampling.LANCZOS)
                destination_path = os.path.join(train_val_folders[size], new_filename)
                final_image.save(destination_path)
else:
    print("\nNo files in the training/validation set to augment.")

print("\n--- Multi-Resolution Offline Augmentation Complete ---")
for size in TARGET_SIZES:
    print(f"Total images in {size}x{size} train/val set: {len(os.listdir(train_val_folders[size]))}")

Train/Val folder for 512x512: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Train_Val_512x512
Test folder for 512x512: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Test_512x512
Train/Val folder for 1024x1024: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Train_Val_1024x1024
Test folder for 1024x1024: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Test_1024x1024
Train/Val folder for 2048x2048: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Train_Val_2048x2048
Test folder for 2048x2048: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/Test_2048x2048
Total source images: 100
Splitting into:
  - Test set: 15 images
  - Train/Val set for augmentation: 85 images

Creating multi-resolution test sets...


Creating Test Sets: 100%|██████████████████████████████████████████████████████████████| 15/15 [00:06<00:00,  2.31it/s]


Test sets created successfully.

Generating 4 augmented versions per image for each resolution set...


Augmenting Train/Val Sets: 100%|███████████████████████████████████████████████████████| 85/85 [02:21<00:00,  1.67s/it]


--- Multi-Resolution Offline Augmentation Complete ---
Total images in 512x512 train/val set: 340
Total images in 1024x1024 train/val set: 340
Total images in 2048x2048 train/val set: 340





In [2]:
import os
from PIL import Image
from tqdm.notebook import tqdm # Use tqdm.notebook for a nice progress bar in Jupyter

# --- 1. Configuration: SET YOUR FOLDERS HERE ---
INPUT_FOLDER = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/Final/NRCS_In'
OUTPUT_FOLDER = 'C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/DIP_Destripe_In'
TARGET_RESOLUTION = (512, 512)  # (width, height) passed to Image.resize

# --- 2. Create the output directory if it doesn't exist ---
output_folder_is_new = not os.path.exists(OUTPUT_FOLDER)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
if output_folder_is_new:
    print(f"Created output directory: {OUTPUT_FOLDER}")

# --- 3. Find and filter all image files ---
# A missing input folder is treated like an empty one so the user gets the
# "check the path" hint below instead of a FileNotFoundError traceback.
files = os.listdir(INPUT_FOLDER) if os.path.isdir(INPUT_FOLDER) else []
image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]

if not image_files:
    print(f"No images found in '{INPUT_FOLDER}'. Please check the path.")
else:
    print(f"Found {len(image_files)} images to resize.")

    # --- 4. Loop through images, resize, and save ---
    failed_files = []  # names of the files that could not be processed
    for filename in tqdm(image_files, desc="Resizing Images"):
        try:
            # Construct full file paths
            input_path = os.path.join(INPUT_FOLDER, filename)
            output_path = os.path.join(OUTPUT_FOLDER, filename)

            # Open, resize, and save the image
            with Image.open(input_path) as img:
                # Image.Resampling.LANCZOS is a high-quality filter for downscaling
                resized_img = img.resize(TARGET_RESOLUTION, Image.Resampling.LANCZOS)
                resized_img.save(output_path)

        except Exception as e:
            # Best effort: keep going with the remaining files, but remember the
            # failure so the final summary does not claim full success.
            failed_files.append(filename)
            print(f"Could not process {filename}. Reason: {e}")

    if failed_files:
        print(f"\nImage resizing finished, but {len(failed_files)} of {len(image_files)} images failed: {failed_files}")
    else:
        print(f"\nImage resizing complete! Resized images are saved in: {OUTPUT_FOLDER}")

Created output directory: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/DIP_Destripe_In
Found 3 images to resize.


Resizing Images:   0%|          | 0/3 [00:00<?, ?it/s]


Image resizing complete! Resized images are saved in: C:/Users/emanu/OneDrive - University of Cape Town/EEE4022S/Data/DIP_Destripe_In
