In [32]:
import kagglehub

path = kagglehub.dataset_download("tarunpathak/natural-images-with-synthetic-noise")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'natural-images-with-synthetic-noise' dataset.
Path to dataset files: /kaggle/input/natural-images-with-synthetic-noise


In [33]:
from pathlib import Path
from PIL import Image
from typing import Optional, Tuple
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os

#Creates a custom dataset by inheriting from PyTorch’s Dataset class.

class PairedNISN(Dataset):
    """
    Expects:
      <split>/
        noisy images/*.jpg
        ground truth/*.jpg
    Example split paths: /content/NISN/train/train, /content/NISN/validate/validate, /content/NISN/test/test
    Pairs by removing the noise prefix before the first '_' in the noisy filename.
    """

    def __init__(self, split_root: str, resize_to: Optional[Tuple[int,int]] = None):

      #Optional[Tuple[int,int]] = None:  Resize target or keep original
        self.root = Path(split_root)
        self.noisy_dir = self.root / "noisy images"   # → /content/NISN/train/train/noisy images
        self.clean_dir = self.root / "ground truth"   # → /content/NISN/train/train/ground truth

        #Path(split_root) converts the string (like "/content/NISN/train/train") into a Path object.
        #.is_dir() → checks if the path points to a directory
        #.is_file() → checks if the path points to a file
        #.exists() → checks if the path exists at all
        assert self.noisy_dir.is_dir() and self.clean_dir.is_dir(), "NISN folders not found."

        #In Java (similar idea) for condition checking:
        #assert noisyDir.exists() && cleanDir.exists() : "NISN folders not found.";
        # transforms: ToTensor() only -> values in [0,1]
        t = []

        #Checks if the user provided a resize target when creating the dataset.
        if resize_to is not None:
            t.append(transforms.Resize(resize_to))
        t.append(transforms.ToTensor())

        """
        transforms.ToTensor() does three things:
        Converts the image from a PIL object (used by Pillow) to a PyTorch tensor.

        Reorders the dimensions from (H, W, C) → (C, H, W) (channel-first).

        Scales pixel values from [0, 255] → [0, 1] (float32).
        """
        #So now our list might be: t = [transforms.Resize((32, 32)), transforms.ToTensor()]
        #or just : t = [transforms.ToTensor()], if if no resize is used.
        self.transform = transforms.Compose(t)

        #This combines all the transformations in the list t into one pipeline using Compose.
        #So self.transform becomes a callable object that you can apply to any image:
        #Ex: img_tensor = self.transform(image)
        #and it will automatically apply each transformation in order.
        # build pairs by stripping the noise tag ('gauss_...')
        #We only use this function on the noisy image filenames, not the clean ones.
        # Ex: gauss_127215712_ff5b654d07_c.jpg -> 127215712_ff5b654d07_c.jpg

        def base_clean_name(name: str) -> str:
            # drop everything up to first underscore
            i = name.find('_')
            return name[i+1:] if i != -1 else name
             # slice after '_' ; if no '_', return the original name
        # support jpg/jpeg/png
        exts = ("*.jpg", "*.jpeg", "*.png")
        noisy_paths = []
        for e in exts:
            noisy_paths.extend(self.noisy_dir.glob(e))
        noisy_paths = sorted(noisy_paths)

        #noisy_paths becomes a list of full file paths to all noisy images, sorted alphabetically.
        """
        [
            Path('/content/NISN/train/train/noisy images/gauss_127215712_ff5b654d07_c.jpg'),
            Path('/content/NISN/train/train/noisy images/gauss_128819241_bb4a3c5f71_c.jpg'),
            ...
            ]
        """

        clean_index = {}
        for e in exts:
            for p in self.clean_dir.glob(e):
                clean_index[p.name] = p

                #p.name → just the filename, e.g. "40321670583_403f81a527_c.jpg".
                #p → the full path, e.g. Path('/content/.../ground truth/40321670583_403f81a527_c.jpg').
        """
        clean_index["40321670583_403f81a527_c.jpg"] = Path('/content/.../ground truth/40321670583_403f81a527_c.jpg')

        or:

        { "40321670583_403f81a527_c.jpg": Path(.../ground truth/40321670583_403f81a527_c.jpg) }

        """
        #self.pairs will be a list of tuples
        """

        Ex: Path('/content/NISN/train/train/noisy images/gauss_127215712_ff5b654d07_c.jpg')

        Each entry is a Path object (from pathlib), not a plain string:

        noisePath = noisy_paths[0]
        print(noisePath)         # full path
        print(noisePath.name)    # filename only, e.g. 'gauss_127215712_ff5b654d07_c.jpg'
        print(noisePath.stem)    # filename without extension, e.g. 'gauss_127215712_ff5b654d07_c'
        print(noisePath.suffix)  # extension only, e.g. '.jpg'
        """

        self.pairs = []
        missing = []

        for noisePath in noisy_paths:
            cleanName = base_clean_name(noisePath.name) # Ex: gauss_127215712_ff5b654d07_c.jpg -> 127215712_ff5b654d07_c.jpg
            cleanPath = clean_index.get(cleanName)     #   clean_index.get(127215712_ff5b654d07_c.jpg)
                                               # use key (file name) to find value
                                               #which returns path of clean image
            if cleanPath is None:
                missing.append((noisePath.name, cleanName))
            else:
                self.pairs.append((noisePath, cleanPath))

                #self.pairs is a list of tuples where each tuple contains
                # the file paths of a noisy–clean image pair with the same base name.

        if missing:
            print(f"[warning] {len(missing)} noisy files had no matching clean image. first few:")
            for n, c in missing[:5]:
                print(f"  noisy: {n}  -> expected: {c}")

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        noisePath, cleanPath = self.pairs[idx]

        noisy = Image.open(noisePath).convert("RGB")
        clean = Image.open(cleanPath).convert("RGB")

        #uses Pillow (PIL) to open the image file from disk.
        #result is a PIL Image object, not yet a tensor
        #.convert("RGB"): This forces the image into RGB color mode.
        #Each channel’s value ranges from 0 to 255 (as integers)
        #Ex: [255, 255, 0]   # R=255, G=255, B=0 → yellow

        return self.transform(noisy), self.transform(clean)
              #turn [255, 255, 0] into a PyTorch tensor (0–1 range), rearranged into channel-first
        """
              Ex:
              noisy image is a small yellow pixel:
              noisy_pixel_RGB = [255, 255, 0]   # R=255, G=255, B=0
              clean image pixel is a darker yellow:
              clean_pixel_RGB = [128, 128, 0]

              After self.tfm(noisy):

              1. Convert to float and scale to [0,1]:
              [255, 255, 0] → [1.0, 1.0, 0.0]
              [128, 128, 0] → [0.502, 0.502, 0.0]

              2. Change shape:
              Before: (H, W, C) → typical image shape like (512, 512, 3)
              After: (C, H, W) → tensor shape (3, 512, 512)

              3. Convert to tensor:
              Now both are torch.Tensor objects of type float32.

         """


In [34]:
resize_to = None          # <- keep native 512x512

# The dataset_name variable is not needed for path construction as per the directory listing.
train_ds = PairedNISN(f"{path}/train/train",    resize_to=resize_to)
val_ds   = PairedNISN(f"{path}/validate/validate", resize_to=resize_to)
test_ds  = PairedNISN(f"{path}/test/test",     resize_to=resize_to)

In [35]:
'''
--------------FOR DEBUGGING--------------
# List contents of the 'train' split folder
train_split_path = f"{path}/train"
print(f"Contents of {train_split_path}:")
for item in os.listdir(train_split_path):
    print(f"- {item}")
'''

'\n--------------FOR DEBUGGING--------------\n# List contents of the \'train\' split folder\ntrain_split_path = f"{path}/train"\nprint(f"Contents of {train_split_path}:")\nfor item in os.listdir(train_split_path):\n    print(f"- {item}")\n'

In [36]:
"""
-----------FOR DEBUGGING-------------
# List contents of the base path where the dataset is extracted
print(f"Contents of {path}:")
for item in os.listdir(path):
    print(f"- {item}")

# If there's a single main dataset folder (e.g., 'natural-images-with-synthetic-noise'), list its contents too
# Assuming dataset_name is correctly set as 'natural-images-with-synthetic-noise'
full_dataset_path = os.path.join(path, dataset_name)
if os.path.isdir(full_dataset_path):
    print(f"\nContents of {full_dataset_path}:")
    for item in os.listdir(full_dataset_path):
        print(f"- {item}")
"""

'\n-----------FOR DEBUGGING-------------\n# List contents of the base path where the dataset is extracted\nprint(f"Contents of {path}:")\nfor item in os.listdir(path):\n    print(f"- {item}")\n\n# If there\'s a single main dataset folder (e.g., \'natural-images-with-synthetic-noise\'), list its contents too\n# Assuming dataset_name is correctly set as \'natural-images-with-synthetic-noise\'\nfull_dataset_path = os.path.join(path, dataset_name)\nif os.path.isdir(full_dataset_path):\n    print(f"\nContents of {full_dataset_path}:")\n    for item in os.listdir(full_dataset_path):\n        print(f"- {item}")\n'

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# pin is boolean
pin = device.type == "cuda"

# adjust batch_size to your VRAM; 512x512 is large → start small
batch_size = 8 if resize_to is None else 64

# Use 2 or 4 workers to speed up data loading in the background
num_workers = 2 if device.type == "cuda" else 0 # Use 0 if on CPU

train_loader = DataLoader(train_ds,
                          batch_size=batch_size,
                          shuffle=True,
                          pin_memory=pin,
                          num_workers=num_workers)

val_loader   = DataLoader(val_ds,
                          batch_size=batch_size,
                          shuffle=False,
                          pin_memory=pin,
                          num_workers=num_workers)

test_loader  = DataLoader(test_ds,
                          batch_size=batch_size,
                          shuffle=False,
                          pin_memory=pin,
                          num_workers=num_workers)

print("DataLoaders are ready!")

Using device: cpu
DataLoaders are ready!
