This notebook merges the original Training and Testing MRI folders, resizes all images to 224 x 224, and prepares a clean directory structure for later splitting and augmentation.

It follows the preprocessing methodology described in the LEAD-CNN paper.

In [None]:
In this notebook we perform the following preprocessing steps:


1. Merge the original Training and Testing datasets
2. Resize all images to 224 × 224
3. Prepare the cleaned dataset directory structure for later splitting and augmentation


These steps replicate the image standardization stage described in the LEAD-CNN paper.

We merge both the Training and Testing folders from the original dataset and resize all images before performing any splitting.


This ensures:
- A randomized distribution of images across the train/validation/test splits created in later steps
- Uniform input size for CNN models
- Consistency with the methodology of the research paper

In [None]:
# MRI DATASET CLEANING AND MERGING

In [1]:
import cv2
import numpy as np

# Record the library versions this notebook was executed with (aids reproducibility).
opencv_version = cv2.__version__
numpy_version = np.__version__

print("OpenCV version:", opencv_version)
print("NumPy version:", numpy_version)

OpenCV version: 4.12.0
NumPy version: 2.2.6


In [2]:
# Core libraries
import os
import shutil
from pathlib import Path


# Image processing
import cv2
import numpy as np


# Utilities
from tqdm import tqdm

In [3]:
# Define dataset paths relative to the notebook's working directory.
# The paths are composed with the `/` operator so the same code works on
# Windows, macOS and Linux; a backslash-separated string is only split into
# components on Windows and is treated as one long file name elsewhere.
DATA_DIR = Path("..") / "data"
RAW_TRAIN_DIR = DATA_DIR / "raw_data" / "Training"
RAW_TEST_DIR = DATA_DIR / "raw_data" / "Testing"
CLEAN_DIR = DATA_DIR / "cleaned_data"

In [4]:
# Dataset configuration: the class sub-folder names expected inside each raw
# directory, and the target size every image is resized to.
CLASSES = [
  'glioma',
  'meningioma',
  'notumor',
  'pituitary',
]
IMG_SIZE = (224, 224)

# Echo the configuration so it is recorded in the notebook output
print("Classes:", CLASSES)
print("Target image size:", IMG_SIZE)

Classes: ['glioma', 'meningioma', 'notumor', 'pituitary']
Target image size: (224, 224)


In [5]:
# Create the cleaned-dataset layout: CLEAN_DIR/<split>/<class> for every
# split/class combination. Existing folders are left untouched.
for split in ('train', 'val', 'test'):
  split_root = CLEAN_DIR / split
  for cls in CLASSES:
    path = split_root / cls
    path.mkdir(parents=True, exist_ok=True)


print("Folder structure created under:", CLEAN_DIR.resolve())

Folder structure created under: C:\Users\ekowd\Desktop\FYP\FYP\data\cleaned_data


In [6]:
# Sanity check: confirm the raw/clean directories resolve where expected and
# that every class folder contains files OpenCV can read.

print("Current working directory:", Path.cwd())
for label, folder in (
  ("RAW_TRAIN_DIR", RAW_TRAIN_DIR),
  ("RAW_TEST_DIR", RAW_TEST_DIR),
  ("CLEAN_DIR", CLEAN_DIR),
):
  print(f"{label} exists:", folder.exists(), "->", folder.resolve())


for cls in CLASSES:
  for data_dir in (RAW_TRAIN_DIR, RAW_TEST_DIR):
    class_dir = data_dir / cls

    # Report missing class folders and move on
    if not class_dir.exists():
      print(f"{class_dir}: does not exist")
      continue

    files = list(class_dir.glob('*'))
    print(f"{class_dir}: {len(files)} files")
    if not files:
      continue

    # Try to decode the first entry as a quick readability probe
    img = cv2.imread(str(files[0]))
    shape_text = img.shape if img is not None else 'Failed to load'
    print(f" Sample image shape: {shape_text}")


print("If any folder has 0 files, ensure the dataset/file path is placed correctly.")

Current working directory: c:\Users\ekowd\Desktop\FYP\FYP\notebooks
RAW_TRAIN_DIR exists: True -> C:\Users\ekowd\Desktop\FYP\FYP\data\raw_data\Training
RAW_TEST_DIR exists: True -> C:\Users\ekowd\Desktop\FYP\FYP\data\raw_data\Testing
CLEAN_DIR exists: True -> C:\Users\ekowd\Desktop\FYP\FYP\data\cleaned_data
..\data\raw_data\Training\glioma: 1321 files
 Sample image shape: (512, 512, 3)
..\data\raw_data\Testing\glioma: 300 files
 Sample image shape: (512, 512, 3)
..\data\raw_data\Training\meningioma: 1339 files
 Sample image shape: (512, 512, 3)
..\data\raw_data\Testing\meningioma: 306 files
 Sample image shape: (278, 440, 3)
..\data\raw_data\Training\notumor: 1595 files
 Sample image shape: (350, 350, 3)
..\data\raw_data\Testing\notumor: 405 files
 Sample image shape: (236, 236, 3)
..\data\raw_data\Training\pituitary: 1457 files
 Sample image shape: (512, 512, 3)
..\data\raw_data\Testing\pituitary: 300 files
 Sample image shape: (512, 512, 3)
If any folder has 0 files, ensure the datas

In [7]:
# Loading and Resizing Images (function)
def load_and_resize_images(class_name):
  """
  Load every image of one class from both raw directories and resize it.

  Files are read from RAW_TRAIN_DIR/class_name and RAW_TEST_DIR/class_name
  (a missing directory is skipped), restricted to .jpg/.jpeg/.png, and
  resized to IMG_SIZE with cv2.resize.

  Args:
    class_name: Name of the class sub-folder (e.g. an entry of CLASSES).

  Returns:
    A list of (image_array, filename) tuples. Filenames are unique within
    the returned list (compared case-insensitively): if a name was already
    used by an earlier file, the later file is renamed to
    "<source-folder>_<n>_<original-name>" so that saving the merged list
    into a single folder cannot silently overwrite an image.
  """
  valid_suffixes = {'.jpg', '.jpeg', '.png'}
  images = []
  seen_names = set()   # lower-cased names already handed out
  unreadable = []      # files OpenCV could not decode

  for data_dir in [RAW_TRAIN_DIR, RAW_TEST_DIR]:
    class_dir = data_dir / class_name
    if not class_dir.exists():
      continue

    # Sort so the result order does not depend on filesystem listing order
    for file_path in sorted(class_dir.glob('*')):
      if file_path.suffix.lower() not in valid_suffixes:
        continue

      img = cv2.imread(str(file_path))
      if img is None:
        # cv2.imread returns None for corrupt/unsupported files
        unreadable.append(file_path)
        continue

      resized = cv2.resize(img, IMG_SIZE)

      # Disambiguate names that clash with an earlier file; the comparison is
      # case-insensitive because common filesystems ignore case.
      filename = file_path.name
      attempt = 0
      while filename.lower() in seen_names:
        attempt += 1
        filename = f"{data_dir.name}_{attempt}_{file_path.name}"
      seen_names.add(filename.lower())

      images.append((resized, filename))

  # Surface skipped files instead of dropping them silently
  if unreadable:
    print(f"Warning: skipped {len(unreadable)} unreadable image(s) for class '{class_name}':")
    for bad_path in unreadable:
      print(f"  {bad_path}")

  return images

In [8]:
# Saving Images to Cleaned Directory (function)

def save_images(images, save_dir, class_name):
  """
  Write images into save_dir/class_name.

  Args:
    images: Iterable of (image_array, filename) pairs. It is iterated exactly
      once, so a generator or a tqdm-wrapped list is accepted.
    save_dir: Root output directory; must support the `/` operator
      (e.g. a pathlib.Path).
    class_name: Name of the class sub-folder inside save_dir.

  Raises:
    OSError: If cv2.imwrite reports that an image could not be written.
  """
  target_dir = save_dir / class_name
  # Make sure the destination exists so the function does not depend on
  # another cell having created the folder structure first.
  target_dir.mkdir(parents=True, exist_ok=True)

  for img_array, filename in images:
    save_path = target_dir / filename
    # cv2.imwrite returns False on failure; without this check a failed
    # write would go unnoticed.
    if not cv2.imwrite(str(save_path), img_array):
      raise OSError(f"Failed to write image: {save_path}")

In [9]:
# Merge and Resize All Classes
# This cell performs the actual dataset merging and resizing

# For each class: load + resize the merged Training/Testing images, then write
# them all under CLEAN_DIR/train/<class>.
for cls in CLASSES:
  print(f"\nProcessing class: {cls}")

  images = load_and_resize_images(cls)
  total_images = len(images)
  print(f"Total merged images: {total_images}")

  if images:
    print("Saving resized images to cleaned_data...")
    # tqdm wraps the list only to show a transient progress bar while saving
    save_images(tqdm(images, leave=False), CLEAN_DIR / 'train', cls)
    print(f"Done: {cls} -> {total_images} images saved")
  else:
    print(f"Skipping {cls} – no images found.")

print("\nDataset merging and resizing completed successfully.")


Processing class: glioma
Total merged images: 1621
Saving resized images to cleaned_data...


                                                    

Done: glioma -> 1621 images saved

Processing class: meningioma
Total merged images: 1645
Saving resized images to cleaned_data...


                                                    

Done: meningioma -> 1645 images saved

Processing class: notumor
Total merged images: 2000
Saving resized images to cleaned_data...


                                                    

Done: notumor -> 2000 images saved

Processing class: pituitary
Total merged images: 1757
Saving resized images to cleaned_data...


                                                    

Done: pituitary -> 1757 images saved

Dataset merging and resizing completed successfully.


