In [1]:
from google.colab import drive
drive.mount("/content/drive")
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/processed/mri")
print(os.getcwd())
import cv2
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import numpy as np
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1X35NgpoRZLTS0jivDsMTksGG26Hr7xU5/data/processed/mri


In [2]:
def show_image(img):
  fig, ax = plt.subplots(figsize=(6, 6))
  ax.imshow(img, cmap="grey")
  ax.axis("off")

In [3]:
class Preprocessing:
  def __init__(self, raw_dataset_path, processed_dataset_path, image_size):
    self.image_size = image_size
    self.raw_dataset_path = raw_dataset_path
    self.processed_dataset_path = processed_dataset_path
    self.source_folders = [f for f in Path(raw_dataset_path).rglob("*") if f.is_dir() and any(p.is_file() for p in f.iterdir())]

  def make_directory(self, source):
    processed_folder = Path(self.processed_dataset_path) / source.relative_to(self.raw_dataset_path)
    processed_folder.mkdir(parents=True, exist_ok=True)
    return processed_folder

  @staticmethod
  def denoise_img(img):
    if img.dtype != np.uint8:
      img_min, img_max = img.min(), img.max()
      if img_max <= 1.0:
        img = (img * 255).astype(np.uint8)
      else:
        img = img.astype(np.uint8)

    return cv2.medianBlur(img, ksize=3)

  def preprocess_image(self, img):
    img = self.denoise_img(img)
    img = cv2.resize(img, self.image_size, interpolation=cv2.INTER_LINEAR)
    return img

  def preprocess_images(self):
    for source in self.source_folders:
      processed_folder = self.make_directory(source)

      print(f"Processing: {source}")
      print(f"Outputting to : {processed_folder}")

      for img in tqdm(source.glob("*")):
        image = cv2.imread(str(img), cv2.IMREAD_GRAYSCALE)

        if image is None:
          print(f"Warning: could not read {img}, skipping")
          continue

        processed_img = self.preprocess_image(image)

        output_path = processed_folder / img.name
        cv2.imwrite(str(output_path), processed_img)

In [4]:
raw = "raw"
processed = "processed_mb_224"
IMAGE_SIZE = (224, 224)

preprocessor = Preprocessing(raw_dataset_path=raw, processed_dataset_path=processed, image_size=IMAGE_SIZE)
preprocessor.preprocess_images()

Processing: raw/train/glioma
Outputting to : processed_mb_224/train/glioma


1278it [01:16, 16.78it/s]


Processing: raw/train/meningioma
Outputting to : processed_mb_224/train/meningioma


1197it [00:39, 30.21it/s]


Processing: raw/train/pituitary
Outputting to : processed_mb_224/train/pituitary


706it [00:23, 29.89it/s]


Processing: raw/val/glioma
Outputting to : processed_mb_224/val/glioma


365it [00:10, 33.49it/s]


Processing: raw/val/meningioma
Outputting to : processed_mb_224/val/meningioma


342it [00:10, 31.91it/s]


Processing: raw/val/pituitary
Outputting to : processed_mb_224/val/pituitary


201it [00:06, 33.23it/s]


Processing: raw/test/glioma
Outputting to : processed_mb_224/test/glioma


183it [00:06, 30.01it/s]


Processing: raw/test/meningioma
Outputting to : processed_mb_224/test/meningioma


171it [00:04, 38.13it/s]


Processing: raw/test/pituitary
Outputting to : processed_mb_224/test/pituitary


102it [00:02, 34.38it/s]


In [5]:
train_transform = A.Compose([
    A.Rotate(limit=10, border_mode=cv2.BORDER_REFLECT, p=0.5),

    A.HorizontalFlip(p=0.5),

    ToTensorV2()
], additional_targets={'image0': 'image'})

val_test_transform = A.Compose([ToTensorV2()], additional_targets={'image0': 'image'})

def zscore_norm_tensor(x):
    x = x.float() / 255.0
    mean = x.mean(dim=[1, 2], keepdim=True)
    std = x.std(dim=[1, 2], keepdim=True, unbiased=False)
    return (x - mean) / (std + 1e-8)

In [6]:
class Transforming:
    def __init__(self, raw_dataset_path, processed_dataset_path, image_size, transform, apply_to, classes, save_to):
        self.raw_dataset_path = Path(raw_dataset_path)
        self.processed_dataset_path = Path(processed_dataset_path)
        self.image_size = image_size
        self.transform = transform
        self.apply_to = apply_to
        self.save_to = Path(save_to)
        self.class_to_idx = {cls: i for i, cls in enumerate(classes)}
        self.source_folders = [f for f in self.raw_dataset_path.rglob("*") if f.is_dir() and (str(f.parent.name) == self.apply_to) and (any(p.is_file() for p in f.iterdir()))]

    def transform_images(self):
        raw_imgs, proc_imgs, labels = [], [], []

        for source in self.source_folders:
            for img_file in tqdm(list(source.iterdir()), desc=f"Processing {source.name}"):
                raw_img = cv2.imread(str(img_file), cv2.IMREAD_GRAYSCALE)
                proc_img = cv2.imread(str(self.processed_dataset_path / source.relative_to(source.parent.parent) / img_file.name), cv2.IMREAD_GRAYSCALE)

                raw_img = cv2.resize(raw_img, self.image_size)
                proc_img = cv2.resize(proc_img, self.image_size)

                # Add channel dim
                raw_img = raw_img[..., None]
                proc_img = proc_img[..., None]

                augmented = self.transform(image=raw_img, image0=proc_img)
                raw_tensor = augmented['image']
                proc_tensor = augmented['image0']

                # Optional: z-score normalization
                raw_tensor = zscore_norm_tensor(raw_tensor)
                proc_tensor = zscore_norm_tensor(proc_tensor)

                raw_imgs.append(raw_tensor)
                proc_imgs.append(proc_tensor)
                labels.append(self.class_to_idx[source.name])

        raw_imgs = torch.stack(raw_imgs)
        proc_imgs = torch.stack(proc_imgs)
        labels = torch.tensor(labels, dtype=torch.long)

        self.save_to.mkdir(parents=True, exist_ok=True)

        torch.save((raw_imgs, proc_imgs, labels), f"{self.save_to}/{self.apply_to}.pt")

In [7]:
classes = ['glioma', 'meningioma', 'pituitary']

In [8]:
transforming_train = Transforming("raw", "processed_mb_224", IMAGE_SIZE, train_transform, "train", classes, "transform_v2")
transforming_train.transform_images()

Processing glioma: 100%|██████████| 1278/1278 [00:33<00:00, 38.52it/s]
Processing meningioma: 100%|██████████| 1197/1197 [00:28<00:00, 41.91it/s]
Processing pituitary: 100%|██████████| 706/706 [00:15<00:00, 44.37it/s]


In [9]:
transforming_val = Transforming("raw", "processed_mb_224", IMAGE_SIZE, val_test_transform, "val", classes, "transform_v2")
transforming_val.transform_images()

Processing glioma: 100%|██████████| 365/365 [00:07<00:00, 48.83it/s]
Processing meningioma: 100%|██████████| 342/342 [00:06<00:00, 53.55it/s]
Processing pituitary: 100%|██████████| 201/201 [00:03<00:00, 52.51it/s]


In [10]:
transforming_test = Transforming("raw", "processed_mb_224", IMAGE_SIZE, val_test_transform, "test", classes, "transform_v2")
transforming_test.transform_images()

Processing glioma: 100%|██████████| 183/183 [00:03<00:00, 46.54it/s]
Processing meningioma: 100%|██████████| 171/171 [00:02<00:00, 63.14it/s]
Processing pituitary: 100%|██████████| 102/102 [00:01<00:00, 61.44it/s]
