<a href="https://colab.research.google.com/github/Dylan-Geraci/neuroimaging-tumor-detector/blob/main/notebooks/01_explore_and_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [4]:
import collections
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from google.colab import drive
from PIL import Image, UnidentifiedImageError
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import datasets, transforms

# Set Training Data Path

In [5]:
# Mount Google Drive inside the Colab runtime so the dataset stored there is
# reachable as an ordinary directory under /content/drive.
drive.mount('/content/drive')
# Root of the training images; it is scanned below as one sub-folder per class.
TRAIN_PATH = "/content/drive/MyDrive/neuro-imaging/data/Training"

Mounted at /content/drive


# Define Transformations

In [6]:
# Preprocessing settings shared by the training and validation pipelines.
# They are named once here so the two pipelines cannot drift apart.
IMG_SIZE = (224, 224)
# NOTE(review): these match the commonly used ImageNet channel statistics,
# presumably chosen for an ImageNet-pretrained backbone -- confirm.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: identical to validation plus a random horizontal flip
# as the only augmentation.
train_tfms = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),  # replicate to 3 channels
    transforms.Resize(IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Validation pipeline: deterministic, no augmentation.
val_tfms = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

In [7]:
# Index the training folder once, deliberately without a transform: each
# subset built later attaches its own pipeline.
base = datasets.ImageFolder(root=TRAIN_PATH)
# Integer class label of every sample, in the same order as `base.samples`;
# used to stratify the train/validation split.
y = np.array(base.targets)

# Split Data into Training and Validation Sets

In [8]:
# One stratified split holding out 15% of the samples for validation; the
# fixed random_state makes the split repeatable across runs.
sss = StratifiedShuffleSplit(test_size=0.15, random_state=42, n_splits=1)
# A zero vector stands in for the features: only the labels in `y` and the
# number of samples are supplied here. With n_splits=1 the generator yields
# exactly one (train, validation) pair, which is destructured directly.
[(train_idx, val_idx)] = sss.split(X=np.zeros(len(y)), y=y)

In [9]:
class TransformSubset(Dataset):
    """Subset of a base dataset that applies its own transform.

    The base dataset must expose ``samples`` (a sequence of ``(path, label)``
    pairs) and ``loader`` (a callable that turns a path into an image).
    Holding the transform on the subset rather than on the base lets several
    subsets share one base while using different pipelines.

    Args:
        base_ds: Dataset providing ``samples`` and ``loader``.
        indices: Positions into ``base_ds.samples`` that make up this subset.
        transform: Optional callable applied to every loaded image. ``None``
            (the default) returns the image exactly as the loader produced it.
    """

    def __init__(self, base_ds, indices, transform=None):
        self.base = base_ds
        self.indices = indices
        self.transform = transform

    def __len__(self):
        """Return the number of samples in this subset."""
        return len(self.indices)

    def __getitem__(self, i):
        """Return ``(image, label)`` for the ``i``-th sample of the subset."""
        path, label = self.base.samples[self.indices[i]]
        img = self.base.loader(path)
        # Compare against None instead of relying on truthiness: a valid
        # transform object may be falsy (e.g. it defines __len__ and is
        # empty) and must still be applied.
        if self.transform is not None:
            img = self.transform(img)
        return img, label

# Both subsets share the same underlying `base`; they differ only in which
# indices they cover and which transform pipeline they apply.
train_ds = TransformSubset(base_ds=base, indices=train_idx, transform=train_tfms)
val_ds = TransformSubset(base_ds=base, indices=val_idx, transform=val_tfms)

In [10]:
# Batched loaders over the two subsets. Only the training loader shuffles;
# the validation loader keeps a fixed order.
train_dl = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
val_dl = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [11]:
# Class names as exposed by the ImageFolder dataset built from TRAIN_PATH.
classes = base.classes
print(classes)

['glioma', 'meningioma', 'notumor', 'pituitary']


In [12]:
# Every sub-directory of TRAIN_PATH is treated as one class, in sorted order.
class_dirs = sorted(
    entry
    for entry in os.listdir(TRAIN_PATH)
    if os.path.isdir(os.path.join(TRAIN_PATH, entry))
)
# Number of entries directly inside each class folder (any file type).
class_counts = {
    cls: len(glob.glob(os.path.join(TRAIN_PATH, cls, "*")))
    for cls in class_dirs
}
class_counts

{'glioma': 1321, 'meningioma': 1339, 'notumor': 1595, 'pituitary': 1457}

In [None]:
# Scan every entry under each class folder, recording the dimensions of each
# image Pillow can open and collecting the paths it cannot.
# NOTE: Image.open is lazy -- it identifies the file and reads its header but
# does not decode pixel data, so a file with a valid header and truncated
# image data is not flagged as corrupt by this check.
sizes = []  # (width, height, class_name) for every readable image
bad = []    # paths that could not be opened as images
for cls in class_counts:
    # sorted() makes the scan order, and therefore the examples printed
    # below, reproducible; glob's own ordering is unspecified.
    for p in sorted(glob.glob(os.path.join(TRAIN_PATH, cls, "*"))):
        try:
            with Image.open(p) as im:
                sizes.append((*im.size, cls))  # (W,H,cls)
        except (UnidentifiedImageError, OSError):
            bad.append(p)

print("Corrupt files:", len(bad))
print("Examples:", bad[:3])
if sizes:
    ws, hs = zip(*[(w, h) for w, h, _ in sizes])
    print("W median/mean:", np.median(ws), np.mean(ws))
    print("H median/mean:", np.median(hs), np.mean(hs))
else:
    # zip(*[]) yields nothing to unpack, so handle the empty scan explicitly
    # instead of failing with a ValueError.
    ws, hs = (), ()
    print("No readable images found under", TRAIN_PATH)

## Visualization

In [None]:
# Bar chart: number of files found in each class folder.
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(list(class_counts.keys()), list(class_counts.values()))
ax.set(title="Class counts", xlabel="Class", ylabel="Files")
plt.show()

# Histograms of image width and height across all readable images.
if sizes:
    ws, hs = zip(*[(w, h) for w, h, _ in sizes])
    fig, (ax_w, ax_h) = plt.subplots(1, 2, figsize=(10, 3))
    ax_w.hist(ws, bins=30)
    ax_w.set(title="Width distribution", xlabel="Width (px)", ylabel="Images")
    ax_h.hist(hs, bins=30)
    ax_h.set(title="Height distribution", xlabel="Height (px)", ylabel="Images")
    fig.tight_layout()
    plt.show()
else:
    # Nothing to unpack or plot when the scan recorded no readable image.
    print("No image sizes recorded; skipping size histograms.")