<a href="https://colab.research.google.com/github/Adamh25/Amazon-sentiment-analysis/blob/main/Real_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torchio -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.9/193.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# --- 0. Mount Google Drive and set project paths ---
from google.colab import drive
import os, glob

drive.mount('/content/drive', force_remount=True)

# Find your LIFE363_CEREBRO project folder
ROOT = "/content/drive/MyDrive/LIFE363_CEREBRO"
assert os.path.isdir(ROOT), f"❌ Folder not found: {ROOT}"

# COSTA dataset inside it
COSTA = os.path.join(ROOT, "COSTA-Dataset-v1")
IMG_DIR = os.path.join(COSTA, "imagesTr")
LBL_DIR = os.path.join(COSTA, "labelsTr")

# Fail fast if the dataset layout is not what the later cells expect; otherwise
# the recursive globs simply find nothing and the run "succeeds" on 0 subjects.
for _required_dir in (COSTA, IMG_DIR, LBL_DIR):
    assert os.path.isdir(_required_dir), f"❌ Folder not found: {_required_dir}"

print("Project root:", ROOT)
print("imagesTr:", IMG_DIR)
print("labelsTr:", LBL_DIR)


Mounted at /content/drive
Project root: /content/drive/MyDrive/LIFE363_CEREBRO
imagesTr: /content/drive/MyDrive/LIFE363_CEREBRO/COSTA-Dataset-v1/imagesTr
labelsTr: /content/drive/MyDrive/LIFE363_CEREBRO/COSTA-Dataset-v1/labelsTr


In [5]:
import glob
import os

import nibabel as nib
import numpy as np
import torchio as tio  # only for convenient resampling
from tqdm.auto import tqdm  # picks the notebook widget bar when available

# Output folder for the preprocessed volumes, created under the project root.
DIR_OUT = os.path.join(ROOT, "data_preprocessed")
os.makedirs(DIR_OUT, exist_ok=True)

# Voxel spacing handed to tio.Resample for every volume, roughly (0.47, 0.47, 0.80).
# NOTE(review): presumably millimetres, as TorchIO expects — confirm against the data.
TARGET_SPACING = (0.46875, 0.46875, 0.8)
# (lower, upper) intensity percentiles used for clipping before z-scoring.
PCT = (1, 99)

def clip_zscore(vol, p1=1, p99=99):
    """Clip intensities to a percentile window, then standardise them.

    Args:
        vol: array-like of intensities.
        p1: lower percentile used as the clipping floor.
        p99: upper percentile used as the clipping ceiling.

    Returns:
        The clipped values minus their mean, divided by their standard
        deviation plus 1e-8 (the epsilon keeps constant inputs finite).
    """
    bounds = np.percentile(vol, [p1, p99])
    clipped = np.clip(vol, bounds[0], bounds[1])
    centre = clipped.mean()
    spread = clipped.std() + 1e-8
    return (clipped - centre) / spread

def save_preprocessed(img_path, lbl_path):
    """Normalise, resample and save one image/label pair as NIfTI files.

    The image is percentile-clipped and z-scored with ``clip_zscore`` using
    ``PCT``, then image and label are resampled to ``TARGET_SPACING`` with
    TorchIO and written to ``DIR_OUT`` as ``<base>_<centre>_img.nii.gz`` and
    ``<base>_<centre>_mask.nii.gz``.

    Args:
        img_path: path to the image ``.nii.gz``; its parent folder name is
            used as the centre tag in the output file names.
        lbl_path: path to the matching label ``.nii.gz``.

    Returns:
        Tuple ``(out_img, out_msk)`` with the two written file paths.

    Raises:
        ValueError: if image and label do not have the same voxel grid shape.
    """
    # derive centre (parent folder name) and basename (file name without .nii.gz)
    centre = os.path.basename(os.path.dirname(img_path))
    base = os.path.basename(img_path).replace(".nii.gz", "")

    # load
    img_nii = nib.load(img_path)
    lbl_nii = nib.load(lbl_path)

    # The label is placed on the image's affine below, which is only valid when
    # both volumes share one voxel grid; refuse silently misaligned pairs.
    if img_nii.shape != lbl_nii.shape:
        raise ValueError(
            f"Image/label shape mismatch for {base}: "
            f"{img_nii.shape} vs {lbl_nii.shape} ({img_path} | {lbl_path})"
        )

    I_np = img_nii.get_fdata().astype(np.float32)
    # get_fdata() always yields floats; round before the integer cast so a label
    # stored as e.g. 0.9999 (header scaling) is not truncated to background.
    L_np = np.rint(lbl_nii.get_fdata()).astype(np.uint8)
    affine = img_nii.affine

    # intensity preprocessing
    I_np = clip_zscore(I_np, *PCT)

    # build TorchIO images for resampling (channel axis added in front)
    img_t = tio.ScalarImage(tensor=I_np[None, ...], affine=affine)
    lbl_t = tio.LabelMap(tensor=L_np[None, ...], affine=affine)

    # resample both volumes to the common target spacing
    resample = tio.Resample(target=TARGET_SPACING)
    img_r = resample(img_t)
    lbl_r = resample(lbl_t)

    I_out = img_r.tensor.squeeze(0).numpy().astype(np.float32)
    L_out = lbl_r.tensor.squeeze(0).numpy().astype(np.uint8)
    aff_out = img_r.affine

    # write with unified naming; the mask reuses the resampled image affine
    out_img = os.path.join(DIR_OUT, f"{base}_{centre}_img.nii.gz")
    out_msk = os.path.join(DIR_OUT, f"{base}_{centre}_mask.nii.gz")
    nib.save(nib.Nifti1Image(I_out, aff_out), out_img)
    nib.save(nib.Nifti1Image(L_out, aff_out), out_msk)
    return out_img, out_msk

# build pairs from imagesTr/labelsTr (matched purely by file name)
img_paths = sorted(glob.glob(f"{IMG_DIR}/**/*.nii.gz", recursive=True))
lbl_paths = sorted(glob.glob(f"{LBL_DIR}/**/*.nii.gz", recursive=True))
imgs = {os.path.basename(p): p for p in img_paths}
lbls = {os.path.basename(p): p for p in lbl_paths}

# Keying by basename keeps only the last path for a repeated file name, so
# same-named files in different sub-folders would be dropped without notice.
if len(imgs) != len(img_paths) or len(lbls) != len(lbl_paths):
    print(
        f"⚠️ Duplicate file names detected: {len(img_paths) - len(imgs)} image(s) and "
        f"{len(lbl_paths) - len(lbls)} label(s) are shadowed by a same-named file."
    )

common = sorted(set(imgs) & set(lbls))

# Files present on only one side are skipped; say so instead of hiding it.
unpaired = sorted(set(imgs) ^ set(lbls))
if unpaired:
    print(f"⚠️ Skipping {len(unpaired)} file(s) without a matching image/label, e.g. {unpaired[:3]}")

print(f"Preprocessing {len(common)} subjects → {DIR_OUT}")
done = 0
for name in tqdm(common):
    out_img, out_msk = save_preprocessed(imgs[name], lbls[name])
    done += 1
print("Done:", done)


Preprocessing 224 subjects → /content/drive/MyDrive/LIFE363_CEREBRO/data_preprocessed


100%|██████████| 224/224 [33:00<00:00,  8.84s/it]

Done: 224





In [6]:
# --- installs
!pip -q install monai-weekly nibabel torchio scikit-image matplotlib tqdm

import os, glob, shutil, random, json, time
import numpy as np, torch

# --- deterministic-ish
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# --- mount Drive robustly
from google.colab import drive
MOUNT = "/content/drive"
if not os.path.ismount(MOUNT):
    if os.path.isdir(MOUNT) and os.listdir(MOUNT):
        shutil.rmtree(MOUNT)
    os.makedirs(MOUNT, exist_ok=True)
    drive.mount(MOUNT, force_remount=True)

# --- find LIFE363 root + data_preprocessed
def find_dir(pattern, base=None):
    """Locate a directory whose name matches ``pattern`` below a search root.

    Args:
        pattern: glob pattern (or plain name) for the final path component.
        base: directory to search recursively; defaults to ``{MOUNT}/MyDrive``.

    Returns:
        The matching directory closest to the search root; ties are broken
        alphabetically so the result is the same on every run.

    Raises:
        FileNotFoundError: if no directory matches.
    """
    search_root = os.path.join(MOUNT, "MyDrive") if base is None else base
    where = "MyDrive" if base is None else base
    # glob gives no ordering guarantee and also returns plain files, so keep
    # directories only and choose deterministically instead of taking hits[0].
    hits = [
        hit
        for hit in glob.glob(os.path.join(search_root, "**", pattern), recursive=True)
        if os.path.isdir(hit)
    ]
    if not hits:
        raise FileNotFoundError(f"Couldn't find a folder named '{pattern}' under {where}.")
    return min(hits, key=lambda hit: (hit.count(os.sep), hit))

# Catch only the "not found" signal from find_dir; a bare except would also
# swallow KeyboardInterrupt and genuine bugs.
try:
    ROOT = find_dir("LIFE363_CEREBRO")
except FileNotFoundError:
    ROOT = find_dir("LIFE363")

# Prefer the preprocessed folder inside this project, so a same-named folder
# from another project elsewhere on Drive is never picked up by accident.
DIR_PREP = os.path.join(ROOT, "data_preprocessed")
if not os.path.isdir(DIR_PREP):
    try:
        DIR_PREP = find_dir("data_preprocessed")
    except FileNotFoundError:
        # fallback: any folder under ROOT with 'preprocess' in its name
        candidates = sorted(
            c for c in glob.glob(f"{ROOT}/**/*preprocess*", recursive=True)
            if os.path.isdir(c)
        )
        if not candidates:
            raise FileNotFoundError(f"No preprocessed-data folder found under {ROOT}")
        DIR_PREP = candidates[0]

DIR_CKPT = os.path.join(ROOT, "checkpoints"); os.makedirs(DIR_CKPT, exist_ok=True)
DIR_LOGS = os.path.join(ROOT, "logs"); os.makedirs(DIR_LOGS, exist_ok=True)
print("ROOT:", ROOT)
print("DIR_PREP:", DIR_PREP)
print("DIR_CKPT:", DIR_CKPT)
print("DIR_LOGS:", DIR_LOGS)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/2.7 MB[0m [31m24.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.7/2.7 MB[0m [31m44.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cpu
ROOT: /content/drive/MyDrive/LIFE363_CEREBRO
DIR_PREP: /content/drive/MyDrive/LIFE363_CEREBRO/data_preprocessed
DIR_CKPT: /content/drive/MyDrive/LIFE363_CEREBRO/checkpoints
DIR_LOGS: /content/drive/MyDrive/LIFE363_CEREBRO/logs


In [7]:
# Collect the preprocessed volumes at any depth below DIR_PREP:
# first every *_img.nii.gz, then every *_mask.nii.gz, each list sorted.
imgs, masks = (
    sorted(glob.glob(f"{DIR_PREP}/**/*{suffix}", recursive=True))
    for suffix in ("_img.nii.gz", "_mask.nii.gz")
)

# Pair by common stem up to _img/_mask
def stem(p):
    """Return the file name of ``p`` with the ``_img.nii.gz`` and
    ``_mask.nii.gz`` markers removed, giving the key shared by a pair."""
    name = os.path.basename(p)
    for marker in ("_img.nii.gz", "_mask.nii.gz"):
        name = name.replace(marker, "")
    return name

# Group paths per subject stem: {stem: {"img": path, "mask": path}}.
# Images are inserted first, so the pair order follows the sorted image list.
by_stem = {}
for p in imgs:
    by_stem.setdefault(stem(p), {})["img"] = p
for p in masks:
    by_stem.setdefault(stem(p), {})["mask"] = p

# Keep only subjects that have both an image and a mask.
pairs = [
    {key: entry[key] for key in ("img", "mask")}
    for entry in by_stem.values()
    if entry.keys() >= {"img", "mask"}
]
print(f"Found {len(pairs)} image–mask pairs")
if len(pairs) < 2:
    print("⚠️ Need ≥2 for a split; proceed anyway but validation will be minimal.")

# Simple split: last N as validation (site/subject-wise split comes later).
# With fewer than two pairs, train and validation share the same list.
VAL_COUNT = max(1, len(pairs)//5)  # ~20% val
if len(pairs) > 1:
    train_files, val_files = pairs[:-VAL_COUNT], pairs[-VAL_COUNT:]
else:
    train_files = val_files = pairs

print(f"Train: {len(train_files)} | Val: {len(val_files)}")
for d in val_files[:3]:
    print("VAL →", os.path.basename(d["img"]))


Found 226 image–mask pairs
Train: 181 | Val: 45
VAL → IXI426-IOP-1011-MRA_IXI-IOP_IXI-IOP_img.nii.gz
VAL → IXI430-IOP-0990-MRA_IXI-IOP_IXI-IOP_img.nii.gz
VAL → IXI462-IOP-1042-MRA_IXI-IOP_IXI-IOP_img.nii.gz


In [1]:
# Needs `train_files` / `val_files` from the pairing cell, so run that cell first.
import torch
from monai.transforms import (
    LoadImaged, EnsureChannelFirstd, RandCropByPosNegLabeld,
    RandFlipd, RandRotate90d, ToTensord, Compose
)
from monai.data import CacheDataset, DataLoader

PATCH = (128,128,64)
PATCHES_PER_EPOCH = 256  # increase as you add data (e.g., 512+)
BATCH_SIZE = 2

# RandCropByPosNegLabeld's `num_samples` is the number of patches cropped from
# EACH volume, not per epoch. Passing PATCHES_PER_EPOCH there would make every
# volume emit 256 patches (512 per batch of 2). Derive the per-volume count so
# that one pass over the training set yields roughly PATCHES_PER_EPOCH patches.
SAMPLES_PER_VOLUME = max(1, -(-PATCHES_PER_EPOCH // max(1, len(train_files))))  # ceil division

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runtimes.
PIN_MEMORY = torch.cuda.is_available()

train_t = Compose([
    LoadImaged(keys=["img","mask"]),
    EnsureChannelFirstd(keys=["img","mask"]),
    RandCropByPosNegLabeld(
        keys=["img","mask"], label_key="mask", spatial_size=PATCH,
        pos=2, neg=1, num_samples=SAMPLES_PER_VOLUME, image_key="img", image_threshold=0
    ),
    RandFlipd(keys=["img","mask"], prob=0.5, spatial_axis=0),
    RandFlipd(keys=["img","mask"], prob=0.5, spatial_axis=1),
    RandFlipd(keys=["img","mask"], prob=0.5, spatial_axis=2),
    RandRotate90d(keys=["img","mask"], prob=0.5, max_k=3),
    ToTensord(keys=["img","mask"]),
])

# Validation runs on whole volumes: load and add the channel axis, no cropping.
val_t = Compose([
    LoadImaged(keys=["img","mask"]),
    EnsureChannelFirstd(keys=["img","mask"]),
    ToTensord(keys=["img","mask"]),
])

# NOTE(review): cache_rate=1.0 keeps every loaded volume in RAM; lower it if the
# runtime runs out of memory or the cache build takes too long.
train_ds = CacheDataset(train_files, transform=train_t, cache_rate=1.0, num_workers=2)
val_ds   = CacheDataset(val_files,   transform=val_t,   cache_rate=1.0, num_workers=2)

# Each training batch holds BATCH_SIZE * SAMPLES_PER_VOLUME patches.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds,   batch_size=1,      shuffle=False, num_workers=2, pin_memory=PIN_MEMORY)

len(train_ds), len(val_ds)


KeyboardInterrupt: 