In [1]:
import sys
import os, json, pathlib, shutil
import gc
import re
import cv2
import math
import numpy as np
import pandas as pd
import polars as pl
import pydicom
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split, Subset
import timm
from collections import defaultdict
from typing import List, Tuple
import shutil
import matplotlib.pyplot as plt
import random
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from IPython.display import display
import joblib
from joblib import Parallel, delayed
from pathlib import Path
import h5py, numpy as np

ROOT = "/kaggle/input/rsna-intracranial-aneurysm-detection"  
sys.path.append(ROOT)  # parent of kaggle_evaluation

import kaggle_evaluation.rsna_inference_server as rsna_eval



In [3]:
def _unit(v):
    v = np.asarray(v, float)
    n = np.linalg.norm(v)
    return v / (n + 1e-12)

def _slice_normal_from_iop(iop):
    """Compute the unit slice normal from an orientation vector.

    The first three entries of ``iop`` are treated as the row direction and
    the remaining entries as the column direction. Both are normalised, and
    the normal is their cross product, normalised again.
    """
    row_dir, col_dir = _unit(iop[:3]), _unit(iop[3:])
    return _unit(np.cross(row_dir, col_dir))

def _scalar_pos_along_normal(ds, n):
    # project IPP onto the normal (dot product) (https://discovery.ucl.ac.uk/id/eprint/10146893/1/geometry_medim.pdf)
    p = np.asarray(getattr(ds, "ImagePositionPatient", [0,0,0]), float) # or default to [0,0,0]
    return float(np.dot(n, p))

def _physical_sorted_paths(files):
    """Order DICOM files by their physical position along the slice normal.

    The slice normal is derived from the first file's
    ``ImageOrientationPatient`` (assumed identical for every slice; the
    identity orientation ``[1,0,0,0,1,0]`` is used when the attribute is
    absent). Each file's ``ImagePositionPatient`` is projected onto that
    normal and the files are sorted by the resulting scalar.

    Args:
        files: non-empty sequence of DICOM file paths.

    Returns:
        ``(sorted_paths, n)`` where ``n`` is the unit slice normal.

    Raises:
        IndexError: if ``files`` is empty.
        ValueError: if any file has no ``ImagePositionPatient``. Without it
            every such slice would project to position 0 and the "physical"
            order would silently collapse to the input order, so the caller
            is told to use another ordering instead.
    """
    # read first file to get IOP (row/col vectors)
    ds0 = pydicom.dcmread(str(files[0]), stop_before_pixels=True)
    iop = np.asarray(getattr(ds0, "ImageOrientationPatient", [1,0,0,0,1,0]), float)
    n = _slice_normal_from_iop(iop)

    keyed = []
    for pos, fp in enumerate(files):
        # reuse the header already parsed for the first file
        ds = ds0 if pos == 0 else pydicom.dcmread(str(fp), stop_before_pixels=True)
        if getattr(ds, "ImagePositionPatient", None) is None:
            raise ValueError(f"ImagePositionPatient missing in {fp}")
        keyed.append((_scalar_pos_along_normal(ds, n), fp))

    # sort on the scalar position only, so paths are never compared
    keyed.sort(key=lambda t: t[0])
    return [fp for _, fp in keyed], n

def _percentile_slice_indices(Z, target_slices, p_lo=15.0, p_hi=85.0):
    if Z <= 0 or target_slices <= 0:
        return np.array([], dtype=int)
            
    # percentiles spaced linearly from p_lo to p_hi (inclusive)
    ps = np.linspace(p_lo, p_hi, num=target_slices)
    idx = np.round((ps / 100.0) * (Z - 1)).astype(int)
    idx = np.clip(idx, 0, Z - 1)

    # de-duplicate while preserving order (can happen if Z is small) 
    uniq, first_pos = np.unique(idx, return_index=True) 
    idx = idx[np.sort(first_pos)] # Same values as np.unique, but derived from original order
    # pad if needed
    if len(idx) < target_slices:
        pad = np.full(target_slices - len(idx), idx[-1] if len(idx) else 0, dtype=int)
        idx = np.concatenate([idx, pad])
    return idx

In [4]:
def preprocess_series(series_dir, target_hw = 256, target_slices = 32,
                      p_lo = 15.0, p_hi = 85.0):
    """Load a DICOM series into a normalised ``(slices, H, W)`` volume.

    Steps:
      1. List ``*.dcm`` files in ``series_dir`` and order them by physical
         position; if that fails for any reason, order by ``InstanceNumber``.
      2. Keep either every slice (``target_slices is None``) or a
         percentile-spaced subset of ``target_slices`` slices between
         ``p_lo`` and ``p_hi``.
      3. Per slice: decode pixels, collapse multi-frame / RGB data to 2-D by
         averaging, resize to ``target_hw x target_hw``, apply
         ``RescaleSlope`` / ``RescaleIntercept`` and invert MONOCHROME1.
      4. Clip to a fixed ``[0, 500]`` window when the modality is CT/CTA,
         otherwise to the 5th-95th percentile of the non-zero voxels, then
         scale to ``[0, 1]``.

    Args:
        series_dir: directory containing the series' ``.dcm`` files.
        target_hw: output height and width of every slice.
        target_slices: number of slices to sample, or ``None`` for all.
        p_lo, p_hi: percentile range (along the slice axis) to sample from.

    Returns:
        ``float16`` array of shape ``(n_selected, target_hw, target_hw)``.

    Raises:
        FileNotFoundError: if the directory contains no ``.dcm`` files.
    """
    series_dir = Path(series_dir)
    files = sorted(series_dir.glob("*.dcm"))
    if not files:
        raise FileNotFoundError(f"No DICOMs in {series_dir}")

    # slice ordering
    try:
        paths, _ = _physical_sorted_paths(files)
    except Exception:
        # Best-effort fallback: any failure of the geometric ordering drops
        # back to InstanceNumber.
        def sort_key(fp):
            ds = pydicom.dcmread(str(fp), stop_before_pixels=True)
            inst = getattr(ds, "InstanceNumber", None)
            try:
                return float(inst)
            except (TypeError, ValueError):
                # absent, empty or non-numeric InstanceNumber must not abort
                # the fallback itself
                return 0.0
        paths = sorted(files, key=sort_key)

    # `files` is non-empty here and both orderings keep every file
    Z = len(paths)

    # choose which indices to load
    if target_slices is None:
        # use ALL slices
        sel_paths = paths
    else:
        # percentile subset
        idx = _percentile_slice_indices(Z, target_slices, p_lo=p_lo, p_hi=p_hi)
        sel_paths = [paths[i] for i in idx]

    # read one header for modality
    ds0 = pydicom.dcmread(str(paths[0]), stop_before_pixels=True)
    modality = getattr(ds0, "Modality", "Unknown")

    # preallocate final volume
    vol = np.empty((len(sel_paths), target_hw, target_hw), dtype=np.float32)
    if vol.size == 0:
        # nothing selected (e.g. target_slices <= 0): percentiles of an empty
        # array are undefined, so return the empty volume directly
        return vol.astype(np.float16, copy=False)

    for k, fp in enumerate(sel_paths):
        if k > 0 and fp == sel_paths[k - 1]:
            # padded selections repeat the same file; reuse the decoded slice
            # instead of reading and resizing it again
            vol[k] = vol[k - 1]
            continue

        ds = pydicom.dcmread(str(fp), force=True, defer_size="1 KB")

        arr = ds.pixel_array
        if arr.ndim == 3 and arr.shape[0] > 1 and arr.shape[-1] != 3:
            # multi-frame file: average the frames into a single 2-D image
            arr = arr.mean(axis=0).astype(arr.dtype, copy=False)
        if arr.ndim == 3 and arr.shape[-1] == 3:  # RGB -> gray
            arr = arr.mean(axis=-1)

        arr_small = cv2.resize(arr, (target_hw, target_hw), interpolation=cv2.INTER_AREA)

        img = arr_small.astype(np.float32, copy=False)
        slope = float(getattr(ds, "RescaleSlope", 1.0))
        inter = float(getattr(ds, "RescaleIntercept", 0.0))
        img *= slope
        img += inter

        if getattr(ds, "PhotometricInterpretation", "MONOCHROME2") == "MONOCHROME1":
            # invert within the slice's own range: new = (max + min) - old
            mmax = float(img.max()); mmin = float(img.min())
            img *= -1.0
            img += (mmax + mmin)

        vol[k] = img

        del ds, arr, arr_small, img

    # clip / normalize (in-place)
    if modality in {"CT", "CTA"}:
        lo, hi = 0.0, 500.0
    else:
        # prefer statistics of non-zero voxels; fall back to all voxels when
        # there are too few of them
        vals = vol[vol != 0]
        if vals.size >= 100:
            lo, hi = np.percentile(vals, [5, 95])
        else:
            lo, hi = np.percentile(vol, [5, 95])

    np.clip(vol, lo, hi, out=vol)
    vol -= lo
    vol /= (hi - lo + 1e-6)  # epsilon guards a flat (hi == lo) volume

    return vol.astype(np.float16, copy=False)

In [5]:
ID_COL = 'SeriesInstanceUID'

# 13 location columns
LOCATION_COLS = [
    'Left Infraclinoid Internal Carotid Artery',
    'Right Infraclinoid Internal Carotid Artery',
    'Left Supraclinoid Internal Carotid Artery',
    'Right Supraclinoid Internal Carotid Artery',
    'Left Middle Cerebral Artery',
    'Right Middle Cerebral Artery',
    'Anterior Communicating Artery',
    'Left Anterior Cerebral Artery',
    'Right Anterior Cerebral Artery',
    'Left Posterior Communicating Artery',
    'Right Posterior Communicating Artery',
    'Basilar Tip',
    'Other Posterior Circulation',
]

# full label vector: the 13 locations followed by the overall flag
LABEL_COLS = LOCATION_COLS + ['Aneurysm Present']

data_path = Path("/kaggle/input/rsna-intracranial-aneurysm-detection")
series_dir = data_path / "series"

# read labels
df = pd.read_csv(data_path / "train.csv")

# ensure labels are numerical ints BEFORE anything filters on them: the
# balanced sampling below compares "Aneurysm Present" with 1 / 0, which would
# silently match nothing if the column were still non-numeric
df[LABEL_COLS] = (
    df[LABEL_COLS].apply(pd.to_numeric, errors = 'coerce').fillna(0).astype(int)
)

# add the folder path for each series
df['path'] = df[ID_COL].apply(lambda uid: series_dir / str(uid))

# keep only rows whose folder actually exists
df = df[df['path'].apply(lambda p: p.exists())].reset_index(drop = True)

# take a balanced sample: at most n_per_class rows per class (None = keep all)
n_per_class = 1863
if n_per_class is not None:
    pos = df[df["Aneurysm Present"] == 1].sample(min(n_per_class, (df["Aneurysm Present"] == 1).sum()), random_state = 0)
    neg = df[df["Aneurysm Present"] == 0].sample(min(n_per_class, (df["Aneurysm Present"] == 0).sum()), random_state = 0)
    # shuffle so the two classes are interleaved
    df = pd.concat([pos, neg]).sample(frac = 1, random_state = 0).reset_index(drop = True)

# list of (series folder Path, label vector ordered as LABEL_COLS)
series_list = list(zip(df['path'].tolist(), df[LABEL_COLS].to_numpy(dtype=int)))

print(len(series_list), "series ready")
print(series_list[0][0]) # a path
print(series_list[0][1]) # a label

3726 series ready
/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.11798530207335736916333444551246253735
[0 0 0 0 0 0 1 0 0 0 0 0 0 1]


In [None]:
# Canonical order so all shards pick the SAME subjects deterministically
series_list = sorted(series_list, key=lambda t: t[0].name) # t = (Path, labels)
N = len(series_list)
print("Total subjects:", N)

K = 4  # total shards
shard_id = 1 # CHANGE per notebook
if not 0 <= shard_id < K:
    # an out-of-range id would match no subject and write an empty shard
    raise ValueError(f"shard_id must be in [0, {K}), got {shard_id}")

# which subjects belong to this shard (round-robin by position)
idxs = [i for i in range(N) if i % K == shard_id]
n_shard = len(idxs)
print(f"Shard {shard_id}/{K}: {n_shard} subjects")

H, W = 256, 256
if H != W:
    # preprocess_series takes a single target_hw and emits square slices
    raise ValueError("H and W must be equal")
L = len(LABEL_COLS)
TARGET_SLICES = 32  # slices sampled per series

# NOTE(review): the file name says "all_slices" but TARGET_SLICES slices are
# sampled per series - confirm which is intended before renaming either.
h5_path = f"/kaggle/working/dataset_shard_{shard_id}_all_slices.h5"

# Layout: group "x" holds one dataset per subject (keyed by its position j in
# the shard); "y", "uid" and "z_len" are parallel arrays indexed by j.
with h5py.File(h5_path, "w") as f:
    x_grp = f.create_group("x")
    y_ds = f.create_dataset("y", shape=(n_shard, L), dtype="int16", compression="gzip")
    uid_ds = f.create_dataset("uid", shape=(n_shard,), dtype=h5py.string_dtype(encoding="utf-8"))
    zlen_ds = f.create_dataset("z_len", shape=(n_shard,), dtype="int16")

    for j, i in enumerate(idxs):
        sd, y = series_list[i]

        vol = preprocess_series(sd, target_hw=H, target_slices=TARGET_SLICES)
        Z_i = vol.shape[0]

        x_grp.create_dataset(
            name = str(j),
            data = vol,
            dtype = "float16",
            compression = "gzip"
        )

        y_ds[j] = y.astype("int16")
        uid_ds[j] = sd.name  # folder name of the series
        zlen_ds[j] = Z_i

        # free the volume promptly; collect periodically to bound memory
        del vol
        if (j + 1) % 10 == 0:
            gc.collect()

        if (j + 1) % 50 == 0:
            f.flush()
            print(f"Shard {shard_id}: wrote {j+1}/{n_shard}")

print("Saved shard to:", h5_path)

Total subjects: 3726
Shard 1/4: 932 subjects
