In [1]:
import os
import glob
import argparse
import h5py
import numpy as np
%tb

MAX_PRINT_ELEMENTS = 100 

def print_dataset(name, ds):
    try:
        size = ds.size
        dtype = ds.dtype
        shape = ds.shape
        print(f"  Dataset: {name}")
        print(f"    shape: {shape}, dtype: {dtype}, elements: {size}")
        if size == 0:
            print("    (empty)")
            return
        if size <= MAX_PRINT_ELEMENTS:
            data = ds[()]
            if np.isscalar(data):
                print(f"    value: {data}")
            else:
                print(f"    value:\n{data}")
        else:
            try:
                flat_sample = np.asarray(ds).ravel()[:10]
                print(f"    sample (first 10 elements): {flat_sample}")
            except Exception:
                print("    (large dataset â€” sample unavailable)")
    except Exception as e:
        print(f"    (error reading dataset {name}: {e})")

def process_file(path):
    print(f"File: {path}")
    try:
        with h5py.File(path, "r") as f:
            def visitor(name, obj):
                if isinstance(obj, h5py.Dataset):
                    print_dataset(name, obj)
            f.visititems(visitor)
    except Exception as e:
        print(f"  Failed to open {path}: {e}")


directory = "./data"
if not os.path.isdir(directory):
    print(f"Directory not found: {directory}")

patterns = [os.path.join(directory, "*.h5")]
files = []
for p in patterns:
    files.extend(glob.glob(p))
files = sorted(files)

if not files:
    print(f"No .h5 files found in {directory}")

for path in files:
    process_file(path)

File: ./data/TCGA-AA-3861-01Z-00-DX1.1735d004-51bd-447a-add4-05f0c583c6ca.h5
  Dataset: data/average_patch_feats
    shape: (3099, 1280), dtype: float32, elements: 3966720
    sample (first 10 elements): [-0.72493017  5.0464234  -0.34299856 -0.1348111   0.4592898   0.31684345
 -0.31926262 -0.45190445  0.12918897  0.13533762]
  Dataset: data/feats
    shape: (3099, 1280), dtype: float32, elements: 3966720
    sample (first 10 elements): [-0.6249251   1.4586432   0.5834542   0.2807753  -0.59761536 -0.48566413
  0.46754742 -1.4798166   0.34278214 -0.27061844]
  Dataset: patch_grid/patch_coords
    shape: (3099, 2), dtype: float64, elements: 6198
    sample (first 10 elements): [43836.55913978  8670.96774194 44318.27956989  8670.96774194
 44800.          8670.96774194 45281.72043011  8670.96774194
 42873.11827957  9152.68817204]


No traceback available to show.


In [2]:

with h5py.File("./data/TCGA-AA-3861-01Z-00-DX1.1735d004-51bd-447a-add4-05f0c583c6ca.h5", "r") as f:
    feats = f["data/feats"]      
    avg   = f["data/average_patch_feats"]      
    coords = f["patch_grid/patch_coords"]

    print("feats:", feats.shape, feats.dtype)
    print("average_patch_feats:", avg.shape, avg.dtype)
    print("patch_coords:", coords.shape, coords.dtype)


    print("First vector (10 dims):", feats[0, :10])

    X = np.asarray(feats)                       
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
    sims = Xn @ Xn[0]
    top_idx = np.argsort(-sims)[:10]
    print("Top-10 similar patches to #0:", top_idx, sims[top_idx])

    mean_vec = X.mean(axis=0)
    print("Mean vs stored avg (L2 diff):", np.linalg.norm(mean_vec - np.asarray(avg)))

feats: (3099, 1280) float32
average_patch_feats: (3099, 1280) float32
patch_coords: (3099, 2) float64
First vector (10 dims): [-0.6249251   1.4586432   0.5834542   0.2807753  -0.59761536 -0.48566413
  0.46754742 -1.4798166   0.34278214 -0.27061844]
Top-10 similar patches to #0: [   0 2901  715   22 1586 2907  273  622   10  803] [0.9999999  0.84278667 0.8411796  0.82864296 0.8147952  0.8108687
 0.7948091  0.7936531  0.7912672  0.7860223 ]
Mean vs stored avg (L2 diff): 2137.8137
Top-10 similar patches to #0: [   0 2901  715   22 1586 2907  273  622   10  803] [0.9999999  0.84278667 0.8411796  0.82864296 0.8147952  0.8108687
 0.7948091  0.7936531  0.7912672  0.7860223 ]
Mean vs stored avg (L2 diff): 2137.8137


In [3]:
import torch, torch.nn as nn, torch.optim as optim
import numpy as np, pandas as pd, h5py
from pathlib import Path

H5_DIR = Path("./data")
labels = pd.read_csv("./data/labels.csv")
labels = labels.set_index("slide_id")["label"]
print(labels.head())

slides = [p.stem for p in H5_DIR.glob("*.h5")]
print(slides)

slides = [p.stem for p in H5_DIR.glob("*.h5") if p.stem in labels.index]

print(f"Found {len(slides)} slides with labels.")

# slides_train = slides[:int(0.8*len(slides))]
# slides_val   = slides[int(0.8*len(slides)):]

slides_train = slides #TODO: test only
slides_val   = slides

class H5SlideBags(torch.utils.data.Dataset):
    def __init__(self, h5_dir: Path, slide_ids, labels):
        self.h5_dir = h5_dir
        self.slide_ids = slide_ids
        self.labels = labels

    def __len__(self): return len(self.slide_ids)

    def __getitem__(self, i):
        sid = self.slide_ids[i]
        with h5py.File(self.h5_dir/f"{sid}.h5", "r") as f:
            X = f["data/feats"][:]     # (N_tiles, D)
        # Optional stabilisieren: L2-Norm je Patch
        X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
        y = int(self.labels.loc[sid])
        return torch.from_numpy(X.astype(np.float32)), torch.tensor(y).long(), sid

class AttnMIL(nn.Module):
    def __init__(self, d_in=1280, d_h=256, n_classes=2):
        super().__init__()
        self.proj = nn.Sequential(nn.LayerNorm(d_in), nn.Linear(d_in, d_h), nn.GELU())
        self.attn = nn.Sequential(nn.Linear(d_h, d_h), nn.Tanh(), nn.Linear(d_h, 1))
        self.head = nn.Sequential(nn.Linear(d_h, d_h), nn.GELU(), nn.Dropout(0.2), nn.Linear(d_h, n_classes))
    def forward(self, X):                   # X: (N, d_in)
        H = self.proj(X)                    # (N, d_h)
        a = self.attn(H).squeeze(-1)        # (N,)
        w = torch.softmax(a, dim=0)         # (N,)
        bag = (w.unsqueeze(-1) * H).sum(0)  # (d_h,)
        logits = self.head(bag.unsqueeze(0))# (1,C)
        return logits, w

train_ds = H5SlideBags(H5_DIR, slides_train, labels)
val_ds   = H5SlideBags(H5_DIR, slides_val,   labels)

# batch_size=1: ein Slide pro Schritt (variable Tile-Anzahl)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=1, shuffle=True)
val_loader   = torch.utils.data.DataLoader(val_ds,   batch_size=1, shuffle=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AttnMIL(d_in=1280, d_h=256, n_classes=2).to(device)
opt = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
lossf = nn.CrossEntropyLoss()

from sklearn.metrics import roc_auc_score

for epoch in range(20):
    # ---- Train
    model.train(); losses=[]
    for X,y,_ in train_loader:
        X = X[0].to(device)   # (N_tiles, D)
        y = y.to(device)
        opt.zero_grad()
        logits,_ = model(X)
        loss = lossf(logits, y)
        loss.backward(); opt.step()
        losses.append(loss.item())

    # ---- Val
    model.eval(); probs=[]; ys=[]; ids=[]
    with torch.no_grad():
        for X,y,sid in val_loader:
            X = X[0].to(device)
            p = torch.softmax(model(X)[0], dim=1)[0,1].item()
            probs.append(p); ys.append(int(y)); ids.append(sid[0])
    try:
        auroc = roc_auc_score(ys, probs)
    except:
        auroc = float('nan')
    print(f"Epoch {epoch:02d} | loss {np.mean(losses):.4f} | val AUROC {auroc:.3f}")
    
# Example output visualization for one slide
import matplotlib.pyplot as plt
slide_id = slides_val[0]
with h5py.File(H5_DIR/f"{slide_id}.h5", "r") as f:
    coords = f["patch_grid/patch_coords"][:]  # (N_tiles, 2)
    feats = f["data/feats"][:]                # (N_tiles, D)
    X = torch.from_numpy(feats.astype(np.float32)).to(device)
    _, attn_weights = model(X)                # attn_weights: (N_tiles,
    attn_weights = attn_weights.cpu().numpy()
    plt.figure(figsize=(6,6))
    plt.scatter(coords[:,0], coords[:,1], c=attn_weights, cmap='viridis', s=20)
    plt.colorbar(label='Attention Weight')
    plt.title(f'Attention Weights for Slide {slide_id}')    

slide_id
TCGA-AA-3861-01Z-00-DX1.1735d004-51bd-447a-add4-05f0c583c6ca    1
Name: label, dtype: int64
['TCGA-AA-3861-01Z-00-DX1.1735d004-51bd-447a-add4-05f0c583c6ca']
Found 1 slides with labels.




Epoch 00 | loss 0.6211 | val AUROC nan
Epoch 01 | loss 0.5003 | val AUROC nan
Epoch 02 | loss 0.3944 | val AUROC nan




Epoch 03 | loss 0.2794 | val AUROC nan
Epoch 04 | loss 0.1818 | val AUROC nan
Epoch 05 | loss 0.1702 | val AUROC nan




Epoch 06 | loss 0.1033 | val AUROC nan
Epoch 07 | loss 0.0524 | val AUROC nan
Epoch 08 | loss 0.0480 | val AUROC nan




Epoch 09 | loss 0.0254 | val AUROC nan
Epoch 10 | loss 0.0135 | val AUROC nan
Epoch 11 | loss 0.0153 | val AUROC nan




Epoch 12 | loss 0.0047 | val AUROC nan
Epoch 13 | loss 0.0069 | val AUROC nan
Epoch 14 | loss 0.0019 | val AUROC nan




Epoch 15 | loss 0.0013 | val AUROC nan
Epoch 16 | loss 0.0005 | val AUROC nan
Epoch 17 | loss 0.0003 | val AUROC nan




Epoch 18 | loss 0.0002 | val AUROC nan
Epoch 19 | loss 0.0004 | val AUROC nan


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.