> **Note:** This notebook does not contain any raw or derived data from the Sleep Heart Health Study (SHHS).  
> All cell outputs have been removed to comply with the NSRR data use agreement.  
> To reproduce results, please obtain the dataset from [https://sleepdata.org/datasets/shhs](https://sleepdata.org/datasets/shhs).

In [None]:
import os, subprocess, random, pathlib, sys, platform, subprocess, json, textwrap, pathlib, shutil, mne, numpy as np, concurrent.futures as cf, matplotlib.pyplot as plt, tensorflow as tf, seaborn as sns
from datetime import datetime
from glob import glob
from itertools import groupby, islice
from tqdm import tqdm
from lxml import etree
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from tensorflow.keras import layers, models, callbacks, mixed_precision, optimizers

In [None]:
def run(cmd):
    """Run *cmd* through the shell and return its stripped standard output.

    Best-effort helper for the environment report: a failing command never
    raises, it yields the placeholder ``"N/A"`` instead.  Empty output is
    reported as ``"N/A"`` too, because a shell pipeline's exit status is that
    of its *last* command only -- ``missing_tool | xargs`` exits with 0 and
    prints nothing, which callers could otherwise not tell apart from success.

    Args:
        cmd: Shell command line, executed with ``shell=True`` so that pipes
            work.  Only pass trusted, hard-coded strings.

    Returns:
        The command's stdout without surrounding whitespace, or ``"N/A"``
        when the command failed or printed nothing.
    """
    try:
        out = subprocess.check_output(cmd, shell=True, text=True).strip()
    except (subprocess.SubprocessError, OSError, ValueError):
        # Non-zero exit status, shell not startable, or undecodable output.
        return "N/A"
    return out or "N/A"

# CPU model: try lscpu first, then /proc/cpuinfo.  An empty result is treated
# like "N/A" because a pipeline's exit status is that of its last command, so
# a missing `lscpu` still lets the trailing `xargs` succeed without output.
cpu_model = run("lscpu | grep -m1 'Model name' | cut -d ':' -f2- | xargs")
if cpu_model in ("", "N/A"):
    cpu_model = run("grep -m1 'model name' /proc/cpuinfo | cut -d ':' -f2- | xargs")
if not cpu_model:
    cpu_model = "N/A"

# Query TensorFlow's build metadata once instead of once per entry.
build_info = tf.sysconfig.get_build_info()
n_logical = os.cpu_count()

# cuDNN runtime version from the installed header.  The character class
# accepts both "#define CUDNN_MAJOR 9" and a "CUDNN_MAJOR = 9" spelling.
cudnn_header = "/usr/include/cudnn_version.h"
cudnn_parts = [
    run(f"grep -m1 -oP 'CUDNN_{part}[\\s=]+\\K[0-9]+' {cudnn_header} 2>/dev/null")
    for part in ("MAJOR", "MINOR")
]
# Report "N/A" rather than "." / "N/A.N/A" when the header cannot be read.
cudnn_runtime = (".".join(cudnn_parts)
                 if all(p not in ("", "N/A") for p in cudnn_parts) else "N/A")

# Snapshot of the software/hardware environment, printed for reproducibility.
info = {
    "Timestamp"        : datetime.now().isoformat(timespec="seconds"),
    "Host"             : platform.node(),
    "OS"               : f"{platform.system()} {platform.release()}",
    "Python"           : platform.python_version(),
    "TensorFlow"       : tf.__version__,
    "TensorFlow Build" : build_info.get("build_type", "N/A"),
    "CUDA Built w/"    : build_info.get("cuda_version", "N/A"),
    "cuDNN Built w/"   : build_info.get("cudnn_version", "N/A"),
    "Num GPUs visible" : len(tf.config.list_physical_devices('GPU')),
    "GPU Name(s)"      : run("nvidia-smi --query-gpu=name --format=csv,noheader"),
    "GPU Driver"       : run("nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1"),
    "CUDA Runtime"     : run("nvcc --version | grep release | awk '{print $6}'"),
    "cuDNN Runtime"    : cudnn_runtime,
    "CPU Model"        : cpu_model,
    "CPU (logical)"    : n_logical,
    # Heuristic only: assumes two hardware threads per physical core.
    "CPU (physical)"   : n_logical // 2 if n_logical else "N/A",
    # `free` prints "Mem: total used free ...": $4 is the free column
    # ($3, used before, is the *used* amount and contradicted the label).
    "RAM (free/total)" : run("free -h | awk '/Mem:/ {print $4\"/\"$2}'")
}

print("\n".join(f"{k:<16}: {v}" for k,v in info.items()))

In [None]:
# Target folder for the per-subject window files.  The preprocessing below is
# skipped entirely as soon as at least one block file is already present.
OUT_DIR = pathlib.Path("./blocks_4signale_1Hz")
OUT_DIR.mkdir(exist_ok=True)
if next(OUT_DIR.glob("block_*.npz"), None) is None:
    # --- input locations --------------------------------------------------
    DATA_DIR = pathlib.Path("~/data/nsrr/shhs/polysomnography").expanduser()
    EDF_DIR = DATA_DIR / "edfs/shhs2"
    ANN_DIR = DATA_DIR / "annotations-events-nsrr/shhs2"

    # --- windowing / labelling parameters (used as sample counts at 1 Hz) --
    WIN_SEC = 60
    STEP_SEC = 30
    MIN_APNEA_S = 10

    # --- parallelism: worker processes and files per pool batch -----------
    N_WORKER = 16
    CHUNK = 600

    # Canonical channel order and the upper-case labels accepted for each.
    ORDER = ["SAO2", "HR", "THOR RES", "ABDO RES"]
    ALIAS = {
        "SAO2": ["SAO2", "SPO2", "SATS"],
        "HR": ["PR", "PULSERATE", "HR"],
        "THOR RES": ["THOR RES", "THOR", "THO"],
        "ABDO RES": ["ABDO RES", "ABD", "ABDO"],
    }

    def subj_id(p):
        """Return the part of the file name between the first '-' and the next '.'."""
        after_dash = p.name.split("-")[1]
        return after_dash.partition(".")[0]

    def contig(a, n):
        """Return True if *a* contains a run of at least *n* consecutive truthy items."""
        for flag, members in groupby(a):
            if flag and sum(1 for _ in members) >= n:
                return True
        return False

    # Array backend: CuPy when importable, otherwise NumPy with a no-op
    # "copy to host" function.
    try:
        import cupy as xp
    except ImportError:
        import numpy as xp

        def to_cpu(a):
            return a
    else:
        to_cpu = xp.asnumpy

    def apnea_events(xml):
        """Yield ``(start, duration)`` for every apnea/hypopnea event in *xml*.

        An event is selected when the lower-cased text
        ``"<EventType>|<EventConcept>"`` contains one of the wanted labels.

        Args:
            xml: Path (``str`` or ``pathlib.Path``) of the annotation file.

        Yields:
            Tuples ``(start, duration)`` as floats, taken from the event's
            ``Start`` and ``Duration`` elements.  Events whose timing is
            missing or not numeric are skipped, since they cannot be placed
            on the time axis.
        """
        wanted = ("obstructive apnea", "central apnea", "mixed apnea", "hypopnea")
        # str(): some lxml releases only accept plain strings as file names,
        # not pathlib.Path objects.
        for ev in etree.parse(str(xml)).findall(".//ScoredEvent"):
            label = (f"{ev.findtext('EventType', '').lower()}|"
                     f"{ev.findtext('EventConcept', '').lower()}")
            if not any(k in label for k in wanted):
                continue
            try:
                start = float(ev.findtext("Start"))
                duration = float(ev.findtext("Duration"))
            except (TypeError, ValueError):
                # findtext() returned None (element missing) or unparsable
                # text; one malformed event must not abort the whole file.
                continue
            yield start, duration

    def find_ch(raw, keys):
        """Return the channel of *raw* matching the first hit in *keys*.

        Channel names are compared case-insensitively (upper-cased) but
        otherwise verbatim, so e.g. a label "H.R." does not match key "HR".

        Args:
            raw: Object exposing ``ch_names`` (an iterable of channel labels).
            keys: Upper-case candidate labels, highest priority first.

        Returns:
            The original spelling of the matching channel name, or ``None``
            when none of *keys* is present.
        """
        by_upper = {}
        for name in raw.ch_names:
            by_upper[name.upper()] = name  # a later duplicate overrides an earlier one
        return next((by_upper[k] for k in keys if k in by_upper), None)

    def process_one(edf_path):
        """Convert one EDF recording into a file of labelled, normalised windows.

        Steps: pick the four channels listed in ``ORDER``, resample to 1 Hz,
        mark every sample covered by an apnea/hypopnea event, cut windows of
        ``WIN_SEC`` samples every ``STEP_SEC`` samples, z-score each window per
        channel, and label a window 1 when it contains at least
        ``MIN_APNEA_S`` consecutive event samples.

        The result is stored as ``OUT_DIR/block_<sid>.npz`` with arrays
        ``X`` (n_windows, WIN_SEC, 4; float16), ``y`` (int8) and ``subj_id``.
        Nothing is written when a channel is missing or the recording is
        shorter than one window.

        Args:
            edf_path: ``pathlib.Path`` of the EDF file.

        Returns:
            The subject id extracted from the file name (also when skipped).
        """
        sid = subj_id(edf_path)
        out_p = OUT_DIR/f"block_{sid}.npz"
        if out_p.exists():
            return sid

        raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
        chs = [find_ch(raw, ALIAS[c]) for c in ORDER]
        if None in chs:
            print(f"{sid}: Kanal fehlt – skip")
            return sid
        # Depending on the MNE version, pick() may keep the file's own channel
        # order; reorder explicitly so the columns of X always follow ORDER.
        raw.pick(chs).reorder_channels(chs).resample(1, npad="auto")

        X = xp.asarray(raw.get_data()).T.astype(xp.float32)  # (samples, channels)
        L = X.shape[0]
        # Labels stay on the host: they are only sliced and iterated by
        # contig(), which would be very slow element-by-element on a GPU array.
        y = np.zeros(L, np.int8)
        for s, d in apnea_events(ANN_DIR/f"shhs2-{sid}-nsrr.xml"):
            lo = max(int(s), 0)        # a negative start must not wrap around
            hi = min(int(s + d), L)
            if hi > lo:
                y[lo:hi] = 1

        X_blk, y_blk = [], []
        for t0 in range(0, L-WIN_SEC+1, STEP_SEC):
            seg = X[t0:t0+WIN_SEC]
            # per-window, per-channel z-score; epsilon guards flat signals
            seg = (seg - seg.mean(0)) / (seg.std(0)+1e-8)
            X_blk.append(seg)
            y_blk.append(int(contig(y[t0:t0+WIN_SEC], MIN_APNEA_S)))

        if X_blk:
            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a truncated block_*.npz behind that
            # the exists() check above would then skip forever.
            tmp_p = OUT_DIR/f".tmp_{sid}.npz"
            np.savez_compressed(tmp_p,
                X=to_cpu(xp.stack(X_blk)).astype(np.float16),
                y=np.asarray(y_blk, np.int8),
                subj_id=np.asarray([sid]*len(y_blk), '<U10'))
            tmp_p.replace(out_p)
        return sid

    # Recordings to process: only EDFs with a matching annotation file,
    # ordered from the largest file to the smallest.
    edf_paths = [
        p for p in EDF_DIR.glob("*.edf")
        if (ANN_DIR / f"shhs2-{subj_id(p)}-nsrr.xml").exists()
    ]
    edf_paths.sort(key=lambda p: p.stat().st_size, reverse=True)

    def chunked(it, n):
        """Yield successive lists of at most *n* items taken from *it*.

        The last list may be shorter; nothing is yielded for an empty input.
        """
        source = iter(it)
        while True:
            piece = list(islice(source, n))
            if not piece:
                return
            yield piece

    # Work through the recordings in batches of CHUNK files; each batch gets
    # its own process pool, which is shut down before the next batch starts.
    for bi, batch in enumerate(chunked(edf_paths, CHUNK), start=1):
        print(f"\nBatch {bi} – {len(batch)} Dateien")
        with cf.ProcessPoolExecutor(max_workers=N_WORKER) as pool:
            results = pool.map(process_one, batch)
            # Draining the iterator drives the progress bar and waits for
            # every file of the batch; the returned ids are not needed.
            for _ in tqdm(results, total=len(batch), desc=f"Batch {bi}"):
                pass

In [None]:
# Subject-level 80/20 split: every block_*.npz file holds the windows of one
# recording, so splitting by file keeps a subject out of both sets.
DATA_DIR = pathlib.Path("./blocks_4signale_1Hz")
# sorted(): glob() order depends on the file system, so fix it before shuffling.
all_npz  = sorted(DATA_DIR.glob("block_*.npz"))
print("Found PSG-NPZ:", len(all_npz))

# Seeded, private RNG: the same files end up in the validation set in every
# session, so a model reloaded later is never evaluated on subjects it was
# trained on.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_npz)
split = int(0.8 * len(all_npz))
train_files, val_files = all_npz[:split], all_npz[split:]

def npz_to_windows(path):
    """Yield ``(window, label)`` pairs from one block file.

    Args:
        path: Location of an ``.npz`` archive containing arrays ``X`` and ``y``.

    Yields:
        ``(xi, yi)`` -- one row of ``X`` (float16) with the matching entry of
        ``y`` (int8), in file order.
    """
    # The archive is opened as a context manager so its file handle is closed
    # right after both arrays are read.  (mmap_mode is not passed any more:
    # it only applies to plain .npy files, not to .npz archives.)
    with np.load(str(path)) as d:
        X = d["X"].astype("float16")            # (n_win, 60, 4)
        y = d["y"].astype("int8")
    yield from zip(X, y)

def make_dataset(files, batch=256, shuffle=True):
    """Build a batched, prefetched ``tf.data`` pipeline over block files.

    Args:
        files: Iterable of ``.npz`` paths readable by ``npz_to_windows``.
        batch: Number of windows per batch.
        shuffle: When true, the file order is reshuffled on every pass over
            the data and windows are additionally mixed through a
            10 000-element shuffle buffer.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x, y)`` batches with ``x`` of shape
        (batch, 60, 4) / float16 and ``y`` of shape (batch,) / int8.
    """
    files = list(files)  # snapshot, so the generator can be restarted safely

    def gen():
        order = list(files)
        if shuffle:
            # The shuffle buffer only spans a few consecutive files; changing
            # the file order on each pass lets windows of different
            # recordings actually meet in one batch.
            random.shuffle(order)
        for p in order:
            yield from npz_to_windows(p)

    ds = tf.data.Dataset.from_generator(
            gen,
            output_signature=(
                tf.TensorSpec((60, 4), tf.float16),
                tf.TensorSpec((),      tf.int8)))
    if shuffle:
        ds = ds.shuffle(10_000)
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

BATCH = 256
# Training data repeats indefinitely; epoch length is set by steps_per_epoch.
train_ds = make_dataset(train_files, BATCH, shuffle=True).repeat()
val_ds   = make_dataset(val_files,  BATCH, shuffle=False)

# Steps per epoch (20 % of all training windows)
FRACTION = 0.20
tot_train_win = 0
for p in train_files:
    # context manager: close each archive as soon as its size is known
    with np.load(p) as d:
        tot_train_win += d['y'].size
# At least one step, so a very small data set does not produce 0 steps.
steps_per_epoch = max(1, int(FRACTION * tot_train_win / BATCH))
# Upper bound on validation batches per epoch.
val_steps = 1000
print(f"steps/epoch = {steps_per_epoch},  val_steps = {val_steps}")

In [None]:
# Global Keras dtype policy: layers created from here on compute in float16
# while keeping their variables in float32.
mixed_precision.set_global_policy('mixed_float16')

def make_model():
    """Build the (uncompiled) 1-D CNN that classifies one signal window.

    Input:  tensor of shape (60, 4).
    Body:   Conv1D(64) and Conv1D(128), each followed by batch normalisation
            and max-pooling by 2, then Conv1D(256) and global average pooling.
    Head:   Dense(128, relu), Dropout(0.3), single sigmoid unit.  The output
            layer is pinned to float32 regardless of the global dtype policy.

    Returns:
        A ``keras`` ``Model`` mapping the input window to one probability.
    """
    inputs = layers.Input(shape=(60, 4))

    h = inputs
    # two identical conv -> batch-norm -> pool stages with growing width
    for n_filters in (64, 128):
        h = layers.Conv1D(n_filters, 3, padding='same', activation='relu')(h)
        h = layers.BatchNormalization()(h)
        h = layers.MaxPool1D(2)(h)

    # last conv stage is collapsed over time instead of pooled
    h = layers.Conv1D(256, 3, padding='same', activation='relu')(h)
    h = layers.GlobalAveragePooling1D()(h)

    h = layers.Dense(128, activation='relu')(h)
    h = layers.Dropout(0.3)(h)
    outputs = layers.Dense(1, activation='sigmoid', dtype='float32')(h)
    return models.Model(inputs, outputs)

# Instantiate the network and print its layer overview.
model = make_model()
model.summary()

# Binary classification set-up: Adam with learning rate 1e-3, cross-entropy
# loss, and AUC / precision / recall reported under the names the training
# callbacks monitor ("AUC", "Prec", "Rec").
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.AUC(name='AUC'),
        tf.keras.metrics.Precision(name='Prec'),
        tf.keras.metrics.Recall(name='Rec'),
    ],
)

In [None]:
# --- epoch length ---------------------------------------------------------
BATCH     = 256
FRACTION  = 0.20   # 20 %
tot_train = 0
for p in train_files:
    # context manager: close each archive as soon as its size is known
    with np.load(p) as d:
        tot_train += d["y"].size
# At least one step, so a very small data set does not produce 0 steps.
steps_ep  = max(1, int(FRACTION * tot_train / BATCH))
val_steps = 1000
print(f"steps/epoch = {steps_ep},  val_steps = {val_steps}")

# Mixed Precision
mixed_precision.set_global_policy('mixed_float16')

# --- fresh model ----------------------------------------------------------
model = make_model()
model.compile(
    optimizer=optimizers.Adam(1e-3),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='AUC'),
             tf.keras.metrics.Precision(name='Prec'),
             tf.keras.metrics.Recall(name='Rec')]
)

# Both callbacks watch the validation AUC (higher is better): the learning
# rate is cut to 30 % after 3 epochs without improvement, training stops
# after 6 and the best weights are restored.
cb = [
    callbacks.ReduceLROnPlateau('val_AUC', factor=0.3, patience=3, mode='max'),
    callbacks.EarlyStopping('val_AUC', patience=6,
                            restore_best_weights=True, mode='max')
]

history = model.fit(
    train_ds.repeat(),   # harmless if the dataset already repeats
    epochs=50,
    steps_per_epoch=steps_ep,
    validation_data=val_ds,
    validation_steps=val_steps,
    callbacks=cb
)

# EarlyStopping.stopped_epoch is 0-based and stays 0 when early stopping never
# fired, so report the number of epochs that actually ran instead (1-based,
# matching the epoch numbers in the Keras progress output).
stop_epoch = len(history.epoch)
best_val   = max(history.history["val_AUC"])
print(f"Training stoppte bei Epoche {stop_epoch}.  Best val_AUC = {best_val:.4f}")

In [None]:
# Persist the trained network in three formats plus the training curves.
SAVE_DIR = pathlib.Path("./apnea_cnn_saved")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# 1) native Keras archive, without optimizer state
model.save(SAVE_DIR / "net.keras", include_optimizer=False)

# 2) weights only
model.save_weights(SAVE_DIR / "net.weights.h5")

# 3) TensorFlow SavedModel export
model.export(SAVE_DIR / "saved_model")

# 4) training history as JSON; values are converted to plain Python floats
#    first so that they are serialisable
import json
history_as_floats = {}
for metric_name, values in history.history.items():
    history_as_floats[metric_name] = [float(v) for v in values]
with open(SAVE_DIR / "history.json", "w") as fh:
    json.dump(history_as_floats, fh, indent=2)

print("saved to ", SAVE_DIR)



In [None]:
# Reload the stored network.  It is loaded uncompiled and then compiled anew
# with a default Adam optimizer and plain accuracy as the only metric.
MODEL_DIR = pathlib.Path("./apnea_cnn_saved")
model_file = MODEL_DIR / "net.keras"

model = tf.keras.models.load_model(model_file, compile=False)
print("Model loaded.")

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# eval + confusion

# Collect labels and scores for at most val_steps validation batches.
y_true, y_score = [], []
for xb, yb in val_ds.take(val_steps): # val_ds from training
    # reshape(-1) instead of squeeze(): a final batch holding a single window
    # would otherwise become a 0-d array, which np.concatenate rejects.
    y_true.append(yb.numpy().astype(int).reshape(-1))
    # predict_on_batch evaluates exactly this batch without setting up a new
    # prediction loop on every iteration.
    y_score.append(np.asarray(model.predict_on_batch(xb)).reshape(-1))

y_true = np.concatenate(y_true)
y_score= np.concatenate(y_score)
y_pred = (y_score >= 0.5).astype(int)   # fixed decision threshold of 0.5

# Threshold-free summaries: area under the ROC and precision-recall curves.
roc_auc = roc_auc_score(y_true, y_score)
prec, rec, _ = precision_recall_curve(y_true, y_score)
pr_auc  = auc(rec, prec)
print(classification_report(y_true, y_pred, digits=3))
print(f"ROC-AUC : {roc_auc:.3f}")
print(f"PR-AUC  : {pr_auc:.3f}")

# Confusion-Matrix
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Apnea','Apnea'],
            yticklabels=['No Apnea','Apnea'])
plt.title('Confusion Matrix (validation windows)')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.tight_layout(); plt.show()

In [None]:
# ROC curve of the validation windows (labels/scores from the evaluation cell)
fpr_val, tpr_val, _ = roc_curve(y_true, y_score)
auc_val = roc_auc_score(y_true, y_score)

# ROC curve on SHHS-1
# NOTE(review): X_shhs and y_shhs are not created in the cells visible here --
# presumably they are loaded elsewhere; confirm before running this cell.
ŷ_shhs  = model.predict(X_shhs, verbose=0)[:, 0]
fpr_s, tpr_s, _ = roc_curve(y_shhs, ŷ_shhs)
auc_shhs = roc_auc_score(y_shhs, ŷ_shhs)

# Draw both curves into one axes, validation first.
fig, ax = plt.subplots(figsize=(6, 5))
curves = (
    (fpr_val, tpr_val, auc_val,  "Validation"),
    (fpr_s,   tpr_s,   auc_shhs, "SHHS-1"),
)
for fpr, tpr, area, label in curves:
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=area,
                              estimator_name=label)
    display.plot(ax=ax)

# diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], 'k--', lw=.8)
ax.set_title("ROC-Comparison: Validation vs. SHHS-1")
ax.grid(True)
plt.tight_layout()
plt.show()
