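## Patient split

Check whether any `subject_id` appears in more than one split (patient-level leakage between train, tuning and held-out data).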

In [1]:
import os
import pickle
from collections import defaultdict

In [2]:
# Root directory of the tokenised dataset; it is expected to contain one sub-directory per
# split. Defaults to the original cluster location, but can be redirected without editing
# the notebook by setting the CPRD_UPGI_DATA_ROOT environment variable.
DATA_ROOT = os.environ.get(
    "CPRD_UPGI_DATA_ROOT",
    "/data/scratch/qc25022/pancreas/tokenised_data_word_level/cprd_upgi",
)
# Names of the split sub-directories under DATA_ROOT; add any extra splits you used.
SPLITS = ["train", "tuning", "held_out"]

def collect_subjects(split):
    """Return the set of ``subject_id`` values found in one split directory.

    Every ``*.pkl`` file directly inside ``DATA_ROOT/<split>`` is unpickled and iterated;
    each record's ``"subject_id"`` entry is collected. Other files are ignored.

    Args:
        split: Name of the split sub-directory under ``DATA_ROOT``.

    Returns:
        A ``set`` with one entry per distinct subject id seen in the split.
    """
    split_dir = os.path.join(DATA_ROOT, split)
    pickle_paths = [
        os.path.join(split_dir, entry)
        for entry in os.listdir(split_dir)
        if entry.endswith(".pkl")
    ]

    seen_ids = set()
    for path in pickle_paths:
        with open(path, "rb") as handle:
            # NOTE: unpickling can execute arbitrary code; only read trusted files here.
            seen_ids.update(record["subject_id"] for record in pickle.load(handle))
    return seen_ids

# Subject-id set for every configured split, keyed by split name.
split_subjects = dict((name, collect_subjects(name)) for name in SPLITS)

# Pairwise intersection report: the string comparison keeps exactly one ordering of each
# pair (and skips a split paired with itself), so every unordered pair is printed once,
# labelled with the alphabetically smaller split name first.
for a in SPLITS:
    for b in SPLITS:
        if a < b:
            overlap = split_subjects[a] & split_subjects[b]
            print(f"{a} ∩ {b}: {len(overlap)}")
            if len(overlap) > 0:
                # Show at most the 20 smallest shared IDs as a debugging sample.
                print(sorted(overlap)[:20], "...")

train ∩ tuning: 31
[29862151292, 85275550336, 119515850170, 154114250099, 233566951376, 236591550323, 252079650966, 308943650489, 357161451323, 440126050081, 580271950622, 663653650209, 671073751005, 883921650892, 919730151187, 999841951213, 1098339850231, 1107969350995, 1107984051061, 1175490050051] ...
held_out ∩ train: 31
[46769150755, 198123350377, 296742651040, 307420250806, 352566250206, 479875450346, 512631750810, 531489850080, 565650551076, 575435950411, 577684650964, 607911150269, 653984850621, 659473751128, 766021751155, 826132950659, 859353250945, 862857051077, 924505250131, 1024391150422] ...
held_out ∩ tuning: 6
[699772750809, 1123054650125, 1153887650945, 1286978550840, 1575689850232, 1612772050559] ...


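## Trajectory length

Summarise the character and whitespace-token lengths of the text-format trajectories in each split.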

In [15]:
import numpy as np 
import pathlib
import sys
# Path().resolve() is the kernel's current working directory as an absolute path
# (normally the directory the notebook was launched from).
notebook_path = pathlib.Path().resolve()  # this is the directory Jupyter started in
print(notebook_path)
# parents[1] is two levels up. With the working directory at <repo>/src/resources (see the
# printed path) that is the repository root; running from elsewhere breaks this assumption.
repo_root = notebook_path.parents[1]      # move up from src/resources to repo root
# Put the repo root first on sys.path so the `src` package imported below can be found.
sys.path.insert(0, str(repo_root))
# NOTE(review): the recorded run of this cell failed with "No module named 'torch'" --
# presumably raised while importing this module; confirm torch is installed in the kernel.
from src.data.unified_dataset import UnifiedEHRDataset

/data/home/qc25022/CancEHR-Training/src/resources


ModuleNotFoundError: No module named 'torch'

In [3]:
def _summarise_lengths(values):
    """Return count, mean, 95th percentile and max for a list of lengths.

    An empty list yields ``count=0`` with NaN for the other statistics instead of
    raising (``max([])`` raises ``ValueError``).
    """
    if not values:
        nan = float("nan")
        return dict(count=0, mean=nan, p95=nan, max=nan)
    return dict(
        count=len(values),
        mean=np.mean(values),
        p95=np.percentile(values, 95),
        max=max(values),
    )


def describe_lengths(split, cutoff):
    """Summarise text lengths of every usable item in one dataset split.

    Builds a ``UnifiedEHRDataset`` in ``"text"`` format with no maximum sequence length,
    iterates over it, skips items that are ``None`` and measures each item's ``"text"``
    entry (assumed to be a ``str`` -- TODO confirm against the dataset class).

    Relies on the notebook-level names ``DATA_ROOT``, ``VOCAB``, ``LABELS``, ``MEDICAL``,
    ``LAB``, ``REGION`` and ``TIME`` being defined before the call.

    Args:
        split: Split name forwarded to the dataset as ``split``.
        cutoff: Value forwarded to the dataset as ``cutoff_months``.

    Returns:
        A ``(char_stats, token_stats)`` tuple of dicts with keys ``count``, ``mean``,
        ``p95`` and ``max``. If no usable item is found, ``count`` is 0 and the other
        values are NaN.
    """
    dataset = UnifiedEHRDataset(
        data_dir=DATA_ROOT,
        vocab_file=VOCAB,
        labels_file=LABELS,
        medical_lookup_file=MEDICAL,
        lab_lookup_file=LAB,
        region_lookup_file=REGION,
        time_lookup_file=TIME,
        cutoff_months=cutoff,
        format="text",
        split=split,
        max_sequence_length=None,
    )
    lengths_chars = []
    lengths_tokens = []
    for item in dataset:
        if item is None:
            continue
        text = item["text"]
        lengths_chars.append(len(text))
        # Crude whitespace word count; swap with tokenizer.encode if desired.
        lengths_tokens.append(len(text.split()))
    return _summarise_lengths(lengths_chars), _summarise_lengths(lengths_tokens)

# Print character-level and word-level length statistics for each split, passing 12 as
# the cutoff (forwarded to the dataset as cutoff_months).
for split in ("train", "tuning", "held_out"):
    stats_pair = describe_lengths(split, cutoff=12)
    char_stats, token_stats = stats_pair
    print(f"{split} char stats: {char_stats}")
    print(f"{split} token stats: {token_stats}")

NameError: name 'UnifiedEHRDataset' is not defined