## ISAAC - Data preparation

Run `0-isaac-load.ipynb` to:
- create stratified train/val/test splits
- subsample training sets
- construct and cache the CTCF PWM from JASPAR


In [1]:
# -------------------------------------------------
# Paper configuration: every tunable value of this
# notebook is defined in this cell.
# -------------------------------------------------

# Raw CSV files used in the paper.
# Source: Ghosh et al., IJMS 2024 (TF binding, ENCODE)
# https://doi.org/10.3390/ijms25094990
# Expands to Data_A549_random.csv, Data_GM12878_random.csv, Data_Hepg2_random.csv.
PAPER_DATASETS = [
    f"Data_{dataset_tag}_random.csv"
    for dataset_tag in ("A549", "GM12878", "Hepg2")
]

# Train/val/test splitting: seed plus test and validation sizes.
SPLIT_SEED = 0
TEST_SIZE, VAL_SIZE = 0.2, 0.1

# Subsampling of the training split: target size and its own seed.
TRAIN_N, SUBSAMPLE_SEED = 100_000, 0

# Regulatory prior: JASPAR matrix ID (CTCF) and the pseudocount used for the PWM.
PWM_ID, PWM_PSEUDOCOUNT = "MA0139.1", 0.5


In [2]:
from pathlib import Path
from src.data_pipeline import process_dataset, stratified_subsample, load_or_create_ctcf_pwm

# Data locations, relative to the working directory the notebook is run from:
# raw input CSVs, processed files, and per-dataset split directories.
RAW_DIR, PROC_DIR, SPLIT_DIR = (
    Path("data") / subdir for subdir in ("raw", "processed", "splits")
)
# File the CTCF PWM is stored in (kept alongside the processed data).
PWM_PATH = PROC_DIR.joinpath("ctcf_pwm.npy")


In [3]:
# ---- dataset processing ----
# Run process_dataset on every raw CSV in RAW_DIR that the paper uses; it
# writes a processed file to PROC_DIR and the splits to SPLIT_DIR.
raw_csv_paths = sorted(RAW_DIR.glob("Data_*_random.csv"))

# Report paper datasets that are absent from RAW_DIR. The loop below only
# iterates over files that exist, so without this check a missing dataset
# would be left out without any trace in the output.
found_names = {raw_csv.name for raw_csv in raw_csv_paths}
for missing_name in PAPER_DATASETS:
    if missing_name not in found_names:
        print(f"[WARN] {missing_name} listed in PAPER_DATASETS but not found in {RAW_DIR}")

for csv_path in raw_csv_paths:
    if csv_path.name not in PAPER_DATASETS:
        print(f"[SKIP] {csv_path.name} (not used in paper)")
        continue

    process_dataset(
        csv_path=csv_path,
        processed_dir=PROC_DIR,
        splits_dir=SPLIT_DIR,
        seed=SPLIT_SEED,
        test_size=TEST_SIZE,
        val_size=VAL_SIZE
    )

# ---- training subsampling ----
# For each entry in SPLIT_DIR, derive train_subsampled.csv from train.csv.
for split_dir in sorted(SPLIT_DIR.iterdir()):
    train_csv = split_dir / "train.csv"
    out_csv = split_dir / "train_subsampled.csv"

    # Skip entries without a training split and entries already subsampled.
    # NOTE: an existing train_subsampled.csv is reused even if TRAIN_N or
    # SUBSAMPLE_SEED changed since it was written; delete it to regenerate.
    if not train_csv.exists() or out_csv.exists():
        continue

    props = stratified_subsample(
        train_csv=train_csv,
        out_csv=out_csv,
        train_n=TRAIN_N,
        seed=SUBSAMPLE_SEED,
    )

    # Record the settings and the returned label proportions next to the
    # subsample. The encoding is explicit so the file does not depend on the
    # platform's default text encoding.
    with open(split_dir / "subsample_info.txt", "w", encoding="utf-8") as f:
        f.write(
            f"TRAIN_N={TRAIN_N}\n"
            f"SEED={SUBSAMPLE_SEED}\n"
            f"label_proportions={props}\n"
        )


[DATA] Processing dataset: Data_A549
[DATA] N samples (raw): 500000
[DATA] Sequence length: 576
[DATA] Label distribution:
[DATA] label
0    0.502376
1    0.497624
[DATA] Saved processed file: data\processed\Data_A549_full_sequences.csv
[DATA] Saved splits in: data\splits\Data_A549
[DATA] Processing dataset: Data_GM12878
[DATA] N samples (raw): 500000
[DATA] [WARN] Dropping 2 invalid sequences (4.00e-06)
[DATA] Sequence length: 576
[DATA] Label distribution:
[DATA] label
0    0.50038
1    0.49962
[DATA] Saved processed file: data\processed\Data_GM12878_full_sequences.csv
[DATA] Saved splits in: data\splits\Data_GM12878
[DATA] Processing dataset: Data_Hepg2
[DATA] N samples (raw): 500000
[DATA] Sequence length: 576
[DATA] Label distribution:
[DATA] label
0    0.502942
1    0.497058
[DATA] Saved processed file: data\processed\Data_Hepg2_full_sequences.csv
[DATA] Saved splits in: data\splits\Data_Hepg2


In [4]:
# Obtain the CTCF position weight matrix for the JASPAR profile PWM_ID, with
# PWM_PATH as the cache file and PWM_PSEUDOCOUNT as the pseudocount.
# NOTE(review): presumably an existing file at PWM_PATH is loaded instead of
# being rebuilt — confirm in src.data_pipeline.load_or_create_ctcf_pwm.
PWM = load_or_create_ctcf_pwm(
    out_path=PWM_PATH,
    jaspar_id=PWM_ID,
    pseudocount=PWM_PSEUDOCOUNT,
)


In [5]:
# Run summary: number of configured paper datasets, the training subsample
# size, and the second dimension of the PWM array (reported as its length).
print(
    f"Datasets processed: {len(PAPER_DATASETS)}",
    f"Train subsample size: {TRAIN_N}",
    f"PWM length: {PWM.shape[1]}",
    sep="\n",
)


Datasets processed: 3
Train subsample size: 100000
PWM length: 19
