## ISAAC - Audit pipeline

Run `2-isaac-audit.ipynb` to:

- Filter models by AUROC
- Run robust ISAAC audit with repeated balanced subsampling
- Save raw ΔS and paper-ready tables and figures


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import product
from tqdm.auto import tqdm

from src.audit.isaac_core import *
from src.audit.isaac_reporting import *
from src.audit.isaac_pipeline_utils import *


In [None]:
# --------------------------------------------------
# Audit configuration
# --------------------------------------------------
# Imported explicitly: without this line `torch` is only available if one of
# the `from src.audit... import *` wildcard imports happens to re-export it.
import torch

# Inference device: CUDA when available, CPU otherwise.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Grid of runs to audit: every (dataset, model, seed) combination.
DATASETS = ["Data_A549", "Data_GM12878", "Data_Hepg2"]
MODELS   = ["DeepBind", "BPNet", "DeepSEA"]
SEEDS    = [0, 1, 2, 3, 4]

# Checkpoints whose AUROC on the audit set is below this value are recorded
# but not audited.
AUROC_MIN  = 0.6
# NOTE(review): FILTER_MODE is not read by any cell visible in this notebook;
# the main loop compares each seed's AUROC to AUROC_MIN individually.
# Confirm whether "mean" is consumed elsewhere or is a leftover.
FILTER_MODE = "mean"

# Settings forwarded to the AUROC computation and to `isaac_audit`.
ROBUST = {
    "subsample_size": 5000,   # size of each balanced subsample
    "n_iterations": 10,       # number of repeated subsampling iterations
    "batch_size": 128,        # batch size for model inference only
}

# Seed passed to `create_and_save_audit_set` when building the audit set.
SEED = 0
# Second argument of `compute_ic_core(PWM, K_IC)`.
K_IC = 4

# Input and output locations. Only the audit output directory is created
# here; the other two are expected to exist already.
DATA_DIR  = Path("data/splits")
TRAIN_DIR = Path("results/tf_gene/training")
AUDIT_DIR = Path("results/tf_gene/audit")
AUDIT_DIR.mkdir(parents=True, exist_ok=True)

# Position weight matrix stored as a NumPy array (read with `np.load`).
PWM_PATH = "data/processed/ctcf_pwm.npy"


In [None]:
from itertools import product
from tqdm.auto import tqdm

# ==================================================
# STORAGE
# ==================================================
# One accumulator per output CSV written at the end of this cell.
all_results   = []   # rows produced by extract_isaac_results
all_resampling = []  # rows produced by extract_resampling_distributions
all_auroc     = []   # one AUROC record per checkpoint that was found

# ==================================================
# WORKLIST: one progress bar = one clear status line
# ==================================================
# `product` varies its last argument fastest, so all (model, seed) runs of a
# given dataset are consecutive in the worklist.
work = list(product(DATASETS, MODELS, SEEDS))

pbar = tqdm(
    work,
    desc="ISAAC audit",
    unit="run",
    dynamic_ncols=True,
)

# Name of the dataset whose audit set currently sits in CURRENT_AUDIT.
# Tracking it explicitly keeps the cache refresh independent of which model
# or seed happens to come first in MODELS / SEEDS.
loaded_dataset = None

# ==================================================
# MAIN LOOP
# ==================================================
for dataset, model, seed in pbar:

    pbar.set_postfix({
        "dataset": dataset,
        "model": model,
        "seed": seed,
    })

    # --------------------------------------------------
    # LOAD DATASET (once per dataset)
    # --------------------------------------------------
    if dataset != loaded_dataset:
        df = pd.read_csv(DATA_DIR / dataset / "test.csv")
        sequences_full = df.sequence_full.tolist()
        labels_full    = df.label.values

        audit_csv = AUDIT_DIR / f"audit_set_{dataset}.csv"

        # Request as many samples as the test split contains; the audit set
        # is also written to `audit_csv`.
        audit_seqs, audit_labels = create_and_save_audit_set(
            sequences_full,
            labels_full,
            n_samples=len(sequences_full),
            seed=SEED,
            out_path=audit_csv,
        )

        # structural prior + interventions
        PWM = np.load(PWM_PATH)
        prior = RegulatoryStructuralPrior(
            pwm=PWM,
            ic_core=compute_ic_core(PWM, K_IC)
        )
        mech_fn, spur_fn = build_interventions(prior, use_deterministic=True)

        # cache shared by every (model, seed) run of this dataset
        CURRENT_AUDIT = {
            "seqs": audit_seqs,
            "labels": audit_labels,
            "PWM": PWM,
            "mech_fn": mech_fn,
            "spur_fn": spur_fn,
        }
        loaded_dataset = dataset

    # --------------------------------------------------
    # LOAD MODEL
    # --------------------------------------------------
    model_path = TRAIN_DIR / dataset / model / f"seed_{seed}" / "model.pt"
    if not model_path.exists():
        # Report the skip instead of dropping the run silently; tqdm.write
        # prints without breaking the progress bar.
        tqdm.write(f"[skip] checkpoint not found: {model_path}")
        continue

    # Third argument is the length of the first audit sequence
    # (presumably the model's input length; confirm against the helper).
    logits_fn = load_model_and_logits(
        model,
        model_path,
        len(CURRENT_AUDIT["seqs"][0]),
        DEVICE,
    )

    # --------------------------------------------------
    # AUROC
    # --------------------------------------------------
    auroc = compute_auroc(
        CURRENT_AUDIT["seqs"],
        CURRENT_AUDIT["labels"],
        logits_fn,
        ROBUST["batch_size"],
    )

    # Recorded for every checkpoint found, including those filtered out below.
    all_auroc.append({
        "dataset": dataset,
        "model": model,
        "seed": seed,
        "auroc": auroc,
    })

    # --------------------------------------------------
    # FILTER (per-seed)
    # --------------------------------------------------
    # NOTE(review): FILTER_MODE is not consulted here; each seed is compared
    # to AUROC_MIN on its own.
    if auroc < AUROC_MIN:
        continue

    # --------------------------------------------------
    # ROBUST ISAAC AUDIT
    # --------------------------------------------------
    # The training seed of the checkpoint is also passed as the audit seed.
    results = isaac_audit(
        sequences=CURRENT_AUDIT["seqs"],
        labels=CURRENT_AUDIT["labels"],
        model_logits_batch=logits_fn,
        mech_interventions=CURRENT_AUDIT["mech_fn"],
        spur_interventions=CURRENT_AUDIT["spur_fn"],
        pwm=CURRENT_AUDIT["PWM"],
        subsample_size=ROBUST["subsample_size"],
        n_iterations=ROBUST["n_iterations"],
        batch_size=ROBUST["batch_size"],
        seed=seed,
        return_distributions=True,
    )

    # --------------------------------------------------
    # COLLECT RESULTS
    # --------------------------------------------------
    all_results.extend(
        extract_isaac_results(
            results,
            dataset,
            model,
            seed,
            auroc,
            ROBUST,
        )
    )

    all_resampling.extend(
        extract_resampling_distributions(
            results,
            dataset,
            model,
            seed,
        )
    )

# ==================================================
# SAVE
# ==================================================
if not all_results:
    # Nothing was audited (no checkpoint found, or none passed the AUROC
    # filter): the result tables below will be written without any rows.
    print("WARNING: no ISAAC results were collected; output tables are empty.")

pd.DataFrame(all_results).to_csv(
    AUDIT_DIR / "isaac_results_ALL.csv",
    index=False,
)

pd.DataFrame(all_auroc).to_csv(
    AUDIT_DIR / "auroc_ALL.csv",
    index=False,
)

pd.DataFrame(all_resampling).to_csv(
    AUDIT_DIR / "isaac_resampling_ALL.csv",
    index=False,
)

print("ISAAC audit completed cleanly.")


In [None]:
# Reload the three tables written by the audit cell so that the reporting
# cells below work from the files on disk.
df_results, df_resamp, df_auroc = (
    pd.read_csv(AUDIT_DIR / csv_name)
    for csv_name in (
        "isaac_results_ALL.csv",
        "isaac_resampling_ALL.csv",
        "auroc_ALL.csv",
    )
)


In [None]:
# Plot the ISAAC resampling results for the A549 dataset, using the
# resampling table reloaded from disk.
# NOTE(review): no cell visible in this notebook creates the `figures/`
# directory; confirm that plot_isaac creates it, or create it beforehand.
plot_isaac(
    df_resampling=df_resamp,
    dataset="Data_A549",
    savepath="figures/isaac_A549"
)


In [None]:
# Build the aggregated ISAAC table from the reloaded results. The bare
# expression on the last line makes the notebook display the table.
isaac_table = make_isaac_table_aggregated(df_results)
isaac_table


#### Execution policy

The ISAAC audit requires repeated large-scale model inference and
label-balanced repeated subsampling, and is therefore not intended for
routine interactive execution.

This notebook documents the auditing procedure used in the paper and is
provided for transparency and reproducibility.
