## ISAAC – Model Training Pipeline

Run `1-train-models.ipynb` to:

- Train DeepBind, BPNet, and DeepSEA models on TF binding datasets
- Save trained model checkpoints and training metrics
- Reproduce the predictive models audited in the paper


In [1]:
import json
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
from tqdm.auto import tqdm

from src.training.model_factory import build_model
from src.training.datasets import DNACNNDataset
from src.training.loops import train_epoch_cnn, freeze_model
from src.training.utils import set_seed, make_loader

# Run on a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device:", DEVICE)


Using device: cpu


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment grid: one model is trained per (dataset, model, seed) triple.
DATASETS = [
    "Data_A549",
    "Data_GM12878",
    "Data_Hepg2",
]
MODELS = [
    "DeepBind",
    "BPNet",
    "DeepSEA",
]

# Epoch budget per architecture; all three currently share the same value.
EPOCHS = {name: 20 for name in MODELS}

# Five independent random seeds (0..4) per configuration.
SEEDS = list(range(5))

# Optimisation hyper-parameters shared by every run.
BATCH_SIZE = 64
LR = 0.001


In [3]:
# Root folder holding one sub-directory of CSV splits per dataset.
DATA_DIR = Path("data", "splits")

# Root folder for checkpoints and metrics; created up front (including any
# missing parents) and left untouched if it already exists.
RESULTS_DIR = Path("results") / "tf_gene" / "training"
RESULTS_DIR.mkdir(exist_ok=True, parents=True)


In [None]:
# ============================================================
# FULL TRAINING LOOP (used to produce paper results)
# ============================================================
#
# For every (dataset, model, seed) combination this cell trains one model
# on the dataset's `train_subsampled.csv` split and writes two artifacts to
#     RESULTS_DIR / <dataset> / <model> / seed_<seed> /
#   - metrics.json : run metadata plus the per-epoch training-loss curve
#   - model.pt     : the trained model's state_dict
# A run whose model.pt already exists is skipped, so the cell can be
# re-executed to resume an interrupted sweep.
#
# Raises:
#   FileNotFoundError: a dataset's split directory or training CSV is missing.
#   ValueError: a training CSV contains no rows.

for dataset in DATASETS:

    split_dir = DATA_DIR / dataset
    # Explicit exceptions instead of `assert`: asserts are stripped when
    # Python runs with -O, which would let a missing path slip through.
    if not split_dir.exists():
        raise FileNotFoundError(f"Missing split dir: {split_dir}")

    train_path = split_dir / "train_subsampled.csv"
    if not train_path.exists():
        raise FileNotFoundError(f"Missing {train_path.name} in {split_dir}")

    train_df = pd.read_csv(train_path)
    if train_df.empty:
        # Fail with a clear message rather than an IndexError on .iloc[0].
        raise ValueError(f"No training samples in {train_path}")

    print("\n===================================")
    print(f" DATASET: {dataset}")
    print("===================================")
    print(f"Training samples: {len(train_df)}")

    # The model input length is taken from the first sequence.
    # NOTE(review): assumes every row of `sequence_full` has this same
    # length -- confirm against the split-generation code.
    input_len = len(train_df["sequence_full"].iloc[0])

    for model_name in MODELS:

        n_epochs = EPOCHS[model_name]

        for seed in SEEDS:

            # Seed before building the loader and the model so that
            # everything created below is covered by the seed.
            set_seed(seed)

            out_dir = RESULTS_DIR / dataset / model_name / f"seed_{seed}"
            out_dir.mkdir(parents=True, exist_ok=True)

            model_path = out_dir / "model.pt"

            # model.pt is the last file written for a run (see below), so
            # its presence marks the run as complete.
            if model_path.exists():
                print(
                    f"[SKIP] {dataset} | {model_name} | seed={seed} "
                    "(already trained)"
                )
                continue

            # --------------------------------------------------
            # Run header
            # --------------------------------------------------
            print(
                f"\n[TRAIN] {dataset} | {model_name} | seed={seed} "
                f"| epochs={n_epochs}"
            )

            # --------------------------------------------------
            # Dataset & loader
            # --------------------------------------------------
            ds = DNACNNDataset(train_df, view="sequence_full")
            loader = make_loader(ds, BATCH_SIZE)

            # --------------------------------------------------
            # Model, optimizer, loss
            # --------------------------------------------------
            model = build_model(
                model_name=model_name,
                input_length=input_len,
            ).to(DEVICE)

            opt = torch.optim.Adam(model.parameters(), lr=LR)
            crit = nn.BCEWithLogitsLoss()

            losses = []

            # --------------------------------------------------
            # Training epochs
            # --------------------------------------------------
            for _ in tqdm(
                range(n_epochs),
                desc=f"{model_name} | seed={seed}",
                leave=False,
            ):
                loss = train_epoch_cnn(
                    model, loader, opt, crit, DEVICE
                )
                # json can only serialise plain Python numbers; float() is a
                # no-op for a Python float and unwraps a scalar tensor or
                # NumPy value.
                # NOTE(review): assumes train_epoch_cnn returns one scalar
                # loss per epoch -- confirm in src.training.loops.
                losses.append(float(loss))

            # --------------------------------------------------
            # Save metrics
            # --------------------------------------------------
            # Written BEFORE the checkpoint: if this step fails, model.pt
            # does not exist yet, so a re-run retrains instead of skipping
            # a run that has no metrics.
            metrics = {
                "dataset": dataset,
                "model": model_name,
                "seed": seed,
                "epochs": n_epochs,
                "train_size": len(train_df),
                "final_train_loss": losses[-1],
                "train_loss_curve": losses,
            }

            with open(out_dir / "metrics.json", "w", encoding="utf-8") as f:
                json.dump(metrics, f, indent=2)

            # --------------------------------------------------
            # Freeze & save
            # --------------------------------------------------
            # freeze_model is a project helper applied before the weights
            # are written; only the state_dict is stored, not the module.
            freeze_model(model)
            torch.save(model.state_dict(), model_path)

            print(
                f"[DONE ] {dataset} | {model_name} | seed={seed} "
                f"| final loss={losses[-1]:.4f}"
            )

print("\nAll models trained.")


#### Execution policy

The training loop is not executed interactively due to its
computational cost.

This notebook documents the training procedure used in the paper and is
provided for transparency and reproducibility.
