# 02_classical_baselines.ipynb — SVMs

This notebook runs classical SVM baselines on the prepared encodings. The BLAS/perf cell must be kept as-is and run first.

# Cell 0 — perf env (keep as-is)

In [1]:
# Standardize BLAS thread usage to reduce run-to-run variability.
# setdefault only fills in a variable that is not already present in the
# environment, so a value exported by the launching shell is left untouched.
# NOTE(review): presumably this has to execute before numpy/scikit-learn are
# imported for the thread limits to take effect — confirm; the notebook intro
# asks for this cell to be run first.
import os
os.environ.setdefault("OMP_NUM_THREADS", "8")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "8")
os.environ.setdefault("MKL_NUM_THREADS", "8")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")
# Echo two of the effective settings (OMP and OpenBLAS) as a quick sanity check.
print("BLAS:", os.environ.get("OMP_NUM_THREADS"), os.environ.get("OPENBLAS_NUM_THREADS"))

BLAS: 8 8


# Cell 1 — load data (robust, multi-dataset aware + journaling)

In [2]:
# Load encoded sequence representations and train/val/test indices
from pathlib import Path
import json, warnings, time
import numpy as np, pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    confusion_matrix, classification_report, balanced_accuracy_score,
    matthews_corrcoef, average_precision_score
)
import matplotlib.pyplot as plt

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Input root (prepared encodings/splits) and output root (metrics, plots, logs),
# both relative to the current working directory.
PROCESSED = Path("data/processed")
RESULTS = Path("results")

# Make sure each output sub-folder exists; already-existing folders are fine.
RESULTS.joinpath("metrics").mkdir(parents=True, exist_ok=True)
RESULTS.joinpath("plots").mkdir(parents=True, exist_ok=True)
RESULTS.joinpath("logs").mkdir(parents=True, exist_ok=True)

# ---- Run journal (for documentation) ----
class RunJournal:
    """In-memory event log for a notebook run, exportable as Markdown and JSON.

    Each call to :meth:`log` appends one dict to ``self.events`` and echoes a
    one-line summary to stdout. :meth:`save` writes the collected events to
    ``<base>.md`` (a four-column table) and ``<base>.json`` (all fields).
    """

    # Console marker per status; any status other than "ok"/"warn" is shown
    # with the failure marker.
    _SYMBOLS = {"ok": "✅", "warn": "⚠️"}
    _FAIL_SYMBOL = "❌"

    def __init__(self):
        self.events = []

    def log(self, step, status, message, **extras):
        """Record one event and print it.

        Args:
            step: Short label of the pipeline stage (e.g. ``"load"``).
            status: ``"ok"``, ``"warn"``, or anything else (treated as failure
                for display purposes).
            message: Human-readable description.
            **extras: Additional key/value pairs stored with the event. They are
                merged last, so an extra named like a core field overrides it.
        """
        self.events.append({
            "ts": time.strftime("%Y-%m-%d %H:%M:%S"),
            "step": step, "status": status, "message": message, **extras
        })
        sym = self._SYMBOLS.get(status, self._FAIL_SYMBOL)
        print(f"{sym} [{step}] {message}")

    def df(self):
        """Return all recorded events as a DataFrame (one row per event)."""
        return pd.DataFrame(self.events)

    @staticmethod
    def _md_cell(value):
        """Render a value so it cannot break out of a Markdown table cell."""
        text = str(value).replace("\r\n", "\n").replace("\r", "\n")
        # A raw pipe would start a new column and a raw newline would end the row.
        return text.replace("|", "\\|").replace("\n", " ")

    def save(self, base: Path):
        """Write the journal next to ``base`` as ``.md`` and ``.json`` files.

        Args:
            base: Path without extension; the output names are derived with
                ``Path.with_suffix``.
        """
        md_path = base.with_suffix(".md")
        json_path = base.with_suffix(".json")

        md = ["| ts | step | status | message |", "|---|---|---|---|"]
        for event in self.events:
            cells = (self._md_cell(event[key]) for key in ("ts", "step", "status", "message"))
            md.append("| " + " | ".join(cells) + " |")
        md_path.write_text("\n".join(md), encoding="utf-8")

        # The JSON export keeps every field, including any extras passed to log().
        json_path.write_text(self.df().to_json(orient="records", indent=2), encoding="utf-8")
        print(f"📝 Saved journal:\n  - {md_path}\n  - {json_path}")

# Journal shared by every later cell in this notebook.
J = RunJournal()

# ---- Resolve artifact filenames (new multi-dataset first; fallback to original) ----
enc_candidates = [PROCESSED/"encodings_all.npz", PROCESSED/"encodings.npz"]
spl_candidates = [PROCESSED/"splits_pooled.json", PROCESSED/"splits.json"]

# First existing candidate wins; None means nothing usable was found.
enc_path = next((p for p in enc_candidates if p.exists()), None)
spl_path = next((p for p in spl_candidates if p.exists()), None)

if enc_path is None or spl_path is None:
    # Journal every missing artifact before aborting. The "tried" list is derived
    # from the candidate lists so the message cannot drift out of sync with them.
    if enc_path is None:
        J.log("load", "fail", f"Encodings file not found (tried {', '.join(p.name for p in enc_candidates)})")
    if spl_path is None:
        J.log("load", "fail", f"Splits file not found (tried {', '.join(p.name for p in spl_candidates)})")
    raise FileNotFoundError(f"Required data artifacts missing in {PROCESSED.as_posix()}")

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the .npz comes from an untrusted source. Kept because
# the archive's contents are not visible here; switch to allow_pickle=False if it
# only holds plain numeric arrays.
data = np.load(enc_path, allow_pickle=True)
# Explicit encoding: the platform default is not UTF-8 everywhere (e.g. Windows).
with open(spl_path, encoding="utf-8") as f:
    SPL = json.load(f)

J.log("load", "ok", f"Loaded encodings from {enc_path.name} and splits from {spl_path.name}")

# Arrays: labels plus the two feature encodings stored in the archive.
y = data["y"]
X_kmer   = data["kmer"]
X_onehot = data["onehot"]

# Optional fields (multi-dataset): ds_idx is only present in multi-dataset
# archives; it is used later to index per-sample dataset membership.
ds_idx = data["ds_idx"] if "ds_idx" in data.files else None
dataset_index_csv = PROCESSED/"dataset_index.csv"
ds_map = None  # ds_idx value -> accession; stays None when no mapping file exists
if ds_idx is not None and dataset_index_csv.exists():
    ds_map = pd.read_csv(dataset_index_csv).set_index("ds_idx")["accession"].to_dict()
    # np.unique counts distinct ids without relying on the elements being hashable.
    J.log("datasets", "ok", f"Detected {len(np.unique(ds_idx))} dataset(s) with mapping from dataset_index.csv")
elif ds_idx is not None:
    J.log("datasets", "warn", "ds_idx present but dataset_index.csv missing — per-dataset names unavailable")

# Splits: the train/val/test entries of the splits file, as arrays used to index X and y.
tr, va, te = (np.array(SPL[key]) for key in ("train", "val", "test"))
# Positive rate over ALL samples (not per split); NaN when there are no labels.
pos_rate = float(y.mean()) if len(y) else float("nan")
print("Split sizes:", len(tr), len(va), len(te), "| pos rate:", pos_rate)
J.log("splits", "ok", f"train={len(tr)}, val={len(va)}, test={len(te)}, pos_rate={pos_rate:.4f}")

‚úÖ [load] Loaded encodings from encodings_all.npz and splits from splits_pooled.json
‚úÖ [datasets] Detected 13 dataset(s) with mapping from dataset_index.csv
Split sizes: 12336 4112 4112 | pos rate: 0.8654182879377432
‚úÖ [splits] train=12336, val=4112, test=4112, pos_rate=0.8654


# Cell 2 — evaluation helpers (full metrics + logging + saves)

In [3]:
# --- Utility: compute extended metrics from predictions ---
def extended_metrics(y_true, y_prob, thr):
    """Score probabilistic binary predictions at a fixed decision threshold.

    Args:
        y_true: Ground-truth labels (0/1).
        y_prob: Positive-class scores; must support ``>=`` and ``.astype``
            (i.e. a numpy array).
        thr: Scores greater than or equal to this value are predicted as 1.

    Returns:
        A ``(metrics, cm, report)`` tuple: a flat dict of scalar metrics, the
        2x2 confusion matrix with rows/columns ordered ``[0, 1]``, and the
        dict form of sklearn's classification report.
    """
    def _ranking_score(scorer):
        # Ranking metrics may fail (e.g. a single class in y_true); report NaN then.
        try:
            return scorer(y_true, y_prob)
        except Exception:
            return float("nan")

    predicted = (y_prob >= thr).astype(int)

    accuracy = accuracy_score(y_true, predicted)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, predicted, average="binary", zero_division=0
    )
    roc = _ranking_score(roc_auc_score)
    pr = _ranking_score(average_precision_score)  # PR-AUC

    matrix = confusion_matrix(y_true, predicted, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()
    negatives = tn + fp
    if negatives:
        specificity = tn / negatives
    else:
        specificity = float("nan")

    balanced = balanced_accuracy_score(y_true, predicted)
    # MCC is only reported when both classes occur in the ground truth.
    if len(np.unique(y_true)) == 2:
        mcc_value = matthews_corrcoef(y_true, predicted)
    else:
        mcc_value = float("nan")
    report = classification_report(y_true, predicted, output_dict=True, zero_division=0)

    summary = {
        "acc": accuracy, "prec": precision, "rec": recall, "f1": f_score,
        "roc_auc": roc, "pr_auc": pr, "specificity": specificity,
        "balanced_acc": balanced, "mcc": mcc_value, "thr": thr,
        "tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn),
        "support": int(len(y_true)),
    }
    return summary, matrix, report

# --- Threshold search on validation to maximize F1 ---
def choose_threshold(y_val, p_val, name="model", grid=None):
    """Pick the decision threshold that maximizes F1 on the validation split.

    Args:
        y_val: Validation labels (0/1).
        p_val: Positive-class scores for the validation samples.
        name: Model label used in the journal messages.
        grid: Optional iterable of candidate thresholds. Defaults to 37 evenly
            spaced values from 0.05 to 0.95.

    Returns:
        The first (lowest) threshold on the grid that reaches the best F1, or
        0.5 when no candidate achieves a positive F1.
    """
    # Local import: f1_score is not among the notebook's top-level imports.
    from sklearn.metrics import f1_score

    if grid is None:
        grid = np.linspace(0.05, 0.95, 37)
    p_val = np.asarray(p_val)

    best_thr, best_f1 = 0.5, -1.0
    for t in grid:
        f1 = f1_score(y_val, (p_val >= t).astype(int), zero_division=0)
        # Strict ">" keeps the earliest threshold among ties.
        if f1 > best_f1:
            best_f1, best_thr = float(f1), float(t)

    # With zero_division=0 an undefined F1 is reported as 0 rather than NaN, so a
    # NaN check alone can never fire. Treat "nothing scored above 0" (which also
    # covers an empty grid and NaN) as the degenerate case.
    if not best_f1 > 0:
        J.log("threshold", "warn", f"{name}: F1 undefined or zero on val; using default thr=0.5")
        return 0.5

    J.log("threshold", "ok", f"{name}: selected thr={best_thr:.2f} (val F1={best_f1:.3f})")
    return best_thr

# --- Plotting helpers (1 figure per chart) ---
def plot_roc(y_true, y_prob, title, out_png):
    """Save a ROC curve for the given scores as a PNG (best effort).

    Any failure is journaled as a warning instead of raised, so a plotting
    problem never aborts the run.

    Args:
        y_true: Ground-truth labels.
        y_prob: Positive-class scores.
        title: Figure title.
        out_png: Destination path of the image.
    """
    from sklearn.metrics import roc_curve, auc
    fig = None
    try:
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
        ax.plot([0, 1], [0, 1], linestyle="--")  # chance diagonal
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_title(title)
        ax.legend(loc="lower right")
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    except Exception as e:
        J.log("plot", "warn", f"ROC plot skipped: {e}")
    finally:
        # Close even when drawing or saving failed, so no figure is left open.
        if fig is not None:
            plt.close(fig)

def plot_pr(y_true, y_prob, title, out_png):
    """Save a precision-recall curve for the given scores as a PNG (best effort).

    Any failure is journaled as a warning instead of raised, so a plotting
    problem never aborts the run.

    Args:
        y_true: Ground-truth labels.
        y_prob: Positive-class scores.
        title: Figure title.
        out_png: Destination path of the image.
    """
    from sklearn.metrics import precision_recall_curve, average_precision_score
    fig = None
    try:
        prec, rec, _ = precision_recall_curve(y_true, y_prob)
        ap = average_precision_score(y_true, y_prob)
        fig, ax = plt.subplots()
        ax.plot(rec, prec, label=f"AP={ap:.3f}")
        ax.set_xlabel("Recall")
        ax.set_ylabel("Precision")
        ax.set_title(title)
        ax.legend(loc="lower left")
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    except Exception as e:
        J.log("plot", "warn", f"PR plot skipped: {e}")
    finally:
        # Close even when drawing or saving failed, so no figure is left open.
        if fig is not None:
            plt.close(fig)

def save_cm_csv(cm, out_csv, normalized=False):
    """Write a 2x2 confusion matrix to CSV with labelled rows and columns.

    Args:
        cm: 2x2 array, rows = true class, columns = predicted class.
        out_csv: Destination path.
        normalized: When True, each row is divided by its sum so entries become
            per-true-class fractions; a row summing to zero stays all zeros.
    """
    table = cm
    if normalized:
        table = cm.astype(np.float64)
        totals = table.sum(axis=1, keepdims=True)
        # Replace empty-row totals with 1 so the division is a no-op there.
        divisors = np.where(totals == 0, 1, totals)
        table = table / divisors
    frame = pd.DataFrame(
        table,
        index=["true_0", "true_1"],
        columns=["pred_0", "pred_1"],
    )
    frame.to_csv(out_csv, index=True)

# Cell 3 — SVM (k-mer) with full documentation outputs

In [4]:
# Radial basis SVM on normalized k-mer frequency vectors
model_name = "SVM_kmer"
svm_kmer = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    SVC(C=5.0, kernel="rbf", gamma="scale", probability=True, class_weight="balanced", random_state=0)
)

# Features/labels per split, sliced once and reused for fitting and scoring.
split_data = {"train": (X_kmer[tr], y[tr]), "val": (X_kmer[va], y[va]), "test": (X_kmer[te], y[te])}

J.log("fit", "ok", f"{model_name}: fitting on train (k-mer, n={len(tr)})")
svm_kmer.fit(*split_data["train"])

# Score every split exactly once; the threshold search, the metrics and the
# plots below all reuse these probabilities instead of calling predict_proba again.
split_probs = {split: svm_kmer.predict_proba(Xsplit)[:, 1] for split, (Xsplit, _) in split_data.items()}

# Threshold selection on validation
p_val = split_probs["val"]
thr = choose_threshold(y[va], p_val, name=model_name)

# Evaluate splits (all at the validation-selected threshold)
rows, reports = [], {}
cms = {}
for split, (_, ysplit) in split_data.items():
    m, cm, rep = extended_metrics(ysplit, split_probs[split], thr)
    m.update({"model": model_name, "split": split})
    rows.append(m); cms[split] = cm; reports[split] = rep

df_metrics = pd.DataFrame(rows)
df_metrics.to_csv(RESULTS / "metrics/svm_kmer_metrics.csv", index=False)

# Save confusion matrices (raw + normalized)
for split, cm in cms.items():
    save_cm_csv(cm, RESULTS / f"metrics/svm_kmer_cm_{split}.csv", normalized=False)
    save_cm_csv(cm, RESULTS / f"metrics/svm_kmer_cm_{split}_norm.csv", normalized=True)

# Save classification reports (per split)
with open(RESULTS / "metrics/svm_kmer_classification_reports.json", "w", encoding="utf-8") as f:
    json.dump(reports, f, indent=2)

# Plots (test split)
plot_roc(y[te], split_probs["test"], f"{model_name} — ROC (test)", RESULTS / "plots/svm_kmer_roc_test.png")
plot_pr (y[te], split_probs["test"], f"{model_name} — PR (test)",  RESULTS / "plots/svm_kmer_pr_test.png")

# Console + journal summary (avoid backslashes in f-strings)
row_test = df_metrics.loc[df_metrics["split"] == "test"].iloc[0]
print(row_test.to_string())

J.log(
    "eval", "ok",
    f"{model_name}: test F1={row_test['f1']:.3f}, "
    f"AUC={row_test['roc_auc']:.3f}, "
    f"PR-AUC={row_test['pr_auc']:.3f}, thr={thr:.2f}"
)

‚úÖ [fit] SVM_kmer: fitting on train (k-mer, n=12336)
‚úÖ [threshold] SVM_kmer: selected thr=0.40 (val F1=0.969)
acc             0.916829
prec            0.922956
rec             0.986232
f1              0.953545
roc_auc         0.801506
pr_auc          0.938734
specificity     0.470163
balanced_acc    0.728197
mcc             0.590643
thr                  0.4
tp                  3510
tn                   260
fp                   293
fn                    49
support             4112
model           SVM_kmer
split               test
‚úÖ [eval] SVM_kmer: test F1=0.954, AUC=0.802, PR-AUC=0.939, thr=0.40


# Cell 4 — SVM (one-hot flattened) with full documentation outputs

In [5]:
# RBF SVM on flattened one-hot sequence encoding (high-dimensional)
model_name = "SVM_onehot"
# Collapse every per-sample one-hot array into a single float32 feature row.
X_flat = X_onehot.reshape(len(X_onehot), -1).astype(np.float32)

svm_1hot = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    SVC(C=2.0, kernel="rbf", gamma="scale", probability=True, class_weight="balanced", random_state=0)
)

# Features/labels per split, sliced once and reused for fitting and scoring.
split_data = {"train": (X_flat[tr], y[tr]), "val": (X_flat[va], y[va]), "test": (X_flat[te], y[te])}

J.log("fit", "ok", f"{model_name}: fitting on train (one-hot flat, n={len(tr)}, d={X_flat.shape[1]})")
svm_1hot.fit(*split_data["train"])

# Score every split exactly once; the threshold search, the metrics and the
# plots below all reuse these probabilities instead of calling predict_proba again.
split_probs = {split: svm_1hot.predict_proba(Xsplit)[:, 1] for split, (Xsplit, _) in split_data.items()}

# Threshold selection on validation
p_val = split_probs["val"]
thr = choose_threshold(y[va], p_val, name=model_name)

# Evaluate splits (all at the validation-selected threshold)
rows, reports = [], {}
cms = {}
for split, (_, ysplit) in split_data.items():
    m, cm, rep = extended_metrics(ysplit, split_probs[split], thr)
    m.update({"model": model_name, "split": split})
    rows.append(m); cms[split] = cm; reports[split] = rep

df_metrics = pd.DataFrame(rows)
df_metrics.to_csv(RESULTS / "metrics/svm_onehot_metrics.csv", index=False)

# Save confusion matrices (raw + normalized)
for split, cm in cms.items():
    save_cm_csv(cm, RESULTS / f"metrics/svm_onehot_cm_{split}.csv", normalized=False)
    save_cm_csv(cm, RESULTS / f"metrics/svm_onehot_cm_{split}_norm.csv", normalized=True)

# Save classification reports (per split)
with open(RESULTS / "metrics/svm_onehot_classification_reports.json", "w", encoding="utf-8") as f:
    json.dump(reports, f, indent=2)

# Plots (test split)
plot_roc(y[te], split_probs["test"], f"{model_name} — ROC (test)", RESULTS / "plots/svm_onehot_roc_test.png")
plot_pr (y[te], split_probs["test"], f"{model_name} — PR (test)",  RESULTS / "plots/svm_onehot_pr_test.png")

# Console + journal summary (avoid backslashes in f-strings)
row_test = df_metrics.loc[df_metrics["split"] == "test"].iloc[0]
print(row_test.to_string())

J.log(
    "eval", "ok",
    f"{model_name}: test F1={row_test['f1']:.3f}, "
    f"AUC={row_test['roc_auc']:.3f}, "
    f"PR-AUC={row_test['pr_auc']:.3f}, thr={thr:.2f}"
)

‚úÖ [fit] SVM_onehot: fitting on train (one-hot flat, n=12336, d=1024)
‚úÖ [threshold] SVM_onehot: selected thr=0.30 (val F1=0.978)
acc               0.917558
prec              0.917315
rec                0.99438
f1                0.954294
roc_auc           0.842139
pr_auc            0.956038
specificity       0.423146
balanced_acc      0.708763
mcc               0.591716
thr                    0.3
tp                    3539
tn                     234
fp                     319
fn                      20
support               4112
model           SVM_onehot
split                 test
‚úÖ [eval] SVM_onehot: test F1=0.954, AUC=0.842, PR-AUC=0.956, thr=0.30


# (Optional) Cell 5 — Per-dataset diagnostics (if ds_idx available)

In [6]:
if ds_idx is None:
    J.log("per-dataset", "warn", "ds_idx not found in encodings — skipping per-dataset evaluation.")
else:
    # Use pooled best thresholds (per model) to measure generalization per dataset on TEST split only
    def per_dataset_eval(model, X_all, model_name, thr):
        """Evaluate a fitted model separately on each dataset's share of the test split.

        Args:
            model: Fitted estimator exposing ``predict_proba``.
            X_all: Feature matrix for all samples (indexed with the test indices).
            model_name: Label stored in the ``model`` column.
            thr: Decision threshold applied to the positive-class probability.

        Returns:
            DataFrame with one row of ``extended_metrics`` per dataset that has
            at least one test sample, plus ``dataset``, ``n`` and ``model`` columns.
        """
        # Slice the test split once instead of on every loop iteration.
        X_te, y_te, ds_te = X_all[te], y[te], ds_idx[te]
        rows = []
        # np.unique returns sorted ids; iterating the ids present in the test
        # slice visits exactly the datasets with a non-empty test subset.
        for d in np.unique(ds_te):
            name = ds_map.get(d, f"ds_{d}") if ds_map else f"ds_{d}"
            mask = (ds_te == d)
            y_prob = model.predict_proba(X_te[mask])[:, 1]
            m, _, _ = extended_metrics(y_te[mask], y_prob, thr)
            m.update({"dataset": name, "n": int(mask.sum()), "model": model_name})
            rows.append(m)
        return pd.DataFrame(rows)

    def _saved_val_threshold(metrics_csv):
        """Read back the threshold stored in the 'val' row of a saved metrics CSV."""
        return pd.read_csv(metrics_csv).query("split=='val'")["thr"].values[0]

    # Reuse thresholds chosen earlier. They are read from the saved metrics files
    # because the in-memory `thr` variable is overwritten by each model cell.
    # For SVM_kmer
    thr_kmer = _saved_val_threshold(RESULTS/"metrics/svm_kmer_metrics.csv")
    df_per_kmer = per_dataset_eval(svm_kmer, X_kmer, "SVM_kmer", thr_kmer)
    df_per_kmer.to_csv(RESULTS/"metrics/svm_kmer_per_dataset_test.csv", index=False)
    J.log("per-dataset", "ok", f"SVM_kmer per-dataset metrics saved for {len(df_per_kmer)} dataset(s).")

    # For SVM_onehot
    thr_1hot = _saved_val_threshold(RESULTS/"metrics/svm_onehot_metrics.csv")
    df_per_1hot = per_dataset_eval(svm_1hot, X_flat, "SVM_onehot", thr_1hot)
    df_per_1hot.to_csv(RESULTS/"metrics/svm_onehot_per_dataset_test.csv", index=False)
    J.log("per-dataset", "ok", f"SVM_onehot per-dataset metrics saved for {len(df_per_1hot)} dataset(s).")

‚úÖ [per-dataset] SVM_kmer per-dataset metrics saved for 13 dataset(s).
‚úÖ [per-dataset] SVM_onehot per-dataset metrics saved for 13 dataset(s).


# Cell 6 — Save run journal (what worked, what didn't, and why)

In [7]:
# One timestamp shared by all files written in this cell.
ts = time.strftime("%Y%m%d_%H%M%S")
base = RESULTS/"logs"/f"classical_baselines_{ts}"

# Add a brief "limitations / problems seen" roll-up for your documentation:
# every journal event whose status is warn or fail, one bullet per event.
issues = [
    f"- [{e['step']}] {e['message']}"
    for e in J.events
    if e["status"] in ("warn", "fail")
]
rollup = "No warnings or failures." if not issues else "Issues observed:\n" + "\n".join(issues)
print("\n=== RUN SUMMARY ===\n" + rollup)

# Full journal (.md + .json) plus the plain-text roll-up next to it.
J.save(base)
(RESULTS/"logs"/f"classical_baselines_{ts}_summary.txt").write_text(rollup, encoding="utf-8")
print(f"📦 Metrics in: {RESULTS/'metrics'}  |  Plots in: {RESULTS/'plots'}")


=== RUN SUMMARY ===
üìù Saved journal:
  - results\logs\classical_baselines_20250918_105341.md
  - results\logs\classical_baselines_20250918_105341.json
üì¶ Metrics in: results\metrics  |  Plots in: results\plots
