# 02_classical_baselines.ipynb — SVMs

# Cell 0 — perf env

In [1]:
# Give every numeric backend the same default thread count to reduce run-to-run
# variability. Values already present in the environment are left untouched.
import os

_THREAD_ENV_VARS = (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
)
for _var in _THREAD_ENV_VARS:
    os.environ.setdefault(_var, "8")

# Report the effective OMP and OpenBLAS settings (the first two entries above).
print("BLAS:", *(os.environ.get(_var) for _var in _THREAD_ENV_VARS[:2]))

BLAS: 8 8


# Cell 1 — load data

In [2]:
# Load encoded sequence representations and train/val/test indices
from pathlib import Path
import json, warnings
import numpy as np, pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

# Silence all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Input and output locations; the metrics folder is created up front.
PROCESSED = Path("data/processed")
RESULTS = Path("results")
(RESULTS / "metrics").mkdir(parents=True, exist_ok=True)

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only load an encodings.npz from a trusted source.
data = np.load(PROCESSED / "encodings.npz", allow_pickle=True)
SPL = json.loads((PROCESSED / "splits.json").read_text())

# Labels and the two feature encodings stored in the archive.
y, X_kmer, X_onehot = (data[key] for key in ("y", "kmer", "onehot"))

# Index arrays for the train / validation / test splits.
tr, va, te = (np.array(SPL[part]) for part in ("train", "val", "test"))
print("Split sizes:", len(tr), len(va), len(te), "| pos rate:", float(y.mean()))

Split sizes: 894 298 298 | pos rate: 0.9228187919463087


# Cell 2 — eval helper (val-optimal threshold)

In [3]:
# Fit classifier; choose decision threshold maximizing validation F1; return per-split metrics, confusion matrices, probabilities, and the chosen threshold
def eval_with_threshold(pipe, X_tr, y_tr, X_va, y_va, X_te, y_te, name="svm", thr_grid=None):
    """Fit ``pipe`` on the training split and score every split at a validation-tuned threshold.

    The decision threshold is the candidate from ``thr_grid`` with the highest F1 on
    the validation split (the first candidate in iteration order wins ties). That one
    threshold is then applied to the predicted probabilities of train, val and test.

    Parameters
    ----------
    pipe : estimator
        Must expose ``fit`` and ``predict_proba``; it is fitted in place.
    X_tr, y_tr, X_va, y_va, X_te, y_te : array-like
        Features and labels of the train / validation / test splits. Predictions
        are produced as 0/1 integers, so labels are expected to be 0/1 as well.
    name : str
        Stored in the ``model`` field of every metrics row.
    thr_grid : iterable of float, optional
        Candidate thresholds. Defaults to 33 evenly spaced values in [0.1, 0.9].
        If it is empty, the threshold stays at 0.5.

    Returns
    -------
    metrics : list of dict
        One row per split in train, val, test order, with keys
        ``model, split, acc, prec, rec, f1, auc, thr``.
    cms : dict
        Split name -> confusion matrix.
    probs : dict
        Split name -> column 1 of ``predict_proba`` for that split.
    best_thr : float
        The selected threshold.

    Notes
    -----
    The validation metrics are optimistic because the threshold is tuned on that
    same split. ``auc`` is NaN when ``roc_auc_score`` raises ``ValueError`` (for
    example when a split contains a single class).
    """
    # Local import: f1_score is not part of the notebook's import cell.
    from sklearn.metrics import f1_score

    if thr_grid is None:
        thr_grid = np.linspace(0.1, 0.9, 33)

    pipe.fit(X_tr, y_tr)

    splits = {"train": (X_tr, y_tr), "val": (X_va, y_va), "test": (X_te, y_te)}
    # Score each split exactly once; the validation scores are reused for the
    # threshold search below instead of calling predict_proba a second time.
    probs = {split: pipe.predict_proba(X)[:, 1] for split, (X, _) in splits.items()}

    best_thr, best_f1 = 0.5, -1.0
    for t in thr_grid:
        f1 = f1_score(y_va, (probs["val"] >= t).astype(int), zero_division=0)
        if f1 > best_f1:  # strict '>' keeps the earliest candidate on ties
            best_thr, best_f1 = float(t), float(f1)

    metrics, cms = [], {}
    for split, (_, y_true) in splits.items():
        p = probs[split]
        yhat = (p >= best_thr).astype(int)
        acc = accuracy_score(y_true, yhat)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, yhat, average="binary", zero_division=0
        )
        try:
            auc = roc_auc_score(y_true, p)
        except ValueError:
            # Only the "AUC is undefined" case is tolerated; other errors surface.
            auc = float("nan")
        metrics.append(
            dict(model=name, split=split, acc=acc, prec=prec, rec=rec, f1=f1, auc=auc, thr=best_thr)
        )
        cms[split] = confusion_matrix(y_true, yhat)

    return metrics, cms, probs, best_thr

# Cell 3 — SVM (k-mer)

In [4]:
# k-mer baseline: standardize the features, then fit a class-balanced RBF-kernel
# SVC with probability outputs enabled.
svm_kmer = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    SVC(
        kernel="rbf",
        C=5.0,
        gamma="scale",
        class_weight="balanced",
        probability=True,
        random_state=0,
    ),
)
# The starred generator expands to X[tr], y[tr], X[va], y[va], X[te], y[te].
metrics_k, cms_k, probs_k, thr_k = eval_with_threshold(
    svm_kmer,
    *(arr[idx] for idx in (tr, va, te) for arr in (X_kmer, y)),
    name="SVM_kmer",
)
# Persist the per-split metrics, then display the test row with the chosen threshold.
pd.DataFrame(metrics_k).to_csv(RESULTS / "metrics/svm_kmer.csv", index=False)
metrics_k[-1], thr_k

({'model': 'SVM_kmer',
  'split': 'test',
  'acc': 0.9194630872483222,
  'prec': 0.9225589225589226,
  'rec': 0.9963636363636363,
  'f1': 0.958041958041958,
  'auc': 0.7743873517786563,
  'thr': 0.55},
 0.55)

# Cell 4 — SVM (one-hot flattened)

In [5]:
# One-hot baseline: collapse each sample's one-hot encoding into a single float32
# row vector, standardize, then fit a class-balanced RBF-kernel SVC.
X_flat = X_onehot.reshape(X_onehot.shape[0], -1).astype(np.float32)
svm_1hot = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    SVC(
        kernel="rbf",
        C=2.0,
        gamma="scale",
        class_weight="balanced",
        probability=True,
        random_state=0,
    ),
)
# The starred generator expands to X[tr], y[tr], X[va], y[va], X[te], y[te].
metrics_o, cms_o, probs_o, thr_o = eval_with_threshold(
    svm_1hot,
    *(arr[idx] for idx in (tr, va, te) for arr in (X_flat, y)),
    name="SVM_onehot",
)
# Persist the per-split metrics, then display the test row with the chosen threshold.
pd.DataFrame(metrics_o).to_csv(RESULTS / "metrics/svm_onehot_flat.csv", index=False)
metrics_o[-1], thr_o

({'model': 'SVM_onehot',
  'split': 'test',
  'acc': 0.9228187919463087,
  'prec': 0.9228187919463087,
  'rec': 1.0,
  'f1': 0.9598603839441536,
  'auc': 0.7560474308300394,
  'thr': 0.1},
 0.1)