# 03_quantum_kernel.ipynb — QSVM (precomputed kernel)

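This notebook evaluates a quantum fidelity kernel (via PennyLane) on PCA-reduced k-mer features. To keep the run tractable it computes only the Nyström anchor blocks of the Gram matrix and trains a linear SVM on the resulting Nyström features; the approximate Gram matrices are then saved for downstream notebooks. Execute the cells sequentially, top to bottom: Cell 0 (perf env) -> Cell 1 ... Cell 6.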

# Cell 0 — perf env

In [1]:
# Give the thread-count variables of the numerical backends a common default.
# setdefault() only writes a value when the variable is not already present, so
# anything exported by the calling shell is left untouched.
# NOTE(review): presumably this has to run before NumPy/BLAS are first imported
# to take effect - confirm for the target environment.
import os

for _thread_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ.setdefault(_thread_var, "8")

print("BLAS threads:", os.environ.get("OMP_NUM_THREADS"), os.environ.get("OPENBLAS_NUM_THREADS"))

BLAS threads: 8 8


# Cell 1 — imports, paths, journaling

In [2]:
# Imports: classical preprocessing + PennyLane for quantum kernel construction
from pathlib import Path
import json, warnings, time, os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

import pennylane as qml
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix,
    classification_report, balanced_accuracy_score, matthews_corrcoef, average_precision_score
)
from sklearn.svm import SVC

# Silence every warning for the rest of the session.
# NOTE(review): this also hides convergence / deprecation warnings - confirm intended.
warnings.filterwarnings("ignore")

# Project layout: inputs are read from data/processed, artifacts go under results/.
ROOT = Path(".")
PROCESSED = ROOT / "data" / "processed"
RESULTS = ROOT / "results"

# Create every output folder up front so later cells can write without checking.
for _subdir in ("kernels", "metrics", "plots", "logs"):
    (RESULTS / _subdir).mkdir(parents=True, exist_ok=True)

# Seed NumPy's legacy global RNG.
np.random.seed(7)

# ---- Run journal (for documentation) ----
class RunJournal:
    """In-memory journal of notebook steps that can be dumped to Markdown and JSON.

    Each call to ``log`` appends one event dict to ``self.events`` and echoes it
    to stdout; ``save`` writes the collected events next to each other as
    ``<base>.md`` (table) and ``<base>.json`` (records).
    """

    # Columns rendered in the Markdown table, in order.
    _MD_COLUMNS = ("ts", "step", "status", "message")

    def __init__(self):
        self.events = []

    def log(self, step, status, message, **extras):
        """Record one event and print it with a status symbol.

        Args:
            step: Short name of the pipeline step.
            status: "ok", "warn", or anything else (printed with the failure symbol).
            message: Human-readable description.
            **extras: Additional key/value pairs stored on the event dict.
        """
        self.events.append({
            "ts": time.strftime("%Y-%m-%d %H:%M:%S"),
            "step": step, "status": status, "message": message, **extras
        })
        sym = "‚úÖ" if status=="ok" else ("‚ö†Ô∏è" if status=="warn" else "‚ùå")
        print(f"{sym} [{step}] {message}")

    def df(self):
        """Return the events as a DataFrame (one row per event)."""
        return pd.DataFrame(self.events)

    @staticmethod
    def _md_cell(value):
        """Render a value as a single Markdown table cell.

        Pipes are escaped and line breaks are collapsed to spaces; otherwise a
        message such as a multi-line exception text would split the table row.
        """
        text = str(value).replace("|", "\\|")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def save(self, base: Path):
        """Write the journal to ``base`` with ``.md`` and ``.json`` suffixes.

        Args:
            base: Target path; its suffix is replaced by ``.md`` / ``.json``.
        """
        df = self.df()
        md = ["| ts | step | status | message |", "|---|---|---|---|"]
        for event in self.events:
            cells = " | ".join(self._md_cell(event[col]) for col in self._MD_COLUMNS)
            md.append(f"| {cells} |")
        (base.with_suffix(".md")).write_text("\n".join(md), encoding="utf-8")
        # The JSON dump keeps the raw, unescaped messages and any extras.
        (base.with_suffix(".json")).write_text(df.to_json(orient="records", indent=2), encoding="utf-8")
        print(f"üìù Saved journal:\n  - {base.with_suffix('.md')}\n  - {base.with_suffix('.json')}")


# Single journal instance shared by all later cells.
J = RunJournal()

# Cell 2 — load data & PCA→angles (multi-dataset aware + logging)

In [3]:
# Try multi-dataset artifacts first; fall back to single-dataset artifacts
enc_candidates = [PROCESSED/"encodings_all.npz", PROCESSED/"encodings.npz"]
spl_candidates = [PROCESSED/"splits_pooled.json", PROCESSED/"splits.json"]

# First existing candidate wins; None means neither file is present.
enc_path = next((p for p in enc_candidates if p.exists()), None)
spl_path = next((p for p in spl_candidates if p.exists()), None)

if enc_path is None or spl_path is None:
    # Journal each missing artifact separately before aborting the cell.
    if enc_path is None: J.log("load", "fail", "Encodings file not found (tried encodings_all.npz, encodings.npz)")
    if spl_path is None: J.log("load", "fail", "Splits file not found (tried splits_pooled.json, splits.json)")
    raise FileNotFoundError("Required data artifacts missing in data/processed")

# np.load on an .npz returns a lazy NpzFile that keeps the archive open; pull out
# everything we need inside a `with` block so the file handle is released.
# After the block `data` is a closed archive and must not be indexed again.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can run
# arbitrary code if the file is untrusted. Kept as-is because the on-disk dtypes
# are not visible here - switch to allow_pickle=False if all arrays are numeric.
with np.load(enc_path, allow_pickle=True) as data:
    y = data["y"].astype(np.int64)
    X_kmer = data["kmer"].astype(np.float32)
    # Optional per-dataset index
    ds_idx = data["ds_idx"] if "ds_idx" in data.files else None
with open(spl_path, encoding="utf-8") as f:
    SPL = json.load(f)
# Only report success once the arrays and splits have actually been read.
J.log("load", "ok", f"Loaded encodings from {enc_path.name} and splits from {spl_path.name}")

# Map dataset index -> accession name when the lookup table is available.
ds_map = None
ds_map_path = PROCESSED/"dataset_index.csv"
if ds_idx is not None and ds_map_path.exists():
    ds_map = pd.read_csv(ds_map_path).set_index("ds_idx")["accession"].to_dict()
    J.log("datasets", "ok", f"Detected {len(set(ds_idx))} dataset(s) with mapping.")
elif ds_idx is not None:
    J.log("datasets", "warn", "ds_idx present but dataset_index.csv missing ‚Äî names unavailable.")

# Row indices of each split; pos_rate is the mean label over ALL rows, not per split.
tr_idx = np.array(SPL["train"]); va_idx = np.array(SPL["val"]); te_idx = np.array(SPL["test"])
pos_rate = float(y.mean()) if len(y) else float("nan")
J.log("splits", "ok", f"train={len(tr_idx)}, val={len(va_idx)}, test={len(te_idx)}, pos_rate={pos_rate:.4f}")

# PCA -> standardize -> angle map
# PCA and the scaler are fitted on the training rows only and then applied to val/test.
D = int(os.environ.get("QK_D", "8"))  # number of principal components (== number of qubits)
pca = PCA(n_components=D, random_state=7)
X_tr_p = pca.fit_transform(X_kmer[tr_idx])
X_va_p = pca.transform(X_kmer[va_idx])
X_te_p = pca.transform(X_kmer[te_idx])

ev = pca.explained_variance_ratio_.sum()
J.log("pca", "ok", f"PCA to D={D} components (variance explained={ev:.3f})")

scaler = StandardScaler(with_mean=True, with_std=True)
X_tr_z = scaler.fit_transform(X_tr_p)
X_va_z = scaler.transform(X_va_p)
X_te_z = scaler.transform(X_te_p)
# Angle map with clipping for robustness (document clipping rate)
def to_angles(X, clip=3.0):
    """Clip values to [-clip, clip] and rescale them linearly to [-pi, pi].

    Args:
        X: Array of values to convert.
        clip: Symmetric clipping bound; +/-clip maps to +/-pi.

    Returns:
        Tuple ``(angles, n_clipped)`` where ``angles`` is a float32 array and
        ``n_clipped`` is the number of entries with ``|x| > clip``, as a float.

    NOTE(review): -clip and +clip map to -pi and +pi. If these angles drive
    single-qubit rotations compared by state overlap, those two extremes differ
    by a full 2*pi and would look identical - confirm the scale is intended.
    """
    n_clipped = float(np.count_nonzero(np.abs(X) > clip))
    bounded = np.clip(X, -clip, clip)
    scaled = np.pi * bounded
    angles = (scaled / clip).astype(np.float32)
    return angles, n_clipped

# Convert each standardized split to rotation angles; keep the clip counts for the journal.
Xtr, nclip_tr = to_angles(X_tr_z)
Xva, nclip_va = to_angles(X_va_z)
Xte, nclip_te = to_angles(X_te_z)
J.log(
    "angles",
    "ok",
    f"Angle embedding created; clipped values ‚Äî train:{int(nclip_tr)}, val:{int(nclip_va)}, test:{int(nclip_te)}",
)

# --- Fast knobs (can be overridden via env) ---
# Number of train rows used to build anchors.
MAX_TRAIN = int(os.getenv("QK_MAX_TRAIN", "300"))
# Nystrom anchors (<= MAX_TRAIN).
N_ANCHORS = int(os.getenv("QK_N_ANCHORS", "128"))
# Row-chunk size used by the kernel evaluation loop.
BATCH = int(os.getenv("QK_BATCH", "64"))

‚úÖ [load] Loaded encodings from encodings_all.npz and splits from splits_pooled.json
‚úÖ [datasets] Detected 13 dataset(s) with mapping.
‚úÖ [splits] train=12336, val=4112, test=4112, pos_rate=0.8654
‚úÖ [pca] PCA to D=8 components (variance explained=0.614)
‚úÖ [angles] Angle embedding created; clipped values ‚Äî train:345, val:148, test:144


# Cell 3 — device + kernel circuit (with device reporting)

In [4]:
# Define embedding circuit and kernel evaluation (overlap) using an adjoint construction
def make_device(n_wires, shots=None):
    """Create a PennyLane device, preferring lightning.qubit over default.qubit.

    Any exception raised while creating (or journaling) the lightning device is
    recorded as a warning in the run journal and default.qubit is used instead.

    Args:
        n_wires: Number of wires for the device.
        shots: Passed through to ``qml.device``.

    Returns:
        The created device.
    """
    preferred, fallback = "lightning.qubit", "default.qubit"
    try:
        device = qml.device(preferred, wires=n_wires, shots=shots)
        J.log("device", "ok", f"Using lightning.qubit (wires={n_wires}, shots={shots})")
    except Exception as err:
        J.log("device", "warn", f"lightning.qubit unavailable ({err}); falling back to default.qubit")
        device = qml.device(fallback, wires=n_wires, shots=shots)
    return device

# One wire per PCA component; shots=None is passed straight through to the device.
n_wires = int(D)
dev = make_device(n_wires, shots=None)
wires = [w for w in range(n_wires)]

def cz_ring(ws):
    """Apply CZ gates between neighbouring wires, closing the chain into a ring.

    For three or more wires this queues CZ(ws[i], ws[i+1]) for every i plus the
    wrap-around CZ(ws[-1], ws[0]).

    Edge cases:
        * fewer than two wires: nothing is applied, because the wrap-around
          would address the same wire twice in one CZ;
        * exactly two wires: a single CZ is applied. Looping over both indices
          would apply CZ to the same pair twice, and two CZs cancel.

    Args:
        ws: Sequence of wire labels.
    """
    n = len(ws)
    if n < 2:
        return
    n_edges = 1 if n == 2 else n
    for i in range(n_edges):
        qml.CZ(wires=[ws[i], ws[(i + 1) % n]])

def U(x):
    """Feature-map circuit: Y-rotation angle embedding of ``x``, then the CZ ring.

    Uses the module-level ``wires`` list for both steps.

    Args:
        x: Feature vector handed to ``qml.AngleEmbedding``.
    """
    qml.AngleEmbedding(x, rotation="Y", wires=wires)
    cz_ring(wires)

@qml.qnode(dev)
def kernel_circuit(x1, x2):
    """Overlap kernel between two feature vectors.

    Applies U(x1) followed by the adjoint of U(x2) and returns the expectation
    of the projector onto the all-zeros basis state, i.e. |<phi(x1)|phi(x2)>|^2.

    NOTE(review): cz_ring is data-independent and sits after the embedding, so
    in U(x1) followed by adjoint(U)(x2) the two CZ layers appear to cancel. That
    would leave a product of single-qubit terms, prod_i cos^2((x1_i - x2_i)/2).
    Confirm whether an entangled feature map was intended.
    """
    U(x1)
    inverse_map = qml.adjoint(U)
    inverse_map(x2)
    all_zeros = [0] * n_wires
    return qml.expval(qml.Projector(all_zeros, wires=wires))

‚úÖ [device] Using lightning.qubit (wires=8, shots=None)


# Cell 4 — Nyström kernel blocks (progress + saves)

In [5]:
# Build fast Gram surrogates using Nyström anchors + batched kernel calls
# Produces: K_MM (M×M), K_trM (|train_sub|×M), K_vaM (|val|×M), K_teM (|test|×M) where M = min(N_ANCHORS, |train_sub|)

# The device, wire list, entangler, feature map and overlap QNode are already
# defined in Cell 3 (make_device, n_wires, wires, dev, cz_ring, U, kernel_circuit).
# Re-defining them here shadowed those definitions, built a second device and
# journaled the device choice twice, so this cell now reuses the Cell 3 circuit.
# Cell 3 must have been executed first (the notebook is meant to run top to bottom).
kpair = kernel_circuit

def kernel_block(XA, XB, batch=None):
    """Compute the |XA| x |XB| kernel matrix by calling ``kpair`` on every pair.

    Rows of XA are processed in chunks of ``batch`` rows. Chunking only controls
    how often progress is printed: every entry still costs one ``kpair`` call,
    nothing is vectorised.

    Args:
        XA: Sequence of feature vectors (rows of the result).
        XB: Sequence of feature vectors (columns of the result).
        batch: Rows per chunk; defaults to the module-level ``BATCH``.

    Returns:
        float64 array of shape (len(XA), len(XB)).

    Raises:
        ValueError: If the chunk size is smaller than 1. A negative size would
            otherwise skip the loop and return uninitialised memory.
    """
    step = BATCH if batch is None else int(batch)
    if step < 1:
        raise ValueError(f"kernel batch size must be >= 1, got {step}")
    m, n = len(XA), len(XB)
    K = np.empty((m, n), dtype=np.float64)
    for i0 in range(0, m, step):
        i1 = min(i0 + step, m)
        for i in range(i0, i1):
            xi = XA[i]
            for j in range(n):
                K[i, j] = kpair(xi, XB[j])
        # Report after chunks that end on a multiple of max(step, 50) and after the last one.
        if i1 % max(step, 50) == 0 or i1 == m:
            print(f" rows {i0+1}-{i1}/{m} done")
    return K

# 1) Keep only the first MAX_TRAIN training rows as the working train subset.
#    Xtr is ordered like tr_idx, so the same leading slice selects the same rows.
# NOTE(review): this takes the head of the split, not a random sample - if the
# split file is ordered (e.g. by dataset) the subset may be biased; confirm.
n_keep = min(MAX_TRAIN, len(tr_idx))
sel_tr = tr_idx[:n_keep]
Xtr_sub = Xtr[:len(sel_tr)]
ytr_sub = y[sel_tr]
if len(sel_tr) < len(tr_idx):
    J.log("kernel", "warn", f"Train truncated to {len(sel_tr)} for speed.")

# 2) Draw the Nystrom anchors uniformly at random, without replacement, from the subset.
rng = np.random.default_rng(7)
M = min(N_ANCHORS, len(Xtr_sub))
anc_idx_local = rng.choice(len(Xtr_sub), size=M, replace=False)
A = Xtr_sub[anc_idx_local]
J.log("kernel", "ok", f"Nystr√∂m anchors M={M} (from {len(Xtr_sub)}); batch={BATCH}")

# 3) Evaluate the kernel only against the anchors: anchors, train subset, val, test.
t0 = time.time()
K_MM = kernel_block(A, A)
K_trM = kernel_block(Xtr_sub, A)
K_vaM = kernel_block(Xva, A)
K_teM = kernel_block(Xte, A)
J.log("kernel", "ok", f"Computed Nystr√∂m blocks in {time.time()-t0:.1f}s")

# 4) Persist every block plus the anchor positions (indices local to Xtr_sub).
for _fname, _arr in (
    ("K_MM.npy", K_MM),
    ("K_trM.npy", K_trM),
    ("K_vaM.npy", K_vaM),
    ("K_teM.npy", K_teM),
    ("anchors_idx.npy", anc_idx_local),
):
    np.save(RESULTS / "kernels" / _fname, _arr)
print("saved Nystr√∂m blocks (K_MM, K_trM, K_vaM, K_teM, anchors_idx)")

# 5) Journal any limits that were hit.
if M < N_ANCHORS:
    J.log("limit", "warn", f"Requested {N_ANCHORS} anchors but only {M} available.")
if M < 32:
    J.log("limit", "warn", "Very small anchor set; metrics may be unstable.")

‚úÖ [device] Using lightning.qubit (wires=8, shots=None)
‚ö†Ô∏è [kernel] Train truncated to 300 for speed.
‚úÖ [kernel] Nystr√∂m anchors M=128 (from 300); batch=64
 rows 1-64/128 done
 rows 65-128/128 done
 rows 1-64/300 done
 rows 65-128/300 done
 rows 129-192/300 done
 rows 193-256/300 done
 rows 257-300/300 done
 rows 1-64/4112 done
 rows 65-128/4112 done
 rows 129-192/4112 done
 rows 193-256/4112 done
 rows 257-320/4112 done
 rows 321-384/4112 done
 rows 385-448/4112 done
 rows 449-512/4112 done
 rows 513-576/4112 done
 rows 577-640/4112 done
 rows 641-704/4112 done
 rows 705-768/4112 done
 rows 769-832/4112 done
 rows 833-896/4112 done
 rows 897-960/4112 done
 rows 961-1024/4112 done
 rows 1025-1088/4112 done
 rows 1089-1152/4112 done
 rows 1153-1216/4112 done
 rows 1217-1280/4112 done
 rows 1281-1344/4112 done
 rows 1345-1408/4112 done
 rows 1409-1472/4112 done
 rows 1473-1536/4112 done
 rows 1537-1600/4112 done
 rows 1601-1664/4112 done
 rows 1665-1728/4112 done
 rows 1729-1792/

# Cell 5 — QSVM on precomputed kernel (full metrics, plots, docs)

In [6]:
# Train linear SVM on Nyström features  Φ = K_{•M} K_MM^{-1/2}; produce same metrics/plots as before

from numpy.linalg import eigh

def nystrom_features(K_XM, K_MM, eps=1e-6):
    """Map kernel rows against the anchors to Nystrom features K_XM @ K_MM^{-1/2}.

    K_MM is symmetrised, eigendecomposed, and its eigenvalues are floored at
    ``eps`` before the inverse square root is taken.

    Args:
        K_XM: Kernel values between the samples and the anchors, shape (n, M).
        K_MM: Kernel values among the anchors, shape (M, M).
        eps: Lower bound applied to the eigenvalues of K_MM.

    Returns:
        Array of shape (n, M) with the Nystrom feature vectors.
    """
    K_sym = (K_MM + K_MM.T) * 0.5
    eigvals, eigvecs = eigh(K_sym)
    inv_sqrt = 1.0 / np.sqrt(np.clip(eigvals, eps, None))
    # Scaling the eigenvector columns equals multiplying by diag(inv_sqrt).
    KMM_mhalf = (eigvecs * inv_sqrt) @ eigvecs.T
    return K_XM @ KMM_mhalf

# Nystrom feature maps for the train subset, validation and test rows.
Phi_tr, Phi_va, Phi_te = (
    nystrom_features(block, K_MM) for block in (K_trM, K_vaM, K_teM)
)

# Labels must line up with the rows of K_trM, hence ytr_sub rather than y[tr_idx].
C = float(os.environ.get("QK_C", "5.0"))
clf = SVC(
    C=C,
    kernel="linear",
    probability=True,
    class_weight="balanced",
    random_state=0,
)
J.log("fit", "ok", f"QSVM(Nystr√∂m): linear SVM on Œ¶_tr (shape={Phi_tr.shape}), C={C}")
clf.fit(Phi_tr, ytr_sub)

# Pick the decision threshold that maximises F1 on the validation split.
# The first grid value reaching the best F1 wins (strict improvement only).
from sklearn.metrics import f1_score
p_val = clf.predict_proba(Phi_va)[:, 1]
thr_grid = np.linspace(0.05, 0.95, 37)
best_thr, best_f1 = 0.5, -1
for t in thr_grid:
    f1 = f1_score(y[va_idx], (p_val >= t).astype(int), zero_division=0)
    if not (f1 > best_f1):
        continue
    best_f1 = float(f1)
    best_thr = float(t)
thr = best_thr
J.log("threshold", "ok", f"val-optimal thr={thr:.2f} (F1={best_f1:.3f})")

def extended_metrics(y_true, y_prob, thr):
    """Compute threshold-based and ranking metrics for a binary classifier.

    Args:
        y_true: Ground-truth labels (0/1).
        y_prob: Predicted scores for the positive class.
        thr: Decision threshold; scores >= thr are predicted as class 1.

    Returns:
        Tuple ``(metrics, cm, rep)``: a dict of scalar metrics, the 2x2
        confusion matrix with label order [0, 1], and the classification
        report as a dict.
    """
    y_hat = (y_prob >= thr).astype(int)
    acc = accuracy_score(y_true, y_hat)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_hat, average="binary", zero_division=0)
    # Ranking metrics are best-effort: fall back to NaN when they cannot be computed.
    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still stop the run.
    try:
        auc = roc_auc_score(y_true, y_prob)
    except Exception:
        auc = float("nan")
    try:
        ap = average_precision_score(y_true, y_prob)
    except Exception:
        ap = float("nan")
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    # Specificity is undefined when there are no true negatives or false positives.
    tnr = tn / (tn + fp) if (tn + fp) else float("nan")
    bal = balanced_accuracy_score(y_true, y_hat)
    mcc = matthews_corrcoef(y_true, y_hat) if len(np.unique(y_true))==2 else float("nan")
    rep = classification_report(y_true, y_hat, output_dict=True, zero_division=0)
    return {
        "acc": acc, "prec": prec, "rec": rec, "f1": f1,
        "roc_auc": auc, "pr_auc": ap, "specificity": tnr,
        "balanced_acc": bal, "mcc": mcc, "thr": thr,
        "tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn),
        "support": int(len(y_true)),
    }, cm, rep

# Score every split with the fitted classifier at the validation-chosen threshold.
rows, cms, reports = [], {}, {}
split_inputs = (
    ("train", Phi_tr, ytr_sub),
    ("val", Phi_va, y[va_idx]),
    ("test", Phi_te, y[te_idx]),
)
for split, Phi, y_true in split_inputs:
    p = clf.predict_proba(Phi)[:, 1]
    m, cm, rep = extended_metrics(y_true, p, thr)
    # Tag the row so metrics from several models can share one table.
    m["model"] = "QSVM_kernel_nystrom"
    m["split"] = split
    m["M"] = int(Phi_tr.shape[1])
    rows.append(m)
    cms[split] = cm
    reports[split] = rep

# One row per split, written as a single CSV.
df_metrics = pd.DataFrame(rows)
df_metrics.to_csv(RESULTS/"metrics/qsvm_kernel_metrics.csv", index=False)

# Confusion matrices (CSV)
def save_cm_csv(cm, out_csv, normalized=False):
    """Write a 2x2 confusion matrix to CSV with true_*/pred_* labels.

    Args:
        cm: 2x2 array of counts (rows = true class, columns = predicted class).
        out_csv: Destination path.
        normalized: If True, divide each row by its sum; all-zero rows are
            divided by 1 and therefore stay zero.
    """
    table = cm.astype(np.float64)
    if normalized:
        row_totals = table.sum(axis=1, keepdims=True)
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        table = np.divide(table, safe_totals)
    frame = pd.DataFrame(
        table,
        index=["true_0", "true_1"],
        columns=["pred_0", "pred_1"],
    )
    frame.to_csv(out_csv, index=True)

# For every split write the raw counts first, then the row-normalised version.
for split, cm in cms.items():
    for _suffix, _normalize in (("", False), ("_norm", True)):
        save_cm_csv(cm, RESULTS/f"metrics/qsvm_kernel_cm_{split}{_suffix}.csv", normalized=_normalize)

# ROC & PR for test
def plot_roc(y_true, y_prob, title, out_png):
    """Plot the ROC curve with its AUC in the legend and save it as a PNG.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores for the positive class.
        title: Figure title.
        out_png: Output image path (saved at 150 dpi).
    """
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
    ax.plot([0, 1], [0, 1], "--")  # chance diagonal
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)  # release the figure so repeated calls do not accumulate memory
def plot_pr(y_true, y_prob, title, out_png):
    """Plot the precision-recall curve with average precision in the legend and save it.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores for the positive class.
        title: Figure title.
        out_png: Output image path (saved at 150 dpi).
    """
    from sklearn.metrics import precision_recall_curve, average_precision_score
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    fig, ax = plt.subplots()
    ax.plot(rec, prec, label=f"AP={ap:.3f}")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)  # release the figure so repeated calls do not accumulate memory

# Test-split curves: score once, then draw ROC and PR from the same probabilities.
p_test = clf.predict_proba(Phi_te)[:, 1]
y_test = y[te_idx]
plot_roc(y_test, p_test, "QSVM (Nystr√∂m) ‚Äî ROC (test)", RESULTS/"plots/qsvm_kernel_roc_test.png")
plot_pr(y_test, p_test, "QSVM (Nystr√∂m) ‚Äî PR  (test)", RESULTS/"plots/qsvm_kernel_pr_test.png")

# Echo the test row of the metrics table as plain text.
test_row = df_metrics.loc[df_metrics["split"] == "test"].iloc[0]
print(test_row.to_string())
J.log("eval", "ok", "QSVM(Nystr√∂m): metrics saved; see results/metrics and results/plots")

‚úÖ [fit] QSVM(Nystr√∂m): linear SVM on Œ¶_tr (shape=(300, 128)), C=5.0
‚úÖ [threshold] val-optimal thr=0.50 (F1=0.923)
acc                        0.865516
prec                       0.866228
rec                        0.998876
f1                         0.927835
roc_auc                    0.649885
pr_auc                     0.907214
specificity                0.007233
balanced_acc               0.503055
mcc                        0.047301
thr                             0.5
tp                             3555
tn                                4
fp                              549
fn                                4
support                        4112
model           QSVM_kernel_nystrom
split                          test
M                               128
‚úÖ [eval] QSVM(Nystr√∂m): metrics saved; see results/metrics and results/plots


In [7]:
# Persist Gram matrices derived from Nystrom features for downstream notebooks
# Approximate Gram matrices are inner products of the Nystrom features;
# every matrix is taken against the training-subset features.
Phi_tr_T = Phi_tr.T
K_trtr = Phi_tr @ Phi_tr_T
K_vatr = Phi_va @ Phi_tr_T
K_tetr = Phi_te @ Phi_tr_T

kern_dir = RESULTS / "kernels"
kern_dir.mkdir(parents=True, exist_ok=True)

# Gram matrices are stored as float32, the row indices of each split as int64.
for _fname, _gram in (
    ("K_trtr.npy", K_trtr),
    ("K_vatr.npy", K_vatr),
    ("K_tetr.npy", K_tetr),
):
    np.save(kern_dir / _fname, _gram.astype(np.float32))
for _fname, _indices in (
    ("train_indices.npy", sel_tr),
    ("val_indices.npy", va_idx),
    ("test_indices.npy", te_idx),
):
    np.save(kern_dir / _fname, _indices.astype(np.int64))
J.log("kernel_cache", "ok", f"Saved approximate Gram matrices (train={K_trtr.shape}, val={K_vatr.shape}, test={K_tetr.shape})")


‚úÖ [kernel_cache] Saved approximate Gram matrices (train=(300, 300), val=(4112, 300), test=(4112, 300))


# Cell 6 — Save run journal (what worked, what didn't, and why)

In [8]:
# Timestamped base name shared by the journal files and the summary.
ts = time.strftime("%Y%m%d_%H%M%S")
base = RESULTS/"logs"/f"qsvm_kernel_{ts}"

# Collect every warning or failure that was journaled during the run.
issues = [
    f"- [{e['step']}] {e['message']}"
    for e in J.events
    if e["status"] in ("warn","fail")
]
if issues:
    rollup = "Issues observed:\n" + "\n".join(issues)
else:
    rollup = "No warnings or failures."
print("\n=== RUN SUMMARY ===\n" + rollup)

# Write the full journal (.md / .json) and the short roll-up next to it.
J.save(base)
summary_path = RESULTS/"logs"/f"qsvm_kernel_{ts}_summary.txt"
summary_path.write_text(rollup, encoding="utf-8")
print(f"üì¶ Metrics in: {RESULTS/'metrics'}  |  Plots in: {RESULTS/'plots'}  |  Kernels in: {RESULTS/'kernels'}")


=== RUN SUMMARY ===
Issues observed:
- [kernel] Train truncated to 300 for speed.
üìù Saved journal:
  - results\logs\qsvm_kernel_20250918_201239.md
  - results\logs\qsvm_kernel_20250918_201239.json
üì¶ Metrics in: results\metrics  |  Plots in: results\plots  |  Kernels in: results\kernels
