# 07_reporting.ipynb — improved, explainable reporting

# Cell 0 — perf env

In [1]:
import os
# Thread-count settings for OpenMP / OpenBLAS / MKL / NumExpr. setdefault only
# fills in "8" when the variable is not already present in the environment.
# NOTE(review): these presumably need to run before numpy is imported to have
# any effect — confirm cell order is preserved when re-running.
os.environ.setdefault("OMP_NUM_THREADS", "8")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "8")
os.environ.setdefault("MKL_NUM_THREADS", "8")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")
'8'

'8'

# Cell 1 — load metrics, sweeps, caches, helpers (enhanced)

In [2]:
from pathlib import Path
import json, sys, platform, warnings, math, re
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, confusion_matrix

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Project layout: all inputs and artifacts live below ./results.
ROOT = Path(".")
RESULTS = ROOT.joinpath("results")
METRICS, FIGS, TABLES, REPORT, LOGS = (
    RESULTS.joinpath(sub)
    for sub in ("metrics", "figures", "tables", "report", "logs")
)
# Only the three output folders are created up front.
for out_dir in (FIGS, TABLES, REPORT):
    out_dir.mkdir(parents=True, exist_ok=True)

def safe_load_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file into a DataFrame without ever raising.

    A missing file or a read failure is reported on stdout and an empty
    DataFrame is returned instead.
    """
    if path.exists():
        try:
            frame = pd.read_csv(path)
        except Exception as err:
            print("! could not read:", path, err)
        else:
            return frame
    else:
        print("! missing:", path)
    return pd.DataFrame()

def safe_load_json(path: Path, default=None):
    """Parse a UTF-8 JSON file, falling back to `default`.

    `default` is returned when the file does not exist or when reading or
    parsing it fails for any reason.
    """
    if not path.exists():
        return default
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return default
    return payload

# ---- Core metrics from all notebooks ----
combined = safe_load_csv(METRICS.joinpath("combined.csv"))            # notebook 06: consolidated metrics
m_qsvm = safe_load_csv(METRICS.joinpath("qsvm_kernel_metrics.csv"))   # notebook 03: QSVM precomputed kernel
m_vqc = safe_load_csv(METRICS.joinpath("vqc.csv"))                    # notebook 04: VQC (if exported there)
# The two classical baselines are probed first, so an absent file yields an
# empty frame without the "! missing" notice.
_kmer_csv = METRICS.joinpath("svm_kmer.csv")
_onehot_csv = METRICS.joinpath("svm_onehot_flat.csv")
m_svmk = safe_load_csv(_kmer_csv) if _kmer_csv.exists() else pd.DataFrame()
m_svm1 = safe_load_csv(_onehot_csv) if _onehot_csv.exists() else pd.DataFrame()
ns_vqc = safe_load_csv(METRICS.joinpath("noise_sweep_vqc.csv"))
ns_qsvm = safe_load_csv(METRICS.joinpath("noise_sweep_qsvm.csv"))

# ---- Caches for ROC/CM ----
ROC_DIR = RESULTS.joinpath("roc_cache")
CM_DIR = RESULTS.joinpath("cm_cache")

# ---- Optional run-timing report (from 06 sweep) ----
rr = METRICS.joinpath("noise_sweep_run_report.json")
timing = safe_load_json(rr, {}).get("timing", {}) if rr.exists() else {}

# ---- Dataset/meta (optional) ----
enc_meta = safe_load_json(ROOT.joinpath("data/processed/meta.json"), {})
appx_env = safe_load_json(RESULTS.joinpath("appendix/environment.json"), {})

# ---- McNemar tests (optional) ----
mcnemar_stats = safe_load_json(RESULTS.joinpath("stats/mcnemar.json"), {})

# ---- Gather RunJournal events across notebooks (printable limitations/problems) ----
def collect_journal_events(log_root: Path) -> pd.DataFrame:
    """Collect run-journal entries from the JSON logs below `log_root`.

    Every ``*.json`` file (searched recursively) whose file name contains one
    of the known notebook keywords is loaded. Files whose top-level value is a
    list contribute one row per dict entry, tagged with the file name in a
    ``source`` column.

    Args:
        log_root: Directory that holds the journal files.

    Returns:
        A DataFrame that always has the columns ts, step, status, message and
        source (absent ones are filled with NaN), sorted by ``ts``. It is empty
        when the directory does not exist or holds no usable entries.
    """
    columns = ["ts", "step", "status", "message", "source"]
    if not log_root.exists():
        return pd.DataFrame(columns=columns)
    name_filter = re.compile(r"(qsvm|noise|vqc|kernel|baseline|analysis|robustness)", re.I)
    rows = []
    for p in sorted(log_root.glob("**/*.json")):
        if not name_filter.search(p.name):
            continue
        data = safe_load_json(p, [])
        if not isinstance(data, list):
            continue
        # Entries that are not dicts cannot be turned into a row; skip them
        # instead of failing on the unpacking below.
        rows.extend({**entry, "source": p.name} for entry in data if isinstance(entry, dict))
    if not rows:
        return pd.DataFrame(columns=columns)
    # reindex guarantees the full schema, so callers can rely on columns such
    # as "status" and the sort key "ts" even when no entry carried them.
    df = pd.DataFrame(rows).reindex(columns=columns)
    return df.sort_values("ts").reset_index(drop=True)

# Journal rows are gathered once here and reused by the summary and DOCX cells.
journal = collect_journal_events(LOGS)

# ---- Helpers ----
def pick_test(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the rows whose ``split`` equals "test" (case-insensitive).

    A frame that is empty or has no ``split`` column is handed back unchanged
    (the same object, not a copy).
    """
    if df.empty or "split" not in df.columns:
        return df
    is_test = df["split"].str.lower() == "test"
    return df.loc[is_test].copy()

def ensure_columns(df, cols):
    """Return a copy of `df` in which every name in `cols` exists as a column.

    Columns that have to be created are filled with NaN; existing columns and
    the input frame itself are left untouched.
    """
    result = df.copy()
    absent = [name for name in cols if name not in result.columns]
    for name in absent:
        result[name] = np.nan
    return result

# ---- Build unified test table (one row per model) ----
# Candidate frames, in priority order: with keep="last" below, a per-model file
# overrides the row for the same model coming from the consolidated table.
per_model = [("QSVM_kernel", m_qsvm), ("VQC", m_vqc), ("SVM_kmer", m_svmk), ("SVM_onehot", m_svm1)]
cands = [] if combined.empty else [pick_test(combined)]
for fallback_name, frame in per_model:
    if frame.empty:
        continue
    labelled = frame.copy()
    if "model" not in labelled.columns:
        # Per-model files may omit the model column; fall back to the known name.
        labelled["model"] = fallback_name
    cands.append(pick_test(labelled))

if len(cands) == 0:
    raise RuntimeError("No metrics found. Run 02/03/04/06 first.")

keep = ["model","acc","prec","rec","f1","auc","pr_auc","specificity","balanced_acc","mcc"]
test_all = ensure_columns(pd.concat(cands, ignore_index=True), keep)[keep]
test_all = test_all.dropna(subset=["model"])
test_all = test_all.drop_duplicates(subset=["model"], keep="last")
test_all = test_all.sort_values("f1", ascending=False).reset_index(drop=True)

# The CSV is written before the delta columns are added, so it holds raw metrics only.
test_all.to_csv(METRICS/"summary_test.csv", index=False)

# ---- Deltas vs baseline (SVM_kmer), if present ----
if "SVM_kmer" in test_all["model"].values:
    base = test_all.set_index("model").loc["SVM_kmer"]
    for other in test_all["model"].values:
        if other == "SVM_kmer":
            continue
        row_mask = test_all["model"] == other
        for metric in keep[1:]:
            value = test_all.loc[row_mask, metric].values[0]
            incomparable = np.isnan(value) or np.isnan(base[metric])
            test_all.loc[row_mask, f"Δ{metric}"] = np.nan if incomparable else (value - base[metric])

# ---- Best noise configs ----
def best_noise(df: pd.DataFrame, topn=5):
    """Rank noise-sweep configurations by their mean test F1.

    Args:
        df: Sweep results with an ``f1`` column, optionally a ``split`` column
            and any of the configuration columns shots, pflip, pdepol,
            anchors, S_NOISE.
        topn: Number of best configurations to keep.

    Returns:
        One row per configuration (the configuration columns plus the mean
        ``f1``), best first, at most `topn` rows. An empty DataFrame when
        there is nothing to rank.
    """
    if df.empty or "f1" not in df.columns:
        return pd.DataFrame()
    if "split" in df.columns:
        # Case-insensitive match, consistent with pick_test.
        t = df[df["split"].astype(str).str.lower() == "test"].copy()
    else:
        # No split column: treat every row as a test row, as pick_test does.
        t = df.copy()
    if t.empty:
        return pd.DataFrame()
    cols = [c for c in ["shots","pflip","pdepol","anchors","S_NOISE"] if c in t.columns]
    if not cols:
        return pd.DataFrame()
    # dropna=False: rows whose optional parameters (e.g. anchors) are NaN still
    # form their own configuration instead of being silently discarded.
    ranked = t.groupby(cols, dropna=False)["f1"].mean().reset_index()
    return ranked.sort_values("f1", ascending=False).head(topn)

# Top-5 configurations per sweep; both frames are reused by the summary and
# report cells further down.
best_vqc  = best_noise(ns_vqc, 5)
best_qsvm = best_noise(ns_qsvm, 5)

# NOTE(review): `display` is not imported in this cell; it presumably relies on
# the name IPython injects into notebook sessions — confirm before running as a script.
print("Summary rows:", len(test_all))
display(test_all)
print("\nBest VQC noise configs:");  display(best_vqc)
print("\nBest QSVM noise configs:"); display(best_qsvm)

Summary rows: 5


Unnamed: 0,model,acc,prec,rec,f1,auc,pr_auc,specificity,balanced_acc,mcc,Δacc,Δprec,Δrec,Δf1,Δauc,Δpr_auc,Δspecificity,Δbalanced_acc,Δmcc
0,VQC,0.922819,0.922819,1.0,0.95986,0.510988,,,,,0.003356,0.00026,0.003636,0.001818,-0.263399,,,,
1,SVM_onehot,0.922819,0.922819,1.0,0.95986,0.756047,,,,,0.003356,0.00026,0.003636,0.001818,-0.01834,,,,
2,SVM_kmer,0.919463,0.922559,0.996364,0.958042,0.774387,,,,,,,,,,,,,
3,QSVM_kernel,0.88255,0.888514,0.992453,0.937611,,0.898302,0.0,0.496226,-0.029007,-0.036913,-0.034045,-0.003911,-0.020431,,,,,
4,QSVM_kernel_nystrom,0.865516,0.866228,0.998876,0.927835,,0.907214,0.007233,0.503055,0.047301,-0.053948,-0.056331,0.002512,-0.030207,,,,,



Best VQC noise configs:


Unnamed: 0,shots,pflip,pdepol,f1
0,256,0.0,0.0,0.777691
1,256,0.01,0.0,0.776803



Best QSVM noise configs:


Unnamed: 0,shots,pflip,pdepol,anchors,S_NOISE,f1
0,256,0.01,0.0,96.0,16.0,0.921235


# Cell 2 — LaTeX table + richer figures (ROC, PR, CM, bars, noise heatmaps)

In [1]:
# --- LaTeX export (richer columns if available) ---
# Only metrics actually present in the summary table are exported.
latex_cols = [c for c in ["acc","prec","rec","f1","auc","pr_auc","specificity","balanced_acc","mcc"] if c in test_all.columns]
latex = test_all[["model"]+latex_cols].rename(columns={
    "acc":"Accuracy","prec":"Precision","rec":"Recall","f1":"F1","auc":"ROC-AUC",
    "pr_auc":"PR-AUC","specificity":"Specificity","balanced_acc":"Balanced-Acc","mcc":"MCC"
}).set_index("model")
# Explicit UTF-8 so the file does not depend on the platform default encoding;
# the summary markdown in this notebook is written the same way.
(TABLES/"summary_test.tex").write_text(
    latex.to_latex(float_format="%.3f", escape=True), encoding="utf-8"
)
print("Wrote:", TABLES/"summary_test.tex")

# --- ROC & PR curves from cache (robust to different lengths) ---
from sklearn.metrics import precision_recall_curve, average_precision_score

def _plot_family(items, y_true, out_png, kind="roc"):
    """Draw one ROC or precision-recall figure with a curve per model and save it.

    Args:
        items: Iterable of ``(label, scores)`` pairs; the scores are passed
            together with `y_true` to the sklearn curve/score functions.
        y_true: Ground-truth labels shared by every entry in `items`.
        out_png: Destination path of the saved figure.
        kind: ``"roc"`` draws ROC curves; any other value draws
            precision-recall curves.

    A model whose curve cannot be computed is reported on stdout and skipped.
    """
    is_roc = kind == "roc"
    fig, ax = plt.subplots(figsize=(6,5))
    for label, probs in items:
        try:
            if is_roc:
                fpr, tpr, _ = roc_curve(y_true, probs); auc = roc_auc_score(y_true, probs)
                ax.plot(fpr, tpr, label=f"{label} (AUC={auc:.3f})")
            else:
                prec, rec, _ = precision_recall_curve(y_true, probs); ap = average_precision_score(y_true, probs)
                ax.plot(rec, prec, label=f"{label} (AP={ap:.3f})")
        except Exception as e:
            print(f"{kind.upper()} for {label} skipped: {e}")
    if is_roc:
        # Diagonal reference line of a random classifier.
        ax.plot([0,1],[0,1],"--", lw=1)
    ax.set_xlabel("False Positive Rate" if is_roc else "Recall")
    ax.set_ylabel("True Positive Rate" if is_roc else "Precision")
    ax.set_title(f"{kind.upper()} (test)"); ax.legend()
    fig.tight_layout(); fig.savefig(out_png, dpi=200); plt.show()
    # pyplot keeps a reference to every figure until it is closed explicitly.
    plt.close(fig)

y_test_path = ROC_DIR / "y_test.npy"
if not (ROC_DIR.exists() and y_test_path.exists()):
    print("No ROC cache directory or y_test.npy.")
else:
    y_test_full = np.load(y_test_path)
    # Models scored against the full-length test labels.
    full_length = [("probs_svm_kmer.npy","SVM_kmer"), ("probs_svm_onehot.npy","SVM_onehot"), ("probs_vqc.npy","VQC")]
    items_full = [
        (label, np.load(ROC_DIR / fname))
        for fname, label in full_length
        if (ROC_DIR / fname).exists()
    ]

    if not items_full:
        print("No full-length ROC/PR cache found for classical/VQC.")
    else:
        _plot_family(items_full, y_test_full, FIGS/"roc_test.png", kind="roc")
        _plot_family(items_full, y_test_full, FIGS/"pr_test.png",  kind="pr")

    # QSVM is plotted separately against its own aligned label file.
    qsvm_probs_path = ROC_DIR / "probs_qsvm_kernel.npy"
    qsvm_labels_path = ROC_DIR / "y_test_qsvm.npy"
    if qsvm_probs_path.exists() and qsvm_labels_path.exists():
        y_q = np.load(qsvm_labels_path)
        probs_q = np.load(qsvm_probs_path)
        for curve_kind in ("roc", "pr"):
            _plot_family([("QSVM_kernel", probs_q)], y_q, FIGS/f"{curve_kind}_test_qsvm.png", kind=curve_kind)

# --- Confusion matrices (use aligned labels when available) ---
if CM_DIR.exists() and (CM_DIR/"y_true.json").exists():
    import json as _json
    # (file stem, display label, optional model-specific label file)
    cm_specs = [
        ("svm_kmer","SVM_kmer",None),
        ("svm_onehot","SVM_onehot",None),
        ("vqc","VQC",None),
        ("qsvm_kernel","QSVM_kernel","y_true_qsvm.json"),
    ]
    try:
        with open(CM_DIR/"y_true.json") as f:
            y_true_full = np.array(_json.load(f))
    except Exception as e:
        print("CM render skipped:", e)
        cm_specs = []  # without the shared labels nothing can be drawn
    for name, label, ytrue_override in cm_specs:
        yp = CM_DIR / f"y_pred_{name}.json"
        if not yp.exists():
            continue
        # Each model is rendered independently, so a broken file for one model
        # does not suppress the matrices of the models after it.
        try:
            with open(yp) as f:
                y_pred = np.array(_json.load(f))
            y_true_use = y_true_full
            if isinstance(ytrue_override, str) and (CM_DIR/ytrue_override).exists():
                with open(CM_DIR/ytrue_override) as g:
                    y_true_use = np.array(_json.load(g))
            cm = confusion_matrix(y_true_use, y_pred)
            fig, ax = plt.subplots(figsize=(3.8,3.3))
            im = ax.imshow(cm)
            ax.set_title(f"Confusion Matrix — {label} (test)")
            ax.set_xlabel("Predicted"); ax.set_ylabel("True")
            for (i,j), v in np.ndenumerate(cm):
                # No cmap is passed to imshow, so the default applies, where low
                # values are dark: white text goes on the low end, black on the high end.
                ax.text(j, i, int(v), ha="center", va="center",
                        color=("white" if im.norm(v) < 0.5 else "black"))
            fig.colorbar(im, ax=ax, shrink=0.8)
            fig.tight_layout(); fig.savefig(FIGS/f"cm_{name}.png", dpi=200); plt.show()
            plt.close(fig)
        except Exception as e:
            print(f"CM render skipped for {label}:", e)

# --- Bar chart for F1 ---
if not test_all.empty:
    f1_scores = test_all["f1"].values
    fig, ax = plt.subplots(figsize=(7,4))
    ax.bar(test_all["model"], test_all["f1"])
    ax.set_ylim(0, 1)
    ax.set_ylabel("F1 (test)")
    ax.set_title("Test F1 by model")
    # Print each score just above its bar.
    for bar_pos, score in enumerate(f1_scores):
        ax.text(bar_pos, score + 0.02, f"{score:.2f}", ha="center", va="bottom")
    plt.xticks(rotation=15, ha="right")
    plt.tight_layout()
    fig.savefig(FIGS/"bar_f1_test.png", dpi=200)
    plt.show()

# --- Heatmaps for noise sweeps (if available) ---
def heatmap_from_pivot(df, name):
    """Save one F1 heatmap (p_flip rows by p_depol columns) per shot count.

    Only test-split rows are used. Each cell shows the mean F1 of the rows
    sharing that (pflip, pdepol) pair; combinations without rows stay blank.
    Figures are written to ``FIGS/heat_<name>_shots<shots>.png``.

    Args:
        df: Noise-sweep results with the columns split, shots, pflip, pdepol, f1.
        name: Model label used in messages, titles and file names.
    """
    if df.empty:
        print(f"No data for {name} heatmap."); return
    missing = [c for c in ["split","shots","pflip","pdepol","f1"] if c not in df.columns]
    if missing:
        print(f"Cannot draw {name} heatmap; missing columns: {missing}"); return
    # Case-insensitive split match, consistent with pick_test.
    is_test = df["split"].astype(str).str.lower() == "test"
    t = df[is_test][["shots","pflip","pdepol","f1"]].copy()
    if t.empty:
        print(f"No test split in {name}."); return
    for sh in sorted(t["shots"].unique()):
        sub = t[t["shots"]==sh].copy()
        cols = sorted(sub["pdepol"].unique()); rows = sorted(sub["pflip"].unique())
        M = np.full((len(rows), len(cols)), np.nan)
        for i, p_flip in enumerate(rows):
            # The column variable is deliberately not called `pd`, which would
            # shadow the pandas alias inside this function.
            for j, p_depol in enumerate(cols):
                vals = sub[(sub["pflip"]==p_flip) & (sub["pdepol"]==p_depol)]["f1"].values
                if len(vals): M[i,j] = np.mean(vals)
        fig, ax = plt.subplots(figsize=(5.4,4.2))
        im = ax.imshow(M, aspect="auto", origin="upper")
        ax.set_xticks(range(len(cols))); ax.set_xticklabels([f"{c:.3f}" for c in cols])
        ax.set_yticks(range(len(rows))); ax.set_yticklabels([f"{r:.3f}" for r in rows])
        ax.set_xlabel("p_depol"); ax.set_ylabel("p_flip"); ax.set_title(f"{name} F1 — shots={sh}")
        for (i,j), v in np.ndenumerate(M):
            if not np.isnan(v):
                # No cmap is passed to imshow, so the default applies, where low
                # values are dark: white text on the low end, black on the high end.
                ax.text(j, i, f"{v:.2f}", ha="center", va="center",
                        color=("white" if im.norm(v) < 0.5 else "black"))
        fig.colorbar(im, ax=ax, shrink=0.8)
        fig.tight_layout(); fig.savefig(FIGS/f"heat_{name}_shots{sh}.png", dpi=200); plt.show()
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

# One set of heatmaps per sweep; the second argument becomes part of the file name.
heatmap_from_pivot(ns_vqc,  "VQC")
heatmap_from_pivot(ns_qsvm, "QSVM")
print("Figures ready in:", FIGS)

NameError: name 'test_all' is not defined

# Cell 3 — executive summary & markdown (adds limitations/problems)

In [None]:
# Executive summary (Markdown + file)
# Markdown lines are accumulated here and joined with newlines when written.
lines = []
lines.append("# Results Summary\n")

if not test_all.empty:
    # test_all is sorted by F1 (descending), so row 0 is the best model.
    best = test_all.iloc[0]
    base_txt = ""
    if "SVM_kmer" in test_all["model"].values:
        base = test_all.set_index("model").loc["SVM_kmer"]
        best_delta = best["f1"] - base["f1"] if not np.isnan(base["f1"]) else np.nan
        base_txt = f" vs SVM_kmer ΔF1={best_delta:+.3f}" if not np.isnan(best_delta) else ""
    lines.append(f"- **Best test F1:** {best['f1']:.3f} ({best['model']}){base_txt}; "
                 f"Acc={best['acc']:.3f}, Prec={best['prec']:.3f}, Rec={best['rec']:.3f}, "
                 f"ROC-AUC={best['auc']:.3f}, PR-AUC={best.get('pr_auc',np.nan):.3f}\n")
else:
    lines.append("- Metrics not found; run training/evaluation notebooks first.\n")

# Dataset/Encoding
if enc_meta:
    lines.append("## Dataset/Encoding\n")
    lines.append(f"- Accession: `{enc_meta.get('accession','?')}`")
    lines.append(f"- Window: {enc_meta.get('window','?')}  |  Stride: {enc_meta.get('stride','?')}")
    lines.append(f"- Total samples: {enc_meta.get('n_samples','?')}\n")

# Noise & Shots — Top configs
if not ns_vqc.empty or not ns_qsvm.empty:
    lines.append("## Noise & Shots — Top configs")
    if 'best_vqc' in globals() and not best_vqc.empty:
        lines.append("**VQC (by mean test F1):**")
        for r in best_vqc.itertuples():
            lines.append(f"- shots={r.shots}, pflip={r.pflip}, pdepol={r.pdepol} → F1={r.f1:.3f}")
    if 'best_qsvm' in globals() and not best_qsvm.empty:
        lines.append("**QSVM (by mean test F1):**")
        for r in best_qsvm.itertuples():
            # anchors / S_NOISE only exist for some sweeps.
            extra = []
            if hasattr(r, "anchors"): extra.append(f"anchors={r.anchors}")
            if hasattr(r, "S_NOISE"): extra.append(f"S_NOISE={r.S_NOISE}")
            lines.append(f"- shots={r.shots}, pflip={r.pflip}, pdepol={r.pdepol}"
                         + (", " + ", ".join(extra) if extra else "")
                         + f" → F1={r.f1:.3f}")

# Limitations & Problems from journals. Three cases are kept apart, as in the
# DOCX report: no journal at all, a journal without problems, and actual problems.
if journal.empty:
    lines.append("\n## Limitations & Problems\n- No journal logs found.")
else:
    if "status" in journal.columns:
        issues = journal[journal["status"].isin(["warn","fail"])]
    else:
        issues = journal.iloc[0:0]  # no status information -> nothing to list
    lines.append("\n## Limitations & Problems (from run journals)")
    if issues.empty:
        lines.append("- No warnings or failures recorded in journals.")
    for _, r in issues.iterrows():
        lines.append(f"- [{r['status'].upper()}:{r.get('step','?')}] {r.get('message','')} (source: {r.get('source','?')})")

# Artifacts
lines.append("\n## Artifacts")
lines.append("- `results/metrics/summary_test.csv`")
lines.append("- `results/figures/roc_test.png`, `pr_test.png`, `bar_f1_test.png`, `cm_*.png`, `heat_*_shots*.png`")
lines.append("- `results/tables/summary_test.tex`")
lines.append("- `results/report/DNA_QML_Results_Report.docx` / `.pdf`")

(REPORT / "SUMMARY.md").write_text("\n".join(lines), encoding="utf-8")
print("Wrote:", REPORT/"SUMMARY.md")
# Read back with the encoding used for writing; the text contains non-ASCII
# characters (Δ, →, —) that a different platform default would garble.
print((REPORT/"SUMMARY.md").read_text(encoding="utf-8")[:800], "...")

# Cell 4 — prep artifacts for document builders

In [None]:
# Gather figure paths safely (if exist)
def _maybe(p): 
    p = Path(p)
    return str(p) if p.exists() else None

# Caption -> figure path; the value is None when the figure was not produced,
# and the document builders skip such entries.
ART = {
    "ROC (test)":               _maybe(FIGS/"roc_test.png"),
    "PR (test)":                _maybe(FIGS/"pr_test.png"),
    "F1 bar (test)":            _maybe(FIGS/"bar_f1_test.png"),
    "Confusion (svm_kmer)":     _maybe(FIGS/"cm_svm_kmer.png"),
    "Confusion (svm_onehot)":   _maybe(FIGS/"cm_svm_onehot.png"),
    "Confusion (qsvm_kernel)":  _maybe(FIGS/"cm_qsvm_kernel.png"),
    "Confusion (vqc)":          _maybe(FIGS/"cm_vqc.png"),
}

# plus any heatmaps generated; sorted so the figure order in the reports does
# not depend on the order in which the filesystem lists the files
for p in sorted(FIGS.glob("heat_*_shots*.png")):
    ART[p.stem] = str(p)

print("Figures collected:", len([v for v in ART.values() if v]))
list(ART.items())[:5]

# Cell 5 — safe imports + helpers for DOCX/PDF (adds richer tables)

In [None]:
# Safe import & helpers
def ensure_imports():
    """Import python-docx and reportlab without raising.

    If python-docx cannot be imported, one ``pip install python-docx`` is
    attempted through the current interpreter and the import is retried.

    Returns:
        ``(DocxDocument, Inches, Pt, WD_ALIGN_PARAGRAPH, reportlab_ok)``. The
        first four are None when python-docx is unavailable; ``reportlab_ok``
        is False when any reportlab import failed.

    Side effect: every local name bound when the function finishes (the
    imported classes and helpers included) is copied into module globals.
    """
    DocxDocument = Inches = Pt = WD_ALIGN_PARAGRAPH = None
    reportlab_ok = True
    try:
        from docx import Document as DocxDocument
        from docx.shared import Inches, Pt
        from docx.enum.text import WD_ALIGN_PARAGRAPH
        from docx.enum.section import WD_ORIENT
        from docx.oxml.ns import qn
        from docx.oxml import OxmlElement
    except Exception as e:
        # First import failed: try to install the package, then import again.
        try:
            import subprocess, sys as _sys
            subprocess.check_call([_sys.executable, "-m", "pip", "install", "python-docx"])
            from docx import Document as DocxDocument
            from docx.shared import Inches, Pt
            from docx.enum.text import WD_ALIGN_PARAGRAPH
            from docx.enum.section import WD_ORIENT
            from docx.oxml.ns import qn
            from docx.oxml import OxmlElement
        except Exception as ee:
            print("! python-docx not available:", ee)
            # Reset in case the retry bound only some of the names.
            DocxDocument = Inches = Pt = WD_ALIGN_PARAGRAPH = None

    try:
        from reportlab.lib.pagesizes import A4, landscape
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage, Table, TableStyle, PageBreak
        from reportlab.lib.styles import getSampleStyleSheet
        from reportlab.lib.units import inch
        from reportlab.lib import colors
    except Exception:
        reportlab_ok = False

    # Publishes all locals (imports, flags, helper modules) as module globals.
    globals().update(locals())
    return DocxDocument, Inches, Pt, WD_ALIGN_PARAGRAPH, reportlab_ok

# The report cell checks DocxDocument for None and REPORTLAB_OK to pick the PDF backend.
DocxDocument, Inches, Pt, WD_ALIGN_PARAGRAPH, REPORTLAB_OK = ensure_imports()

def _sys_info():
    """Describe the runtime: Python, platform and key library versions.

    Returns a dict mapping a display label to a version/description string,
    in the order the entries should be printed.
    """
    import numpy, pandas, sklearn
    info = {}
    info["Python"] = sys.version.split()[0]
    info["Platform"] = f"{platform.system()} {platform.release()} ({platform.machine()})"
    info["NumPy"] = numpy.__version__
    info["Pandas"] = pandas.__version__
    info["Scikit-learn"] = sklearn.__version__
    info["Matplotlib"] = plt.matplotlib.__version__
    return info

def _fmt_secs(sec):
    return f"{sec/60:.1f} min" if isinstance(sec,(int,float)) and sec >= 60 else f"{sec:.1f} s" if isinstance(sec,(int,float)) else "-"

def df_to_docx_table(doc, df, style="Light List Accent 1"):
    """Append `df` to `doc` as a table with a header row and return the table.

    Floats are rendered with three decimals; every other value via ``str``.

    Args:
        doc: Document object providing ``add_table(rows=..., cols=...)``.
        df: Frame to render; its column labels become the header row.
        style: Table style name assigned to the new table.
    """
    cols = list(df.columns)
    tbl = doc.add_table(rows=1, cols=len(cols))
    for cell, col in zip(tbl.rows[0].cells, cols):
        cell.text = str(col)
    # itertuples keeps each column's own dtype. iterrows would upcast a row that
    # mixes ints and floats to float and print e.g. shots=256 as "256.000".
    for record in df.itertuples(index=False, name=None):
        cells = tbl.add_row().cells
        for cell, v in zip(cells, record):
            cell.text = f"{v:.3f}" if isinstance(v, (float, np.floating)) else str(v)
    tbl.style = style
    return tbl

def add_toc_field(doc):
    """Append a paragraph holding a Word table-of-contents field (levels 1-3).

    Only the field is inserted; its entries are produced by Word when the
    field is refreshed (References → Update Table). If the field cannot be
    built, a plain placeholder paragraph is added instead.
    """
    try:
        from docx.oxml import OxmlElement
        from docx.oxml.ns import qn
        p = doc.add_paragraph()
        fld = OxmlElement('w:fldSimple')
        fld.set(qn('w:instr'), 'TOC \\o "1-3" \\h \\z \\u')
        # w:fldSimple is paragraph-level content in WordprocessingML, so it is
        # attached to the paragraph element rather than nested inside a run.
        p._p.append(fld)
    except Exception:
        doc.add_paragraph("(TOC field could not be inserted automatically)")

def add_section_landscape(doc):
    """Start a new landscape section at the end of `doc` and return it.

    Page width and height are exchanged only while the new section is still
    taller than wide. An unconditional swap would turn the page back to
    portrait dimensions if the section already starts out landscape-sized.
    """
    from docx.enum.section import WD_ORIENT
    section = doc.add_section()
    section.orientation = WD_ORIENT.LANDSCAPE
    width, height = section.page_width, section.page_height
    if width is not None and height is not None and width < height:
        section.page_width, section.page_height = height, width
    return section

# Cell 6 — build advanced DOCX + PDF and preview

In [None]:
from datetime import datetime

# Output locations of the two generated documents.
DOCX_PATH = REPORT / "DNA_QML_Results_Report.docx"
PDF_PATH  = REPORT / "DNA_QML_Results_Report.pdf"

# Cell 5 leaves DocxDocument as None when python-docx could not be imported.
if DocxDocument is None:
    raise RuntimeError("python-docx not available. Please install python-docx and re-run this cell.")

doc = DocxDocument()

# ----- Cover -----
title = doc.add_heading("DNA QML — Results Report", level=0)
# NOTE(review): 1 is presumably WD_ALIGN_PARAGRAPH.CENTER — confirm; the named
# constant is already returned by ensure_imports().
title.alignment = 1
doc.add_paragraph(datetime.now().strftime("%Y-%m-%d %H:%M")).alignment = 1
# Environment block: one run per entry, each ending in a line break.
envp = doc.add_paragraph()
for k, v in _sys_info().items():
    envp.add_run(f"{k}: {v}\n").font.size = Pt(10)
doc.add_page_break()

# ----- TOC -----
doc.add_heading("Table of Contents", level=1)
add_toc_field(doc)
doc.add_page_break()

# ----- Methods & Data -----
doc.add_heading("Methods & Data", level=1)
doc.add_paragraph("We evaluate classical and quantum classifiers on PCA-reduced k-mer and one-hot encodings. "
                  "Quantum models use an angle-embedding circuit with ring entanglement. "
                  "Evaluation uses accuracy, precision, recall, F1, ROC-AUC, PR-AUC, specificity, balanced accuracy, and MCC.")
if enc_meta:
    doc.add_paragraph(f"Dataset: accession={enc_meta.get('accession','?')}, window={enc_meta.get('window','?')}, stride={enc_meta.get('stride','?')}, N={enc_meta.get('n_samples','?')}.")

# ----- Models -----
doc.add_heading("Models", level=2)
doc.add_paragraph("- SVM_kmer (RBF) on standardized k-mer features.\n"
                  "- SVM_onehot (RBF) on flattened one-hot features.\n"
                  "- QSVM_kernel (precomputed quantum kernel with SVC(kernel='precomputed')).\n"
                  "- VQC (variational circuit classifier with Pauli-Z expectation).")

# ----- Executive Summary -----
doc.add_heading("Executive Summary", level=1)
if not test_all.empty:
    # test_all is sorted by F1 (descending) in Cell 1, so row 0 is the best model.
    best = test_all.iloc[0]
    base_txt = ""
    if "SVM_kmer" in test_all["model"].values:
        base = test_all.set_index("model").loc["SVM_kmer"]
        if not np.isnan(base["f1"]):
            base_txt = f" (ΔF1 vs SVM_kmer: {best['f1']-base['f1']:+.3f})"
    doc.add_paragraph(f"Best model: {best['model']} with F1={best['f1']:.3f}{base_txt}; "
                      f"Acc={best['acc']:.3f}, Prec={best['prec']:.3f}, Rec={best['rec']:.3f}, "
                      f"AUC={best['auc']:.3f}, PR-AUC={best.get('pr_auc',np.nan):.3f}.")
else:
    doc.add_paragraph("Metrics not found; ensure training/evaluation notebooks were executed.")

# ----- Test Metrics Table (richer) -----
doc.add_heading("Test Metrics (one per model)", level=1)
# cols_main is reused further down by the appendix and both PDF branches.
cols_main = [c for c in ["model","acc","prec","rec","f1","auc","pr_auc","specificity","balanced_acc","mcc"] if c in test_all.columns]
df_to_docx_table(doc, test_all[cols_main])

# ----- Pairwise Statistical Comparison (McNemar) -----
if mcnemar_stats:
    doc.add_heading("Pairwise Comparison (McNemar’s test)", level=1)
    mc_cols = ["comparison","n01","n10","chi2","p_approx"]
    # One row per comparison; entries that are not dicts carry no statistics.
    mc_items = mcnemar_stats.items() if isinstance(mcnemar_stats, dict) else []
    rows = [{"comparison": k, **v} for k, v in mc_items if isinstance(v, dict)]
    # reindex tolerates result files that lack one of the expected fields: the
    # column is left blank (NaN) instead of raising KeyError.
    df_mc = pd.DataFrame(rows).reindex(columns=mc_cols)
    df_to_docx_table(doc, df_mc)
else:
    doc.add_paragraph("No McNemar statistics file found.")

# ----- Noise/Shot Top Configs -----
# Section is emitted when at least one sweep file was loaded.
if (not ns_vqc.empty) or (not ns_qsvm.empty):
    doc.add_heading("Noise & Shots — Top Configurations", level=1)
    # The globals() checks guard against Cell 1 not having been run in this session.
    if 'best_vqc' in globals() and not best_vqc.empty:
        doc.add_paragraph("VQC — top (mean test F1)", style="Intense Quote")
        df_to_docx_table(doc, best_vqc)
    if 'best_qsvm' in globals() and not best_qsvm.empty:
        doc.add_paragraph("QSVM — top (mean test F1)", style="Intense Quote")
        df_to_docx_table(doc, best_qsvm)

# ----- Figures -----
doc.add_heading("Figures", level=1)
for label, path in ART.items():
    # ART holds None for figures that were not produced.
    if not path: continue
    doc.add_paragraph(label).alignment = 1
    try:
        doc.add_picture(path, width=Inches(6))
    except Exception:
        doc.add_paragraph(f"(could not embed: {path})")
    cap = doc.add_paragraph(f"Figure: {label}")
    cap.alignment = 1

# ----- Limitations & Problems (from journals) -----
doc.add_heading("Limitations & Problems", level=1)
if not journal.empty:
    # show only WARN/FAIL, grouped by source
    jsub = journal[journal["status"].isin(["warn","fail"])].copy()
    if not jsub.empty:
        # The journal tables go into a new landscape section; the sections
        # added after this point stay in that section.
        add_section_landscape(doc)
        grp = jsub.groupby("source")
        for src, chunk in grp:
            doc.add_paragraph(f"Source: {src}", style="Intense Quote")
            df_to_docx_table(doc, chunk[["ts","status","step","message"]].reset_index(drop=True), style="Colorful List")
    else:
        doc.add_paragraph("No warnings or failures recorded.")
else:
    doc.add_paragraph("No journal logs found. If journaling was disabled, this section stays minimal.")

# ----- Timing Summary -----
if timing:
    doc.add_heading("Timing Summary", level=1)
    # Only these four blocks of the timing report are printed, in this order.
    for block_name in ["VQC","QSVM","VQC_actual","QSVM_actual"]:
        dd = timing.get(block_name, {})
        if not dd: continue
        doc.add_paragraph(block_name, style="Intense Quote")
        # Values are written as stored in the report (no unit formatting applied).
        for k, v in dd.items():
            doc.add_paragraph(f"- {k}: {v}")

# ----- Appendix: raw tables -----
doc.add_heading("Appendix — Raw Tables", level=1)
# (caption, frame, extra keyword arguments for df_to_docx_table)
appendix_tables = []
if not combined.empty:
    combined_test = pick_test(combined)
    # cols_main comes from the summary table; keep only what combined really has.
    shared_cols = [c for c in cols_main if c in combined_test.columns]
    appendix_tables.append(("Combined (test rows)", combined_test[shared_cols], {"style": "Light Shading Accent 1"}))
if not m_qsvm.empty:
    # "split" is listed first, so it leads the table whenever it exists.
    keep_cols = [c for c in ["split","acc","prec","rec","f1","auc","pr_auc","specificity","balanced_acc","mcc","thr"] if c in m_qsvm.columns]
    appendix_tables.append(("QSVM_kernel (all splits)", m_qsvm[keep_cols].head(30), {}))
if not ns_vqc.empty:
    appendix_tables.append(("Noise Sweep — VQC (sample)", ns_vqc.head(30), {}))
if not ns_qsvm.empty:
    appendix_tables.append(("Noise Sweep — QSVM (sample)", ns_qsvm.head(30), {}))

for caption, frame, table_kwargs in appendix_tables:
    # The appendix is best-effort, but a failure is reported and only costs
    # the table it happened in.
    try:
        doc.add_paragraph(caption)
        df_to_docx_table(doc, frame, **table_kwargs)
    except Exception as e:
        print(f"! appendix table skipped ({caption}): {e}")

# ----- Save DOCX -----
doc.save(str(DOCX_PATH))

# ----- PDF (ReportLab preferred; fallback to Matplotlib multipage) -----
if REPORTLAB_OK:
    from reportlab.lib.pagesizes import A4
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage, Table, TableStyle, PageBreak
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.lib.units import inch
    from reportlab.lib import colors

    styles = getSampleStyleSheet()
    h1, h2, body = styles["Title"], styles["Heading2"], styles["BodyText"]
    # Flowables are collected in order and laid out by SimpleDocTemplate at the end.
    story = []

    story.append(Paragraph("DNA QML — Results Report", h1))
    story.append(Paragraph(datetime.now().strftime("%Y-%m-%d %H:%M"), body))
    story.append(Spacer(1, 0.2*inch))

    # Summary
    story.append(Paragraph("Executive Summary", h2))
    if not test_all.empty:
        best = test_all.iloc[0]
        base_txt = ""
        if "SVM_kmer" in test_all["model"].values:
            base = test_all.set_index("model").loc["SVM_kmer"]
            if not np.isnan(base["f1"]): base_txt = f" (ΔF1 vs SVM_kmer: {best['f1']-base['f1']:+.3f})"
        story.append(Paragraph(f"Best model: {best['model']} with F1={best['f1']:.3f}{base_txt}; "
                               f"Acc={best['acc']:.3f}, Prec={best['prec']:.3f}, Rec={best['rec']:.3f}, "
                               f"AUC={best['auc']:.3f}, PR-AUC={best.get('pr_auc',np.nan):.3f}.", body))
    if enc_meta:
        story.append(Paragraph(f"Accession: {enc_meta.get('accession','?')} — Window={enc_meta.get('window','?')}, "
                               f"Stride={enc_meta.get('stride','?')}, N={enc_meta.get('n_samples','?')}.", body))
    story.append(Spacer(1, 0.15*inch))

    # Test Metrics table
    story.append(Paragraph("Test Metrics (one per model)", h2))
    data = [cols_main] + [[f"{row[c]:.3f}" if isinstance(row[c], float) else str(row[c]) for c in cols_main] for _, row in test_all.iterrows()]
    tbl = Table(data, hAlign="LEFT")
    tbl.setStyle(TableStyle([
        ("BACKGROUND", (0,0), (-1,0), colors.HexColor("#e8eef7")),
        ("GRID", (0,0), (-1,-1), 0.4, colors.grey),
        ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
        ("ALIGN", (1,1), (-1,-1), "CENTER"),
    ]))
    story.append(tbl)
    story.append(Spacer(1, 0.2*inch))

    # Timing (if any)
    if timing:
        story.append(Paragraph("Timing Summary", h2))
        for block_name in ["VQC","QSVM","VQC_actual","QSVM_actual"]:
            dd = timing.get(block_name, {})
            if not dd: continue
            story.append(Paragraph(f"<b>{block_name}</b>", body))
            for k, v in dd.items():
                story.append(Paragraph(f"{k}: {v}", body))
        story.append(Spacer(1, 0.2*inch))

    # Figures
    story.append(Paragraph("Figures", h2))
    for label, path in ART.items():
        if not path: continue
        story.append(Paragraph(label, body))
        try:
            # kind="proportional" scales the image to fit a 6in x 8in box while
            # keeping its aspect ratio. With only a width given, the height
            # would default to the image's pixel height, which distorts the
            # figure and can exceed the page frame.
            story.append(RLImage(path, width=6*inch, height=8*inch, kind="proportional"))
        except Exception:
            story.append(Paragraph(f"(could not embed: {path})", body))
        story.append(Spacer(1, 0.15*inch))

    SimpleDocTemplate(str(PDF_PATH), pagesize=A4).build(story)
else:
    # Fallback: simple multipage PDF
    from matplotlib.backends.backend_pdf import PdfPages
    with PdfPages(str(PDF_PATH)) as pdf:
        # Page 1: summary text (A4 portrait in inches)
        fig, ax = plt.subplots(figsize=(8.27, 11.69))
        ax.axis("off")
        # Separate name so the markdown `lines` list of the summary cell is not overwritten.
        pdf_lines = ["DNA QML — Results Report", "", f"Generated: {datetime.now():%Y-%m-%d %H:%M}"]
        for k, v in _sys_info().items():
            pdf_lines.append(f"{k}: {v}")
        pdf_lines.append("")
        if not test_all.empty:
            pdf_lines.append("Test Metrics (top rows):")
            pdf_lines.append(test_all[cols_main].to_string(index=False))
        ax.text(0.02, 0.98, "\n".join(pdf_lines), va="top", ha="left", fontsize=10, family="monospace")
        pdf.savefig(fig); plt.close(fig)

        # Add figures pages
        for label, path in ART.items():
            if not path: continue
            img = plt.imread(path)
            fig, ax = plt.subplots(figsize=(8.27, 11.69))
            ax.imshow(img); ax.axis("off")
            ax.set_title(label)
            pdf.savefig(fig); plt.close(fig)

print("Saved:")
print(" -", DOCX_PATH)
print(" -", PDF_PATH)

# ---- Preview in notebook ----
# Explicit import of the rich-output helpers used for the inline preview.
from IPython.display import display, Image as IPyImage
print("\nPreview — Test Metrics (DataFrame):")
display(test_all)

# Show the three headline figures inline when they were produced.
for name in ["bar_f1_test.png","roc_test.png","pr_test.png"]:
    p = FIGS/name
    if p.exists():
        display(IPyImage(filename=str(p)))