# 08_appendix_reproducibility.ipynb — environment/seeds/audit

# Cell 0 — perf env

In [1]:
# Perf env (same settings as the other notebooks): default each thread-count variable to 8 unless it is already set
import os
# Give each thread-count variable a default of "8"; setdefault leaves any value
# that is already present in the environment untouched.
for _thread_var in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"):
    os.environ.setdefault(_thread_var, "8")
# Kept as the final expression so the cell still echoes the effective value.
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")

'8'

# Cell 1 — environment snapshot

In [2]:
from pathlib import Path
import json, platform, sys, warnings, numpy as np, pandas as pd

# Install a blanket "ignore" filter for all warnings.
warnings.filterwarnings("ignore")

# Project layout, relative to the notebook's working directory.
ROOT = Path(".")
PROCESSED = ROOT / "data/processed"
RESULTS = ROOT / "results"
APPX = RESULTS / "appendix"

# Create results/appendix plus any missing parents; no error if it already exists.
APPX.mkdir(parents=True, exist_ok=True)

def pkg_version(pkg):
    """Return the ``__version__`` of an importable module, best effort.

    Args:
        pkg: Import name of the module (e.g. ``"sklearn"``). A dotted name
            resolves to the named submodule rather than its top-level package.

    Returns:
        The module's ``__version__`` attribute; ``"unknown"`` when the module
        imports but defines no ``__version__``; ``"not_installed"`` when
        importing it (or reading the attribute) raises any exception.
    """
    # Function-scope import so the helper does not rely on the cell's import line.
    import importlib

    try:
        # import_module returns the module that was actually named, whereas
        # __import__("a.b") hands back the top-level package "a".
        mod = importlib.import_module(pkg)
        return getattr(mod, "__version__", "unknown")
    except Exception:
        # Deliberately broad: a missing or broken package must not abort the
        # environment snapshot.
        return "not_installed"

# (report key, import name) pairs; list order fixes the key order in the JSON file.
_tracked = [
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("scikit_learn", "sklearn"),
    ("biopython", "Bio"),
    ("matplotlib", "matplotlib"),
    ("pennylane", "pennylane"),
]

# Snapshot of the interpreter, OS and key package versions.
env = dict(
    python=sys.version.split()[0],  # first token of sys.version, e.g. "3.10.13"
    platform=f"{platform.system()} {platform.release()}",
    packages={label: pkg_version(module) for label, module in _tracked},
    notes="Notebook-only workflow; seeds and splits persisted in data/processed.",
)

# Persist the snapshot, then leave `env` as the last expression so the cell displays it.
with open(APPX / "environment.json", "w") as f:
    json.dump(env, f, indent=2)
env

{'python': '3.10.13',
 'platform': 'Windows 10',
 'packages': {'numpy': '2.1.3',
  'pandas': '2.3.1',
  'scikit_learn': '1.7.1',
  'biopython': '1.85',
  'matplotlib': '3.10.0',
  'pennylane': '0.42.1'},
 'notes': 'Notebook-only workflow; seeds and splits persisted in data/processed.'}

# Cell 2 — dataset audit

In [3]:
# Dataset audit: label counts, window configuration and split sizes.
#
# np.load on an .npz returns an NpzFile that keeps the archive open, so everything
# the audit needs is read inside a `with` block and the handle is closed afterwards.
# NOTE(review): allow_pickle=True lets the archive execute pickled code on load; it is
# kept from the original — drop it if encodings.npz contains no object arrays.
with np.load(PROCESSED/"encodings.npz", allow_pickle=True) as enc:
    y = enc["y"].astype(int)
    W = int(enc["window"])
    # "stride" is treated as optional; None is recorded when the archive lacks it.
    STRIDE = int(enc["stride"]) if "stride" in enc else None

# JSON is read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(PROCESSED/"splits.json", encoding="utf-8") as f:
    SPL = json.load(f)

audit = {
    "n_samples": int(len(y)),
    "positives(CDS)": int(y.sum()),       # sum of labels — assumes y is 0/1; TODO confirm
    "negatives(RNA)": int((y == 0).sum()),
    "window_length": W,
    "stride": STRIDE,
    "splits": {k: len(v) for k, v in SPL.items()},  # entry count per split name
}
with open(APPX/"dataset_audit.json", "w") as f:
    json.dump(audit, f, indent=2)

# One-row table for display; the nested "splits" dict is shown as a single cell.
pd.DataFrame([audit])

Unnamed: 0,n_samples,positives(CDS),negatives(RNA),window_length,stride,splits
0,1490,1375,115,256,128,"{'train': 894, 'val': 298, 'test': 298}"


# Cell 3 — seeds & README

In [4]:
# Seeds recorded for the notebook series, persisted for auditing.
# NOTE: the later "seed register" and "Appendix README" cells write seeds.json and
# README.md again, so the files left on disk after Run All come from those cells.
seeds = {"split_train_test": 42, "split_train_val": 43, "pca_kernels": 7, "pca_vqc": 11, "noise_sweeps": 17, "analysis": 23}
with open(APPX/"seeds.json", "w") as f:
    json.dump(seeds, f, indent=2)
# (The bare `seeds` expression that used to sit here was dropped: only the last
# expression of a cell is displayed, so it had no effect.)

# README pieces are joined with "\n"; an entry's own trailing "\n" therefore leaves a
# blank line before the next entry.
md = [
    "# Appendix — Reproducibility Bundle\n",
    "This folder contains lightweight artifacts to aid auditing and re-running.\n",
    "## Files\n- `environment.json`\n- `dataset_audit.json`\n- `seeds.json`\n",
    "## How to Reproduce\n",
    "1. Run notebooks in order: 01 → 02 → 03 → 04 → 05 → 06 → 07 → 08.\n",
    "2. Ensure `data/processed/encodings.npz` and `splits.json` exist after Step 01.\n",
    "3. Metrics and figures appear under `results/metrics/` and `results/figures/`.\n",
]
(APPX/"README.md").write_text("\n".join(md), encoding="utf-8")
print("Wrote:", APPX/"README.md")

Wrote: results\appendix\README.md


# Seed register — record every random seed used across the notebooks

In [5]:
# Seed register: extend this mapping whenever another notebook introduces a new seed.
seeds = dict(
    split_train_test=42,
    split_train_val=43,
    pca_kernels=7,
    pca_vqc=11,
    noise_sweeps=17,
    analysis=23,
)

# Persist the register, then echo it as the cell's output.
with (APPX / "seeds.json").open("w") as f:
    json.dump(seeds, f, indent=2)
seeds

{'split_train_test': 42,
 'split_train_val': 43,
 'pca_kernels': 7,
 'pca_vqc': 11,
 'noise_sweeps': 17,
 'analysis': 23}

# write an Appendix README (markdown)

In [6]:
# Assemble the appendix README. Pieces are joined with "\n"; leading/trailing "\n"
# inside a piece add the blank lines around headings and numbered steps.
md = [
    "# Appendix — Reproducibility Bundle\n",
    "This folder contains lightweight, human-readable artifacts to aid auditing and re-running.",
    "\n## Files\n",
    "- `environment.json` — Python/platform and key package versions.",
    "- `dataset_audit.json` — counts, window config, and split sizes.",
    "- `seeds.json` — random seeds used across notebooks.",
    "\n## How to Reproduce (quick)\n",
    "1. Recreate the Conda env; open notebooks in order: `01_data_preparation → 02_classical_baselines → 03_quantum_kernel → 04_quantum_vqc → 05_noise_robustness → 06_benchmark_analysis → 07_reporting → 08_appendix_reproducibility`.\n",
    "2. Ensure `data/processed/encodings.npz` and `splits.json` are present from Step 01.\n",
    "3. Metrics and figures will appear under `results/metrics/` and `results/figures/`.\n",
]

readme_path = APPX/"README.md"
readme_path.write_text("\n".join(md), encoding="utf-8")
print("Wrote:", readme_path)
# Read back with the same encoding it was written in; the platform default
# (e.g. cp1252 on Windows) would mis-decode the "—" and "→" characters.
print(readme_path.read_text(encoding="utf-8")[:400], "...")

Wrote: results\appendix\README.md
# Appendix — Reproducibility Bundle

This folder contains lightweight, human-readable artifacts to aid auditing and re-running.

## Files

- `environment.json` — Python/platform and key package versions.
- `dataset_audit.json` — counts, window config, and split sizes.
- `seeds.json` — random seeds used across notebooks.

## How to Reproduce (quick)

1. Recreate the Conda env; open notebooks in ord ...
