# 07_appendix_reproducibility.ipynb — environment/seeds/audit

# Cell 0 — perf env

In [7]:
# Load reproducibility artifacts (dataset audit, environment, seeds) paths
from pathlib import Path
import json, textwrap, platform, sys, importlib, datetime as dt
base = Path('results/appendix')
audit_path = base / 'dataset_audit.json'
seeds_path = base / 'seeds.json'
env_path = base / 'environment.json'
audit_path, seeds_path, env_path

(WindowsPath('results/appendix/dataset_audit.json'),
 WindowsPath('results/appendix/seeds.json'),
 WindowsPath('results/appendix/environment.json'))

# Cell 1 — environment snapshot

In [8]:
# Helper to pretty print JSON with controlled width for markdown embedding
def show_json(path, width=92):
    with open(path) as f:
        data = json.load(f)
    txt = json.dumps(data, indent=2, sort_keys=True)
    # Wrap only very long lines (rare) for readability in narrow renderers
    wrapped = []
    for line in txt.splitlines():
        if len(line) > width:
            wrapped.extend(textwrap.wrap(line, width=width, subsequent_indent='  '))
        else:
            wrapped.append(line)
    return '\n'.join(wrapped)

print('Audit snippet:')
print('\n'.join(show_json(audit_path).splitlines()[:12]))

Audit snippet:
{
  "n_samples": 1490,
  "negatives(RNA)": 115,
  "positives(CDS)": 1375,
  "splits": {
    "test": 298,
    "train": 894,
    "val": 298
  },
  "stride": 128,
  "window_length": 256
}


# Cell 2 — dataset audit

In [9]:
# Display full dataset audit JSON (structured metadata about samples / splits)
show_json(audit_path)

'{\n  "n_samples": 1490,\n  "negatives(RNA)": 115,\n  "positives(CDS)": 1375,\n  "splits": {\n    "test": 298,\n    "train": 894,\n    "val": 298\n  },\n  "stride": 128,\n  "window_length": 256\n}'

# Cell 3 — seeds & README

In [10]:
# Inspect environment capture: Python version, platform, installed key packages
with open(env_path) as f: env = json.load(f)
print('Captured timestamp:', env.get('timestamp'))
print('Platform:', env.get('platform'))
print('Python:', env.get('python_version'))
print('\nPackages (subset):')
for name, ver in list(env.get('packages', {}).items())[:18]:
    print(f"  {name}=={ver}")

Captured timestamp: None
Platform: Windows 10
Python: None

Packages (subset):
  numpy==2.1.3
  pandas==2.3.1
  scikit_learn==1.7.1
  biopython==1.85
  matplotlib==3.10.0
  pennylane==0.42.1


# seed register (document the ones you used)

In [11]:
# Show seed configuration used across experiments ensuring reproducible splits / model init
with open(seeds_path) as f:
    seeds = json.load(f)
seeds

{'split_train_test': 42,
 'split_train_val': 43,
 'pca_kernels': 7,
 'pca_vqc': 11,
 'noise_sweeps': 17,
 'analysis': 23}

# write an Appendix README (markdown)

In [12]:
# Programmatic environment verification: dynamically import recorded packages and report versions
missing = []
version_mismatch = []
recorded = env.get('packages', {})
for pkg, rec_ver in list(recorded.items())[:40]:  # limit to first 40 to keep output manageable
    try:
        mod = importlib.import_module(pkg)
        got_ver = getattr(mod, '__version__', 'unknown')
        if rec_ver and got_ver != rec_ver:
            version_mismatch.append((pkg, rec_ver, got_ver))
    except Exception:
        missing.append(pkg)

print('Missing packages:', missing if missing else 'None')
print('Version mismatches:')
for pkg, exp, got in version_mismatch[:15]:
    print(f'  {pkg}: recorded={exp} current={got}')
print('\nChecked', min(40, len(recorded)), 'packages (truncated).')

Missing packages: ['scikit_learn', 'biopython']
Version mismatches:

Checked 6 packages (truncated).
