# Run Synthetic Dataset Pipeline
This notebook is a thin runner around the command-line orchestrator in scripts/run.py.

It only wires up parameters, invokes the pipeline, and inspects the latest run directory; all business logic lives in the engine and scripts modules.

In [None]:
# Notebook parameters: an optional pinned run id and the dataset to process
run_id = None  # leave as None, or pin a version string like "2025-01-01T00-00-00Z"
dataset = "finance_transactions"  # must match a directory in ../datasets/

In [None]:
# Run scripts/run.py in a child interpreter and echo its exit status and captured output
import subprocess
import sys
from pathlib import Path

# The project root is the parent of the current working directory.
project_root = Path("..").resolve()
script = project_root.joinpath("scripts", "run.py")

# --run-id is passed through only when a run id has been configured.
run_id_args = [] if run_id is None else ["--run-id", str(run_id)]
cmd = [sys.executable, str(script), "--dataset", dataset, *run_id_args]

result = subprocess.run(
    cmd,
    cwd=project_root,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    check=False,  # a non-zero exit is reported below rather than raised
)
print("Return code:", result.returncode)
print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)

In [None]:
# Report which expected artifact files exist in the newest run directory of the dataset
import json
import os

# Files whose presence is checked inside the run directory.
expected_files = (
    "configs_snapshot.json",
    "run_metadata.json",
    "validation_report.json",
    "evaluation_report.json",
    "final_metadata.json",
)

runs_root = project_root / "runs" / dataset
if not runs_root.exists():
    print("No runs directory found for dataset", dataset)
else:
    # Directory names are sorted as strings; the last one is treated as the latest run.
    versions = sorted(entry.name for entry in runs_root.iterdir() if entry.is_dir())
    if not versions:
        print("No run versions found for dataset", dataset)
    else:
        latest = versions[-1]
        latest_dir = runs_root / latest
        print("Latest run dir:", latest_dir)
        for fname in expected_files:
            path = latest_dir / fname
            status = "OK" if path.exists() else "MISSING"
            print(f"{fname}:", status)