# Dataset Collector — Run All Pipelines

This notebook orchestrates every `*_pipeline_v2` directory in this repository.

* It expects to run inside the conda environment named **`Dataset_Collector`**.
* It prompts for API keys (for example `GITHUB_TOKEN`, `CHEMSPIDER_API_KEY`) when missing,
  and sets them as environment variables for the current session.
* It can optionally install each pipeline's requirements before running stages.


**Set `DEST_ROOT`** to your dataset output directory before running any pipeline stages.

In [None]:
# Run the repository validator script against the current directory.
# NOTE(review): both the script path and `--root .` are relative, and this cell
# runs before the cell that changes into the repo root — this assumes the
# kernel was started in the repository root; confirm.
!python tools/validate_repo.py --root .


In [None]:
import os
from pathlib import Path

# File name used as the marker that identifies the repository root.
NOTEBOOK_NAME = 'dataset_collector_run_all_pipelines.ipynb'


def locate_repo_root(start: Path) -> Path:
    """Find the directory containing this notebook, searching around *start*.

    Candidates are tried in this order: *start* itself, each ancestor of
    *start* from nearest to farthest, then the immediate subdirectories of
    *start* in ``iterdir()`` order. The first directory holding a file named
    ``NOTEBOOK_NAME`` wins.

    Args:
        start: Directory to begin the search from.

    Returns:
        The matching directory, or *start* unchanged when nothing matches.
    """
    # Upward pass: the starting directory followed by its ancestors.
    for candidate in (start, *start.parents):
        if (candidate / NOTEBOOK_NAME).exists():
            return candidate

    # Downward pass: one level only; fall back to the starting directory.
    matching_children = (
        entry
        for entry in start.iterdir()
        if entry.is_dir() and (entry / NOTEBOOK_NAME).exists()
    )
    return next(matching_children, start)

# Resolve the repository root from the kernel's current directory and switch
# into it so that later relative paths resolve against the repo.
repo_root = locate_repo_root(Path.cwd())
if repo_root == Path.cwd():
    # Nothing to change. locate_repo_root returns its starting directory when
    # the search fails, so confirm the notebook file is really present here.
    if not (repo_root / NOTEBOOK_NAME).exists():
        print(
            'WARNING: Could not locate notebook in current or nearby directories. '
            'If pipelines are not detected, set repo_root manually.'
        )
else:
    os.chdir(repo_root)
    print(f'Changed working directory to repo root: {repo_root}')

print(f'Working directory: {repo_root}')
# Check that the kernel is running inside the expected conda environment.
# CONDA_DEFAULT_ENV is read from the process environment; it is None when the
# variable is not set at all, which also triggers the warning below.
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env != 'Dataset_Collector':
    # The three literals are joined by implicit concatenation into ONE string.
    # Passing them as separate print() arguments would insert print's default
    # separator (a space) at the start of the second line.
    print(
        'WARNING: Expected conda env Dataset_Collector, '
        f'but CONDA_DEFAULT_ENV={conda_env!r}.\n'
        'Activate the correct env before proceeding.'
    )
else:
    print('Conda env looks correct: Dataset_Collector')


In [None]:
# Collect every immediate subdirectory of the repo root whose name ends with
# '_pipeline_v2', in sorted path order, and list them for the user.
pipeline_dirs = sorted(
    entry
    for entry in repo_root.iterdir()
    if entry.is_dir() and entry.name.endswith('_pipeline_v2')
)
print('Detected pipelines:')
for pipeline in pipeline_dirs:
    print(f'  - {pipeline.name}')


In [None]:
import getpass

def ensure_env(var_name: str, prompt: str) -> None:
    """Ensure environment variable *var_name* is set, prompting if it is not.

    If the variable already holds a non-blank value, only its length is
    reported (never the value itself). Otherwise the user is asked for a
    value via a hidden ``getpass`` prompt, and a non-blank answer is stored
    in ``os.environ`` for the current process.

    Args:
        var_name: Name of the environment variable to check or set.
        prompt: Text shown to the user; a "leave blank to skip" hint is
            appended.
    """
    current = os.environ.get(var_name, '').strip()
    if current:
        print(f'{var_name} already set (length={len(current)})')
        return
    # Strip the entered value the same way the existing value is stripped
    # above: a whitespace-only answer counts as "skip", and surrounding
    # whitespace from a pasted token is not stored.
    value = getpass.getpass(f'{prompt} (leave blank to skip): ').strip()
    if value:
        os.environ[var_name] = value
        print(f'Set {var_name} for this session.')
    else:
        print(f'Skipped {var_name}. Some pipeline sources may fail or rate-limit.')

# Environment variable name -> prompt text shown when that variable is unset.
required_env_prompts = dict(
    GITHUB_TOKEN='Enter a GitHub token for higher rate limits',
    CHEMSPIDER_API_KEY='Enter a ChemSpider API key (chemistry pipeline)',
)

# Check each variable in declaration order, prompting for the missing ones.
for var_name in required_env_prompts:
    prompt = required_env_prompts[var_name]
    ensure_env(var_name, prompt)


In [None]:
import subprocess
import sys

# Flip to True to pip-install every pipeline's requirements.txt
# (recommended on first run).
INSTALL_REQUIREMENTS = False

if not INSTALL_REQUIREMENTS:
    print('Skipping requirements install. Set INSTALL_REQUIREMENTS=True to enable.')
else:
    for pipeline in pipeline_dirs:
        requirements = pipeline / 'requirements.txt'
        if not requirements.exists():
            print(f'No requirements.txt found for {pipeline.name}, skipping.')
            continue
        print(f'Installing requirements for {pipeline.name}...')
        # Install through the interpreter running this kernel; check=True
        # raises CalledProcessError if pip exits with a non-zero status.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', '-r', str(requirements)],
            check=True,
        )


In [None]:
import subprocess
import sys
from pathlib import Path

# Windows-first runner (recommended).
# Launches tools/build_natural_corpus.py with the interpreter running this
# kernel, so no bash is involved.
DEST_ROOT = r"E:\AI-Research\datasets\Natural"  # Update to your dataset root.
EXECUTE = False  # True = downloads/writes; False = plan-only.
WORKERS = 6  # acquisition parallelism
MODE = "collect"  # collect | compile | full

repo_root = Path(repo_root)

# Run the repo validator first; check=True raises CalledProcessError on a
# non-zero exit, so the corpus build below is never reached in that case.
validator = [sys.executable, str(repo_root / "tools" / "validate_repo.py")]
print(*validator)
subprocess.run(validator, cwd=str(repo_root), check=True)

# Build the corpus command; "--execute" is only passed when EXECUTE is true.
cmd = [
    sys.executable,
    str(repo_root / "tools" / "build_natural_corpus.py"),
    "--dest-root",
    DEST_ROOT,
    "--workers",
    str(WORKERS),
    "--mode",
    MODE,
    *(["--execute"] if EXECUTE else []),
]

print(*cmd)
subprocess.run(cmd, cwd=str(repo_root), check=True)


## Manual YELLOW review (required before screening)

1. Run the **collect** stages (classify/acquire).
2. Review the YELLOW queue and approve or reject targets:
   - `python <pipeline>/review_queue.py list --queue <dataset_root>/_queues/yellow_pipeline.jsonl`
   - `python <pipeline>/review_queue.py approve --target <target_id> --manifest-dir <dataset_root>/_manifests/<target_id>`
   - `python <pipeline>/review_queue.py reject --target <target_id> --manifest-dir <dataset_root>/_manifests/<target_id>`
3. Re-run the runner to execute the **compile** stages (screen/merge/catalog) — in the main runner cell, set `MODE = "compile"`.


## Appendix: WSL/Git Bash runner (optional)

Use this only if you prefer bash-based per-pipeline execution.

In [None]:
import os
import subprocess
from pathlib import Path
import yaml

# Advanced: bash-based per-pipeline runner (WSL/Git Bash required).
# For every detected pipeline directory that ships a run_pipeline.sh, the
# script is invoked once per entry in STAGES, in order, from inside that
# pipeline's directory.
# NOTE(review): the notebook text says YELLOW targets must be manually
# reviewed before screening, yet this list runs "screen_yellow" right after
# "acquire_yellow" in a single pass — confirm this is intended.
STAGES = [
    "classify",
    "acquire_green",
    "acquire_yellow",
    "screen_yellow",
    "merge",
    "catalog",
]
EXECUTE = False  # Set True to perform writes; False = dry-run.

repo_root = Path(repo_root)

# tools/pipeline_map.yaml maps each pipeline directory name to its targets
# YAML file name (under pipelines.<name>.targets_yaml).
pipeline_map_path = repo_root / "tools" / "pipeline_map.yaml"
# safe_load returns None for an empty document; normalise to a dict so the
# lookups below cannot fail on a missing or empty map.
pipeline_map = yaml.safe_load(pipeline_map_path.read_text(encoding="utf-8")) or {}
# Use .get() so an entry without "targets_yaml" maps to None and is reported
# by the RuntimeError below instead of raising a bare KeyError here.
targets_for = {
    name: (entry or {}).get("targets_yaml")
    for name, entry in (pipeline_map.get("pipelines") or {}).items()
}

# Snapshot of the current environment (including any keys set earlier in
# this session) handed to every child process.
env = os.environ.copy()

for pipeline in pipeline_dirs:
    run_script = pipeline / "run_pipeline.sh"
    if not run_script.exists():
        print(f"Skipping {pipeline.name}: run_pipeline.sh not found.")
        continue

    targets_name = targets_for.get(pipeline.name)
    if not targets_name:
        raise RuntimeError(f"No targets_yaml entry for {pipeline.name} in tools/pipeline_map.yaml")
    targets_path = pipeline / targets_name
    if not targets_path.exists():
        raise RuntimeError(f"Targets YAML not found for {pipeline.name}: {targets_path}")

    print(f"\n=== Running {pipeline.name} ({'EXECUTE' if EXECUTE else 'DRY'}) ===")
    for stage in STAGES:
        cmd = ["bash", str(run_script), "--targets", str(targets_path), "--stage", stage]
        if EXECUTE:
            cmd.append("--execute")
        print(" ".join(cmd))
        # check=True stops the whole run at the first failing stage.
        subprocess.run(cmd, check=True, env=env, cwd=pipeline)
