# Dataset Collector — Run All Pipelines

This notebook orchestrates every `*_pipeline_v2` directory in this repository.

* It expects to run inside the conda environment named **`Dataset_Collector`**.
* It prompts for API keys (for example `GITHUB_TOKEN`, `CHEMSPIDER_API_KEY`) when missing,
  and sets them as environment variables for the current session.
* It can optionally install each pipeline's requirements before running stages.


In [None]:
import os
from pathlib import Path

# File name used as the marker for the repository root directory.
NOTEBOOK_NAME = 'dataset_collector_run_all_pipelines.ipynb'


def locate_repo_root(start: Path) -> Path:
    """Return the directory near *start* that contains ``NOTEBOOK_NAME``.

    Search order: *start* itself, then each ancestor from nearest to
    farthest, then the immediate sub-directories of *start*. If no
    candidate contains the notebook, *start* is returned unchanged.
    """
    def holds_notebook(directory: Path) -> bool:
        return (directory / NOTEBOOK_NAME).exists()

    # The start directory and its ancestors take priority over children.
    for candidate in (start, *start.parents):
        if holds_notebook(candidate):
            return candidate

    # Only direct children are inspected; the first match wins.
    child_matches = (
        entry
        for entry in start.iterdir()
        if entry.is_dir() and holds_notebook(entry)
    )
    return next(child_matches, start)

# Find the repository root and make it the working directory, so the cells
# below operate on paths relative to a known location.
repo_root = locate_repo_root(Path.cwd())
if repo_root != Path.cwd():
    os.chdir(repo_root)
    print(f'Changed working directory to repo root: {repo_root}')
elif not (repo_root / NOTEBOOK_NAME).exists():
    # locate_repo_root returns its start directory when nothing matched, so
    # an unchanged directory that lacks the notebook means the search failed.
    print(
        'WARNING: Could not locate notebook in current or nearby directories. '
        'If pipelines are not detected, set repo_root manually.'
    )

print(f'Working directory: {repo_root}')

# Compare the active conda environment name against the expected one.
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env != 'Dataset_Collector':
    # The adjacent literals form ONE string argument. Passing them as
    # separate arguments would make print() insert its separator (a space)
    # right after the newline and indent the second line.
    print(
        'WARNING: Expected conda env Dataset_Collector, '
        f'but CONDA_DEFAULT_ENV={conda_env!r}.\n'
        'Activate the correct env before proceeding.'
    )
else:
    print('Conda env looks correct: Dataset_Collector')


In [None]:
# Collect every immediate sub-directory of repo_root whose name ends in
# '_pipeline_v2', sorted so the listing and run order are stable.
pipeline_dirs = sorted(
    entry
    for entry in repo_root.iterdir()
    if entry.is_dir() and entry.name.endswith('_pipeline_v2')
)
print('Detected pipelines:')
for pipeline in pipeline_dirs:
    print(f'  - {pipeline.name}')
if not pipeline_dirs:
    # Say so explicitly rather than leaving a bare header with no entries.
    print(f'  (none found under {repo_root})')


In [None]:
import os

# When True, add the execute bits to any run_pipeline.sh that lacks them.
FIX_PERMISSIONS = False  # Set True to chmod +x missing run_pipeline.sh permissions.

# Sort each pipeline into "script absent" or "script present but not
# executable"; pipelines with an executable script need no action.
missing_scripts = []
non_executable = []
for pipeline_dir in pipeline_dirs:
    candidate = pipeline_dir / 'run_pipeline.sh'
    if candidate.exists():
        if not os.access(candidate, os.X_OK):
            non_executable.append(candidate)
    else:
        missing_scripts.append(pipeline_dir.name)

# Report pipelines that have no run script at all.
if not missing_scripts:
    print('All pipelines have run_pipeline.sh.')
else:
    print('Pipelines missing run_pipeline.sh:', ', '.join(missing_scripts))

# Report (and optionally repair) scripts that are not executable.
if not non_executable:
    print('All run_pipeline.sh scripts are executable.')
else:
    print('Non-executable run_pipeline.sh found:')
    print('\n'.join(f'  - {path}' for path in non_executable))
    if not FIX_PERMISSIONS:
        print('Set FIX_PERMISSIONS=True to chmod +x.')
    else:
        for path in non_executable:
            # OR-in the execute bits for user, group and other.
            path.chmod(path.stat().st_mode | 0o111)
        print('Updated permissions for run_pipeline.sh scripts.')


In [None]:
import getpass

def ensure_env(var_name: str, prompt: str) -> None:
    """Make sure *var_name* is set in ``os.environ``, prompting if it is not.

    If the variable already holds a non-blank value, only its length is
    reported (never the value itself) and nothing is changed. Otherwise the
    user is asked for a value via ``getpass`` so the input is not echoed.
    A blank or whitespace-only answer leaves the variable unset.

    Args:
        var_name: Name of the environment variable to check or set.
        prompt: Text shown to the user when asking for the value.
    """
    current = os.environ.get(var_name, '').strip()
    if current:
        print(f'{var_name} already set (length={len(current)})')
        return
    # Strip the answer so it is judged the same way as an existing value:
    # whitespace-only input counts as "skip", and stray whitespace around a
    # pasted key is not stored.
    value = getpass.getpass(f'{prompt} (leave blank to skip): ').strip()
    if value:
        os.environ[var_name] = value
        print(f'Set {var_name} for this session.')
    else:
        print(f'Skipped {var_name}. Some pipeline sources may fail or rate-limit.')

# Environment variables to check, each mapped to the prompt text shown when
# that variable is not already set.
required_env_prompts = dict(
    GITHUB_TOKEN='Enter a GitHub token for higher rate limits',
    CHEMSPIDER_API_KEY='Enter a ChemSpider API key (chemistry pipeline)',
)

# Check the variables in declaration order, prompting for any that are unset.
for env_name, prompt_text in required_env_prompts.items():
    ensure_env(env_name, prompt_text)


In [None]:
import subprocess
import sys

# Opt-in switch: dependency installation is skipped unless this is True.
INSTALL_REQUIREMENTS = False  # Set to True to install all pipeline requirements.

if not INSTALL_REQUIREMENTS:
    print('Skipping requirements install. Set INSTALL_REQUIREMENTS=True to enable.')
else:
    for pipeline_dir in pipeline_dirs:
        req_file = pipeline_dir / 'requirements.txt'
        if not req_file.exists():
            print(f'No requirements.txt found for {pipeline_dir.name}, skipping.')
            continue
        print(f'Installing requirements for {pipeline_dir.name}...')
        # Run pip through the current interpreter so packages are installed
        # for the Python that is executing this notebook; check=True stops
        # on the first failed install.
        pip_command = [sys.executable, '-m', 'pip', 'install', '-r', str(req_file)]
        subprocess.run(pip_command, check=True)


In [None]:
import os
import subprocess

# Stages passed to each pipeline's run_pipeline.sh, in this order.
STAGES = [
    'classify',
    'acquire_green',
    'acquire_yellow',
    'screen_yellow',
    'merge',
    'difficulty',
    'catalog',
]
EXECUTE = False  # Set True to perform writes; False = dry-run.

mode_flag = '--execute' if EXECUTE else '--dry-run'
# Snapshot of the current environment, handed to every child process.
env = os.environ.copy()

for pipeline in pipeline_dirs:
    # Anchor the paths to the current working directory before launching.
    # The child runs with cwd set to the pipeline directory, so a relative
    # script path (e.g. after setting repo_root manually to a relative path)
    # would be looked up relative to that new cwd and not be found.
    # abspath does not follow symlinks, so the script keeps its own location.
    pipeline_root = Path(os.path.abspath(pipeline))
    run_script = pipeline_root / 'run_pipeline.sh'
    if not run_script.exists():
        print(f'Skipping {pipeline.name}: run_pipeline.sh not found.')
        continue
    print(f'\n=== Running {pipeline.name} ({mode_flag}) ===')
    for stage in STAGES:
        cmd = ['bash', str(run_script), '--stage', stage, mode_flag]
        print(' '.join(cmd))
        # check=True aborts the whole run on the first failing stage.
        subprocess.run(cmd, check=True, env=env, cwd=pipeline_root)
