# Update Script
This notebook orchestrates data downloads and analysis refreshes.

In [9]:
# ========== Bootstrap: ensure required Python packages are present ==========
import importlib, subprocess, sys

def _ensure(pkg_name: str, import_name: str | None = None):
    """
    Import `import_name` (defaults to `pkg_name`); if that fails, pip‑install.
    """
    try:
        importlib.import_module(import_name or pkg_name)
    except ModuleNotFoundError:
        print(f"Package '{pkg_name}' not found — installing …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
    finally:
        globals()[import_name or pkg_name] = importlib.import_module(import_name or pkg_name)

# --- Required third‑party libraries ------------------------------------------
# Import (installing on demand) each third-party package this notebook uses.
for _required_pkg in ("pandas", "requests"):
    _ensure(_required_pkg)
print("All dependencies ready.\n")

# --- Standard imports --------------------------------------------------------
from pathlib import Path
import datetime as dt
import os, re, shutil, json
import pandas as pd, requests, urllib.parse

# --- Helper: replace [date %Y-%m-%d] tokens -----------------------------------
def substitute_date_tokens(url: str) -> str:
    """
    Expand every ``[date <strftime format>]`` token in `url` with today's date.

    The format between ``[date`` and ``]`` is stripped of surrounding
    whitespace and passed to ``strftime``; text outside tokens is returned
    unchanged.
    """
    return re.sub(
        r"\[date\s+([^\]]+)\]",
        lambda token: dt.date.today().strftime(token.group(1).strip()),
        url,
    )

# --- Helper: append API key if specified -----------------------------------
def add_apikey(url: str, env_var: str | None) -> str:
    if env_var:
        key = os.getenv(env_var)
        if key:
            sep = '&' if '?' in url else '?'
            return f'{url}{sep}api_key={urllib.parse.quote_plus(key)}'
        else:
            print(f"Warning: environment variable '{env_var}' not set.")
    return url

# --- Cadence map (word → minimum days between fetches) ------------------------
# Minimum number of days that must pass before a source with the given cadence
# keyword is fetched again.
CADENCE_DAYS = dict(
    daily=1,
    weekly=7,
    monthly=30,
    quarterly=90,
)

# --- Resolve base directory so notebook works from repo root or data folder ---
BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'

# --- Load catalog -------------------------------------------------------------
catalog_path = BASE_DIR / 'catalog.csv'
cat = pd.read_csv(catalog_path)
cat['filetype'] = cat['filetype'].astype(str).str.strip().str.lstrip('.')
# `last_fetched` is filled with ISO date strings below. A column that is absent,
# or that loaded as all-NaN floats, cannot take strings cleanly in recent
# pandas, so make sure it exists and is an object column up front.
if 'last_fetched' not in cat.columns:
    cat['last_fetched'] = None
cat['last_fetched'] = cat['last_fetched'].astype(object)

today = dt.date.today()
today_iso = today.isoformat()
updated_rows = []                # folders whose files were rewritten this run
catalog_dirty = False            # True once any `last_fetched` cell changed


def _mark_fetched(row_idx) -> bool:
    """Stamp today's date into `last_fetched` for `row_idx`; return True if the cell changed."""
    previous = cat.at[row_idx, 'last_fetched']
    cat.at[row_idx, 'last_fetched'] = today_iso
    return bool(pd.isna(previous) or previous != today_iso)


for idx, row in cat.iterrows():
    folder = BASE_DIR / str(row['folder'])
    folder.mkdir(parents=True, exist_ok=True)
    filetype = str(row['filetype']).strip().lstrip('.')
    latest_fp = folder / f'latest.{filetype}'
    dated_fp = folder / f'{today:%Y-%m-%d}.{filetype}'
    if dated_fp.exists():
        # Today's snapshot is already on disk: record the date and move on.
        catalog_dirty |= _mark_fetched(idx)
        continue
    last_fetched = (
        pd.to_datetime(row["last_fetched"]).date()
        if pd.notna(row["last_fetched"]) else None
    )

    # ---- Determine if an update is due --------------------------------------
    cadence = str(row["cadence"]).lower().strip()
    min_age = CADENCE_DAYS.get(cadence, 30)        # unknown cadence → 30 days
    needs_update = (
        (not latest_fp.exists()) or
        (not last_fetched) or
        (today - last_fetched).days >= min_age
    )

    if not needs_update:
        print(f"Skipping {row['folder']} – up to date")
        continue

    # ---- Build the request URL ---------------------------------------------
    url = substitute_date_tokens(str(row["url"]))
    # A blank `api_key` cell is NaN, which is truthy; test it explicitly so it
    # is not turned into the literal environment-variable name "nan".
    api_key_env = row.get('api_key')
    api_key_env = str(api_key_env).strip() if pd.notna(api_key_env) else ''
    url = add_apikey(url, api_key_env or None)

    print(f"Fetching {row['folder']} …", end=" ")
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        if filetype.lower() == 'json':
            # Best effort: an unparsable body is stored as-is; only a parsed
            # object carrying `error_message` is treated as a failed fetch.
            try:
                data_json = r.json()
            except Exception:
                data_json = None
            if isinstance(data_json, dict) and data_json.get('error_message'):
                raise ValueError(data_json['error_message'])
        # ---- Save snapshot and latest --------------------------------------
        if latest_fp.exists() and latest_fp.read_bytes() == r.content:
            # Identical payload: skip writing a snapshot but remember the
            # check so the cadence clock restarts.
            catalog_dirty |= _mark_fetched(idx)
            print('no change')
            continue
        dated_fp.write_bytes(r.content)
        shutil.copyfile(dated_fp, latest_fp)

        # ---- Mark success in catalog ---------------------------------------
        _mark_fetched(idx)
        catalog_dirty = True
        updated_rows.append(row["folder"])
        print("✓ success")
    except Exception as e:
        # One failing source must not abort the remaining rows.
        print(f"✗ failed: {e}")

# --- Persist catalog if anything changed -------------------------------------
# Written whenever a `last_fetched` stamp changed, including the "no change"
# and "snapshot already exists" cases, so those sources are not re-checked on
# every run.
if catalog_dirty:
    cat.to_csv(catalog_path, index=False)
if updated_rows:
    print("\nUpdated:", ", ".join(map(str, updated_rows)))
else:
    print("Everything up to date.")


All dependencies ready.

Fetching MUNI_Citatios … ✓ success

Updated: MUNI_Citatios


In [None]:
# This cell updates the markdown index files for all the data sources

