# Update Script
This notebook orchestrates data downloads and analysis refreshes.

In [1]:
# ========== Bootstrap: ensure required Python packages are present ==========
import importlib, subprocess, sys
from typing import Optional


def _ensure(pkg_name: str, import_name: Optional[str] = None, required: bool = True):
    """Import a module, installing it if necessary. If installation fails and
    the package is required, the exception is raised. Optional packages may
    remain unavailable."""
    try:
        return importlib.import_module(import_name or pkg_name)
    except ModuleNotFoundError:
        print(f"Package '{pkg_name}' not found - installing ...")
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        except Exception as e:
            print(f"Failed to install {pkg_name}: {e}")
            if required:
                raise
    try:
        mod = importlib.import_module(import_name or pkg_name)
        globals()[import_name or pkg_name] = mod
        return mod
    except ModuleNotFoundError:
        if required:
            raise
        print(f"Package '{pkg_name}' is unavailable.")
        globals()[import_name or pkg_name] = None
        return None
# --- Required third-party libraries ------------------------------------------
_ensure("pandas")
_ensure("requests")
_ensure("feedparser")
_ensure("textblob")
_ensure("jupyter", required=False)
_ensure("nbconvert", required=False)
print("All dependencies ready.\n")

# --- Standard imports --------------------------------------------------------
from pathlib import Path
import datetime as dt
import os, re, shutil, json, feedparser, textblob
import pandas as pd, requests, urllib.parse

# --- Helper: replace [date %Y-%m-%d] tokens -----------------------------------
def substitute_date_tokens(url: str) -> str:
    def _replace(m):
        fmt = m.group(1).strip()
        return dt.date.today().strftime(fmt)
    return re.sub(r"\[date\s+([^\]]+)\]", _replace, url)

# --- Helper: append API key if specified -----------------------------------
def add_apikey(url: str, env_var: Optional[str]) -> str:
    if env_var and str(env_var).lower() != "nan":
        key = os.getenv(env_var)
        if key:
            sep = '&' if '?' in url else '?'
            return f'{url}{sep}api_key={urllib.parse.quote_plus(key)}'
        else:
            print(f"Warning: environment variable '{env_var}' not set.")
    return url

# --- Cadence map (word → minimum seconds between fetches) ------------------------
CADENCE_SECONDS = {
    "hourly": 3600,
    "daily": 86400,
    "weekly": 604800,
    "monthly": 2592000,
    "quarterly": 7776000,
}

# --- Resolve base directory so notebook works from repo root or data folder ---
BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'

# --- Load catalog -------------------------------------------------------------
catalog_path = BASE_DIR / 'catalog.csv'
cat = pd.read_csv(catalog_path)
cat['filetype'] = cat['filetype'].astype(str).str.strip().str.lstrip('.')

now = dt.datetime.now()
today = now.date()
updated_rows = []                # remember which rows we refresh

for idx, row in cat.iterrows():
    folder = BASE_DIR / str(row['category']) / str(row['source']) / str(row['folder'])
    folder.mkdir(parents=True, exist_ok=True)
    cadence = str(row['cadence']).lower().strip()
    filetype = str(row['filetype']).strip().lstrip('.')
    output_ext = 'json' if filetype.lower() in ('rss', 'xml') else filetype
    latest_fp = folder / f'latest.{output_ext}'
    url = str(row.get('url', '')).strip()
    if not url or url.lower() in ('n/a', 'na', 'none'):
        print(f"Skipping {row['folder']} (static)")
        continue
    dated_fp = folder / f"{now:%Y-%m-%d-%H}.{output_ext}" if cadence == "hourly" else folder / f"{today:%Y-%m-%d}.{output_ext}"
    if dated_fp.exists():
        if (not latest_fp.exists()) or latest_fp.read_bytes() != dated_fp.read_bytes():
            shutil.copyfile(dated_fp, latest_fp)
        cat.at[idx, 'last_fetched'] = now.isoformat(timespec='minutes')
        continue
    last_fetched = (
        pd.to_datetime(row["last_fetched"])
        if pd.notna(row["last_fetched"]) else None
    )

    # ---- Determine if an update is due --------------------------------------
    cadence = str(row["cadence"]).lower().strip()
    min_age = CADENCE_SECONDS.get(cadence, 30*86400)        # default 30 days
    needs_update = (
        (not latest_fp.exists()) or
        (not last_fetched) or
        (now - last_fetched).total_seconds() >= min_age
    )

    #if not needs_update:
        #print(f"Skipping {row['folder']} - up to date")
        #continue

    # ---- Build the request URL ---------------------------------------------
    url = substitute_date_tokens(str(row["url"]))
    url = add_apikey(url, str(row.get('api_key') or '').strip() or None)

    print(f"Fetching {row['folder']} ({row['url']})…", end=" ")
    try:
        r = requests.get(url, timeout=30, headers={'User-Agent': 'Mozilla/5.0'})
        r.raise_for_status()
        if filetype.lower() in ('rss', 'xml'):
            feed = feedparser.parse(r.content)
            entries = []
            for e in feed.entries:
                text = ' '.join(filter(None, [e.get('title'), e.get('summary')]))
                polarity = textblob.TextBlob(text).sentiment.polarity
                entries.append({'title': e.get('title'), 'link': e.get('link'),
                               'published': e.get('published'),
                               'sentiment': polarity})
            content_bytes = json.dumps({'entries': entries}, ensure_ascii=False, indent=2).encode('utf-8')
        else:
            content_bytes = r.content
        if filetype.lower() == 'json':
            try:
                data_json = r.json()
            except Exception:
                data_json = None
            if isinstance(data_json, dict) and data_json.get('error_message'):
                raise ValueError(data_json['error_message'])
        # ---- Save snapshot and latest --------------------------------------
        if latest_fp.exists() and latest_fp.read_bytes() == content_bytes:
            cat.at[idx, 'last_fetched'] = now.isoformat(timespec='minutes')
            print('no change')
            continue
        dated_fp.write_bytes(content_bytes)
        shutil.copyfile(dated_fp, latest_fp)

        # ---- Mark success in catalog ---------------------------------------
        cat.at[idx, "last_fetched"] = now.isoformat(timespec='minutes')
        updated_rows.append(row["folder"])
        print("✓ success")
    except Exception as e:
        print(f"✗ failed: {e}")

# --- Persist catalog if anything changed -------------------------------------
if updated_rows:
    cat.to_csv(catalog_path, index=False)
    print("\nUpdated:", ", ".join(updated_rows))
else:
    print("Everything up to date.")

    

Package 'pandas' not found - installing ...


Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)


Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m201.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.3.1-cp313-cp313-manylinux_2_28_x86_64.whl (16.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/16.6 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m132.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)


Installing collected packages: pytz, tzdata, numpy, pandas
[?25l[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/4[0m [tzdata]

[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy][2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy]

[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy][2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy]

[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy][2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy]

[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy][2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy]

[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy][2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m2/4[0m [numpy]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3/4[0m [pandas]

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]
[?25h[1A[2KSuccessfully installed numpy-2.3.1 pandas-2.3.1 pytz-2025.2 tzdata-2025.2


Package 'feedparser' not found - installing ...


Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Installing build dependencies: started


  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started


  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)


Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (pyproject.toml): started


  Building wheel for sgmllib3k (pyproject.toml): finished with status 'done'
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6089 sha256=a6d69e3ebc1c33a078edbae3d21a8323022bf31d70aebf3bdb7f6481a19771e7
  Stored in directory: /home/runner/.cache/pip/wheels/3d/4d/ef/37cdccc18d6fd7e0dd7817dcdf9146d4d6789c32a227a28134
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
[?25l[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [feedparser]
[?25h[1A[2KSuccessfully installed feedparser-6.0.11 sgmllib3k-1.0.0


Package 'textblob' not found - installing ...


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk>=3.9->textblob)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk>=3.9->textblob)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)


Collecting regex>=2021.8.3 (from nltk>=3.9->textblob)
  Downloading regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk>=3.9->textblob)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/624.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

Installing collected packages: tqdm, regex, joblib, click, nltk, textblob
[?25l[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/6[0m [joblib]

[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/6[0m [joblib][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4/6[0m [nltk]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4/6[0m [nltk][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4/6[0m [nltk]

[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4/6[0m [nltk][2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4/6[0m [nltk]

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6/6[0m [textblob]
[?25h[1A[2KSuccessfully installed click-8.2.1 joblib-1.5.1 nltk-3.9.1 regex-2024.11.6 textblob-0.19.0 tqdm-4.67.1


All dependencies ready.

Fetching GDPC1 (https://api.stlouisfed.org/fred/series/observations?series_id=GDPC1&file_type=json&observation_end=[date %Y-%m-%d])… 

✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=GDPC1&file_type=json&observation_end=2025-07-16
Fetching A939RX0Q048SBEA (https://api.stlouisfed.org/fred/series/observations?series_id=A939RX0Q048SBEA&file_type=json&observation_end=[date %Y-%m-%d])… ✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=A939RX0Q048SBEA&file_type=json&observation_end=2025-07-16
Fetching M2REAL (https://api.stlouisfed.org/fred/series/observations?series_id=M2REAL&file_type=json&observation_end=[date %Y-%m-%d])… 

✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=M2REAL&file_type=json&observation_end=2025-07-16
Fetching UNRATE (https://api.stlouisfed.org/fred/series/observations?series_id=UNRATE&file_type=json&observation_end=[date %Y-%m-%d])… ✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=UNRATE&file_type=json&observation_end=2025-07-16
Fetching CLVMNACSCAB1GQDE (https://api.stlouisfed.org/fred/series/observations?series_id=CLVMNACSCAB1GQDE&file_type=json&observation_end=[date %Y-%m-%d])… 

✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=CLVMNACSCAB1GQDE&file_type=json&observation_end=2025-07-16
Fetching GFDEBTN (https://api.stlouisfed.org/fred/series/observations?series_id=GFDEBTN&file_type=json&observation_end=[date %Y-%m-%d])… ✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=GFDEBTN&file_type=json&observation_end=2025-07-16
Fetching GFDEGDQ188S (https://api.stlouisfed.org/fred/series/observations?series_id=GFDEGDQ188S&file_type=json&observation_end=[date %Y-%m-%d])… 

✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=GFDEGDQ188S&file_type=json&observation_end=2025-07-16
Fetching TDSP (https://api.stlouisfed.org/fred/series/observations?series_id=TDSP&file_type=json&observation_end=[date %Y-%m-%d])… ✗ failed: 400 Client Error: Bad Request for url: https://api.stlouisfed.org/fred/series/observations?series_id=TDSP&file_type=json&observation_end=2025-07-16
Fetching news-us-nyt (https://rss.nytimes.com/services/xml/rss/nyt/US.xml)… 

✓ success
Fetching news-world-nyt (https://rss.nytimes.com/services/xml/rss/nyt/World.xml)… ✓ success
Fetching news-africa-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Africa.xml)… 

✓ success
Fetching news-europe-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Europe.xml)… 

✓ success
Fetching news-asia-nyt (https://rss.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml)… 

✓ success
Fetching news-americas-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Americas.xml)… 

✓ success
Fetching news-middle-east-nyt (https://rss.nytimes.com/services/xml/rss/nyt/MiddleEast.xml)… 

✓ success
Fetching news-business-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Business.xml)… ✓ success
Fetching news-economy-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Economy.xml)… no change
Fetching news-us-politics-nyt (https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml)… ✓ success
Fetching news-world-wsj (https://feeds.content.dowjones.io/public/rss/RSSWorldNews)… 

✓ success
Fetching news-us-wsj (https://feeds.content.dowjones.io/public/rss/RSSUSNews)… no change
Fetching news-business-wsj (https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness)… ✓ success
Fetching news-markets-wsj (https://feeds.content.dowjones.io/public/rss/RSSMarketsMain)… 

✓ success
Fetching news-economy-wsj (https://feeds.content.dowjones.io/public/rss/socialeconomyfeed)… ✓ success
Fetching news-us-politics-wsj (https://feeds.content.dowjones.io/public/rss/socialpoliticsfeed)… 

✓ success
Fetching news-us-politics-wapo (https://www.washingtonpost.com/arcio/rss/category/politics/)… 

✗ failed: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=30)
Fetching news-us-wapo (http://feeds.washingtonpost.com/rss/national)… 

✓ success
Fetching news-world-wapo (https://feeds.washingtonpost.com/rss/world)… 

no change
Fetching news-business-wapo (http://feeds.washingtonpost.com/rss/business)… 

no change
Fetching latimes-business (https://www.latimes.com/business/rss2.0.xml)… 

✓ success
Fetching latimes-us (https://www.latimes.com/local/rss2.0.xml)… ✓ success
Fetching latimes-us-politics (https://www.latimes.com/politics/rss2.0.xml)… 

✓ success
Fetching news-world-chi-tribune (https://www.chicagotribune.com/arc/outboundfeeds/rss/section/nation-world/&sort=display_date:desc)… 

no change
Fetching news-business-chi-tribune (https://www.chicagotribune.com/arc/outboundfeeds/rss/section/business/&sort=display_date:desc)… 

no change
Fetching news-us-politics-chi-tribune (https://www.chicagotribune.com/arc/outboundfeeds/rss/section/politics/&sort=display_date:desc/)… 

no change
Fetching news-us-business-startribune (https://www.startribune.com/rss?sf=1&s=%2F)… ✓ success
Fetching news-us-politics-startribune (https://www.startribune.com/politics/index.rss2)… 

✗ failed: 404 Client Error: Not Found for url: https://www.startribune.com/politics/index.rss2
Fetching news-us-nypost (https://nypost.com/us-news/feed/)… 

✓ success
Fetching news-world-nypost (https://nypost.com/world-news/feed/)… 

✓ success
Fetching news-us-politics-nypost (https://nypost.com/politics/feed/)… 

✓ success
Fetching news-business-nypost (https://nypost.com/business/feed/)… 

✓ success
Fetching news-world-toi (http://timesofindia.indiatimes.com/rssfeeds/296589292.cms)… ✓ success
Fetching news-business-toi (http://timesofindia.indiatimes.com/rssfeeds/1898055.cms)… ✓ success
Fetching news-us-toi (https://timesofindia.indiatimes.com/rssfeeds_us/72258322.cms)… 

no change
Fetching news-middle-east-toi (http://timesofindia.indiatimes.com/rssfeeds/1898272.cms)… ✓ success
Fetching news-europe-toi (http://timesofindia.indiatimes.com/rssfeeds/1898274.cms)… 

no change
Fetching news-world-cbc (https://www.cbc.ca/webfeed/rss/rss-world)… ✓ success
Fetching news-politics-cbc (https://www.cbc.ca/webfeed/rss/rss-politics)… 

✓ success
Fetching news-africa-bbc (http://feeds.bbci.co.uk/news/world/africa/rss.xml)… 

✓ success
Fetching news-asia-bbc (http://feeds.bbci.co.uk/news/world/asia/rss.xml)… 

no change
Fetching news-europe-bbc (http://feeds.bbci.co.uk/news/world/europe/rss.xml)… no change
Fetching news-latin-america-bbc (http://feeds.bbci.co.uk/news/world/latin_america/rss.xml)… 

✓ success
Fetching news-middle-east-bbc (http://feeds.bbci.co.uk/news/world/middle_east/rss.xml)… 

no change
Fetching news-us-bbc (http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml)… ✓ success
Fetching news-world-bbc (https://feeds.bbci.co.uk/news/world/rss.xml)… 

✓ success
Fetching news-business-bbc (https://feeds.bbci.co.uk/news/business/rss.xml)… 

✓ success
Fetching news-politics-bbc (https://feeds.bbci.co.uk/news/politics/rss.xml)… 

✓ success
Fetching news-top-dw (https://rss.dw.com/rdf/rss-en-all)… 

✓ success
Fetching news-europe-dw (https://rss.dw.com/rdf/rss-en-eu)… no change
Fetching news-world-dw (https://rss.dw.com/rdf/rss-en-world)… 

✓ success
Fetching news-business-dw (https://rss.dw.com/rdf/rss-en-bus)… no change
Fetching news-asia-dw (https://rss.dw.com/rdf/rss-en-asia)… 

✓ success
Fetching zip-demo-ca (nan)… ✗ failed: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?

Updated: news-us-nyt, news-world-nyt, news-africa-nyt, news-europe-nyt, news-asia-nyt, news-americas-nyt, news-middle-east-nyt, news-business-nyt, news-us-politics-nyt, news-world-wsj, news-business-wsj, news-markets-wsj, news-economy-wsj, news-us-politics-wsj, news-us-wapo, latimes-business, latimes-us, latimes-us-politics, news-us-business-startribune, news-us-nypost, news-world-nypost, news-us-politics-nypost, news-business-nypost, news-world-toi, news-business-toi, news-middle-east-toi, news-world-cbc, news-politics-cbc, news-africa-bbc, news-latin-america-bbc, news-us-bbc, news-world-bbc, news-business-bbc, news-politics-bbc, news-top-dw, news-world-dw, news-asia-dw


In [2]:
# This cell updates the markdown index files for all the data sources
from pathlib import Path
import csv
import re

BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'
with open(BASE_DIR / 'catalog.csv', newline='') as f:
    cat = list(csv.DictReader(f))

for row in cat:
    folder = BASE_DIR / row['category'] / row['source'] / row['folder']
    folder.mkdir(parents=True, exist_ok=True)
    filetype = row['filetype'].strip().lstrip('.')
    output_ext = 'json' if filetype.lower() in ('rss', 'xml') else filetype
    desc = row['description'].strip()
    source = row['source'].strip()
    date = row.get('last_fetched', '').strip()

    pattern = re.compile(r"\d{4}-\d{2}-\d{2}(?:-\d{2})?\." + re.escape(output_ext) + r"$")
    dated_files = sorted(p.name for p in folder.iterdir() if pattern.match(p.name))

    lines = [
        '---',
        'layout: default',
        f'title: {source} - {desc}',
        f'date: {date}',
        '---',
        '',
        f'## {source} - {desc}',
        '',
        '<div id="data-chart"></div>',
        '<div id="data-table"></div>',
    ]

    if row['source'] == 'fred' and filetype == 'json':
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            "  ShowChart($('#data-chart'));",
            "  SourceTabler($('#data-table'));",
            "});",
            '</script>',
        ]
    else:
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            "  document.getElementById('data-table').textContent = 'This source isn\'t supported for tables yet.';",
            "});",
            '</script>',
        ]

    lines += [
        '',
        '## File Versions:',
    ]
    links = [f'[Latest version](./latest.{output_ext})'] + [f'[{fname}](./{fname})' for fname in dated_files]
    for i, link in enumerate(links, 1):
        lines.append(f'{i}. {link}')
    (folder / 'index.md').write_text("\n".join(lines) + "\n")

print('Index files generated for', ', '.join(r['folder'] for r in cat))


Index files generated for GDPC1, A939RX0Q048SBEA, M2REAL, UNRATE, CLVMNACSCAB1GQDE, GFDEBTN, GFDEGDQ188S, TDSP, news-us-nyt, news-world-nyt, news-africa-nyt, news-europe-nyt, news-asia-nyt, news-americas-nyt, news-middle-east-nyt, news-business-nyt, news-economy-nyt, news-us-politics-nyt, news-world-wsj, news-us-wsj, news-business-wsj, news-markets-wsj, news-economy-wsj, news-us-politics-wsj, news-us-politics-wapo, news-us-wapo, news-world-wapo, news-business-wapo, latimes-business, latimes-us, latimes-us-politics, news-world-chi-tribune, news-business-chi-tribune, news-us-politics-chi-tribune, news-us-business-startribune, news-us-politics-startribune, news-us-nypost, news-world-nypost, news-us-politics-nypost, news-business-nypost, news-world-toi, news-business-toi, news-us-toi, news-middle-east-toi, news-europe-toi, news-world-cbc, news-politics-cbc, news-africa-bbc, news-asia-bbc, news-europe-bbc, news-latin-america-bbc, news-middle-east-bbc, news-us-bbc, news-world-bbc, news-busin

In [3]:

# Update outdated notebooks until none remain
import json, re, time, subprocess, sys
from pathlib import Path

repo_dir = Path.cwd()
if not (repo_dir / 'analysis').is_dir():
    repo_dir = repo_dir.parent
analysis_dir = repo_dir / 'analysis'
data_dir = repo_dir / 'data'

pattern = re.compile(r'[A-Za-z0-9_/.-]*latest\.(?:csv|json|xml|rss)')

def mtime_str(p: Path) -> str:
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(p.stat().st_mtime))

def build_dep_map():
    ipynb_paths = sorted(analysis_dir.rglob('*.ipynb'))
    dep_map = {}
    for nb in ipynb_paths:
        text = nb.read_text()
        matches = sorted(set(pattern.findall(text)))
        deps = []
        for m in matches:
            dep = (nb.parent / m).resolve()
            if not dep.exists():
                dep = (repo_dir / m.lstrip('./')).resolve()
            if dep.exists():
                deps.append(dep)
        dep_map[nb] = deps
    return dep_map

def outdated(nb, deps):
    nb_mtime = nb.stat().st_mtime
    return any(d.stat().st_mtime > nb_mtime for d in deps)


def execute(nb: Path):
    import shutil
    rel = str(nb.relative_to(repo_dir))
    if not shutil.which('jupyter'):
        print(f'jupyter not available - skipping {rel}')
        return
    print(f'Running {rel} …')
    cmd=[sys.executable,'-m','jupyter','nbconvert','--to','notebook','--inplace','--execute','--ExecutePreprocessor.timeout=600','--debug',str(nb)]
    print('  Command:', ' '.join(cmd))
    proc=subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    print(proc.stdout)
    if proc.returncode==0:
        print('  ✓ success')
    else:
        print(f'  ✗ failed with exit code {proc.returncode}')
while True:
    dep_map=build_dep_map()
    outdated_nbs=[nb for nb,deps in dep_map.items() if deps and outdated(nb,deps)]
    report={'outdated_notebooks':[str(nb.relative_to(repo_dir)) for nb in outdated_nbs]}
    deps_path = repo_dir / 'dependencies.json'
    deps_path.write_text(
        json.dumps(report, indent=2) + "\n"
    )
    if not outdated_nbs:
        print('Everything up to date.')
        break
    for nb in outdated_nbs:
        execute(nb)


Running analysis/headlines/.ipynb_checkpoints/update_headlines-checkpoint.ipynb …
  Command: /opt/hostedtoolcache/Python/3.13.5/x64/bin/python -m jupyter nbconvert --to notebook --inplace --execute --ExecutePreprocessor.timeout=600 --debug /home/runner/work/Analysis/Analysis/analysis/headlines/.ipynb_checkpoints/update_headlines-checkpoint.ipynb


[NbConvertApp] Searching ['/home/runner/.jupyter', '/home/runner/.local/etc/jupyter', '/opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter', '/usr/local/etc/jupyter', '/etc/jupyter'] for config files
[NbConvertApp] Looking for jupyter_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.jupyter
[NbConvertApp] Loop

[NbConvertApp] Searching ['/home/runner/.jupyter', '/home/runner/.local/etc/jupyter', '/opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter', '/usr/local/etc/jupyter', '/etc/jupyter'] for config files
[NbConvertApp] Looking for jupyter_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.jupyter
[NbConvertApp] Loop

[NbConvertApp] Searching ['/home/runner/.jupyter', '/home/runner/.local/etc/jupyter', '/opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter', '/usr/local/etc/jupyter', '/etc/jupyter'] for config files
[NbConvertApp] Looking for jupyter_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_config in /home/runner/.jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /usr/local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /opt/hostedtoolcache/Python/3.13.5/x64/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.local/etc/jupyter
[NbConvertApp] Looking for jupyter_nbconvert_config in /home/runner/.jupyter
[NbConvertApp] Loop