# Update Script
This notebook orchestrates data downloads and analysis refreshes.

In [2]:
# ========== Bootstrap: ensure required Python packages are present ==========
import importlib, subprocess, sys
from typing import Optional


def _ensure(pkg_name: str, import_name: Optional[str] = None, required: bool = True):
    """Import a module, installing it with pip first if it is missing.

    The imported module is also bound in this module's globals under
    ``import_name or pkg_name`` so later code can refer to it by bare name.
    The binding happens on every path, including when the package was
    already installed and no pip run was needed.

    Args:
        pkg_name: Name passed to ``pip install``.
        import_name: Name passed to ``importlib.import_module``; defaults to
            ``pkg_name`` when omitted.
        required: When true, installation/import failures propagate. When
            false, the global name is bound to ``None`` and ``None`` is
            returned instead.

    Returns:
        The imported module, or ``None`` for an unavailable optional package.

    Raises:
        Exception: whatever the pip subprocess raised, if ``required``.
        ModuleNotFoundError: if the module is still missing after the
            install attempt and ``required`` is true.
    """
    module_name = import_name or pkg_name
    try:
        mod = importlib.import_module(module_name)
    except ModuleNotFoundError:
        print(f"Package '{pkg_name}' not found - installing ...")
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        except Exception as e:
            print(f"Failed to install {pkg_name}: {e}")
            if required:
                raise
        # The import system caches directory listings; refresh them so a
        # package installed a moment ago by the subprocess can be found.
        importlib.invalidate_caches()
        try:
            mod = importlib.import_module(module_name)
        except ModuleNotFoundError:
            if required:
                raise
            print(f"Package '{pkg_name}' is unavailable.")
            mod = None
    globals()[module_name] = mod
    return mod
# --- Third-party libraries (required first, then optional) --------------------
# The returned modules are bound explicitly: feedparser and textblob are used
# further down in this cell by bare name and have no import statement of their
# own, so they must be defined here whether or not an install was needed.
pandas = _ensure("pandas")
requests = _ensure("requests")
feedparser = _ensure("feedparser")
textblob = _ensure("textblob")
# Optional tooling: these names are None when the package cannot be installed.
jupyter = _ensure("jupyter", required=False)
nbconvert = _ensure("nbconvert", required=False)
print("All dependencies ready.\n")

# --- Standard imports --------------------------------------------------------
from pathlib import Path
import datetime as dt
import os, re, shutil, json
import pandas as pd, requests, urllib.parse

# --- Helper: replace [date %Y-%m-%d] tokens -----------------------------------
def substitute_date_tokens(url: str) -> str:
    """Expand every ``[date <strftime-format>]`` token in *url*.

    Each token is replaced by today's local date rendered with the format
    found inside the brackets (surrounding whitespace stripped). Text without
    tokens is returned unchanged.
    """
    token_regex = r"\[date\s+([^\]]+)\]"
    return re.sub(
        token_regex,
        lambda match: dt.date.today().strftime(match.group(1).strip()),
        url,
    )

# --- Helper: append API key if specified -----------------------------------
def add_apikey(url: str, env_var: Optional[str]) -> str:
    """Return *url* with an ``api_key`` query parameter read from the environment.

    *env_var* names the environment variable holding the key. A falsy name,
    or the literal text "nan" in any letter case, means "no key" and the URL
    is returned untouched. If the variable is named but unset or empty, a
    warning is printed and the URL is returned untouched.
    """
    if not env_var or str(env_var).lower() == "nan":
        return url

    secret = os.getenv(env_var)
    if not secret:
        print(f"Warning: environment variable '{env_var}' not set.")
        return url

    # Extend an existing query string with '&', otherwise start one with '?'.
    joiner = '?' if '?' not in url else '&'
    return f'{url}{joiner}api_key={urllib.parse.quote_plus(secret)}'

# --- Cadence map (word → minimum seconds between fetches) ------------------------
# Seconds are spelled out as products so each interval can be checked at a
# glance: "monthly" is 30 days and "quarterly" is 90 days.
CADENCE_SECONDS = dict(
    hourly=60 * 60,
    daily=24 * 60 * 60,
    weekly=7 * 24 * 60 * 60,
    monthly=30 * 24 * 60 * 60,
    quarterly=90 * 24 * 60 * 60,
)

# --- Resolve base directory so notebook works from repo root or data folder ---
BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'

# --- Load catalog -------------------------------------------------------------
catalog_path = BASE_DIR / 'catalog.csv'
cat = pd.read_csv(catalog_path)
cat['filetype'] = cat['filetype'].astype(str).str.strip().str.lstrip('.')
# An empty `last_fetched` column is parsed as float (all NaN) and a brand-new
# catalog may not have the column at all. Make it an object column up front so
# ISO timestamp strings can be stored in it without a dtype conflict.
if 'last_fetched' not in cat.columns:
    cat['last_fetched'] = None
cat['last_fetched'] = cat['last_fetched'].astype(object)

now = dt.datetime.now()
today = now.date()
stamp = now.isoformat(timespec='minutes')   # value written to last_fetched
updated_rows = []                # remember which rows we refresh
catalog_dirty = False            # True once any last_fetched value was changed

for idx, row in cat.iterrows():
    folder = BASE_DIR / str(row['category']) / str(row['source']) / str(row['folder'])
    folder.mkdir(parents=True, exist_ok=True)
    cadence = str(row['cadence']).lower().strip()
    filetype = str(row['filetype']).strip().lstrip('.')
    # Feeds are parsed and stored as JSON; everything else keeps its own type.
    is_feed = filetype.lower() in ('rss', 'xml')
    output_ext = 'json' if is_feed else filetype
    latest_fp = folder / f'latest.{output_ext}'
    # Hourly sources get one snapshot per hour, all others one per day.
    if cadence == "hourly":
        dated_fp = folder / f"{now:%Y-%m-%d-%H}.{output_ext}"
    else:
        dated_fp = folder / f"{today:%Y-%m-%d}.{output_ext}"

    # ---- Snapshot for this period already on disk: just sync `latest` -------
    if dated_fp.exists():
        if (not latest_fp.exists()) or latest_fp.read_bytes() != dated_fp.read_bytes():
            shutil.copyfile(dated_fp, latest_fp)
        cat.at[idx, 'last_fetched'] = stamp
        catalog_dirty = True
        continue

    last_fetched = (
        pd.to_datetime(row["last_fetched"])
        if pd.notna(row["last_fetched"]) else None
    )

    # ---- Determine if an update is due --------------------------------------
    min_age = CADENCE_SECONDS.get(cadence, 30*86400)        # default 30 days
    needs_update = (
        (not latest_fp.exists()) or
        (last_fetched is None) or
        (now - last_fetched).total_seconds() >= min_age
    )

    if not needs_update:
        print(f"Skipping {row['folder']} - up to date")
        continue

    # ---- Build the request URL ---------------------------------------------
    url = substitute_date_tokens(str(row["url"]))
    url = add_apikey(url, str(row.get('api_key') or '').strip() or None)

    print(f"Fetching {row['folder']} …", end=" ")
    try:
        r = requests.get(url, timeout=30, headers={'User-Agent': 'Mozilla/5.0'})
        r.raise_for_status()
        if is_feed:
            # Reduce the feed to title/link/published plus a sentiment score
            # computed from the title and summary text.
            feed = feedparser.parse(r.content)
            entries = []
            for e in feed.entries:
                text = ' '.join(filter(None, [e.get('title'), e.get('summary')]))
                polarity = textblob.TextBlob(text).sentiment.polarity
                entries.append({'title': e.get('title'), 'link': e.get('link'),
                               'published': e.get('published'),
                               'sentiment': polarity})
            content_bytes = json.dumps({'entries': entries}, ensure_ascii=False, indent=2).encode('utf-8')
        else:
            content_bytes = r.content
        if filetype.lower() == 'json':
            # A successful HTTP response can still carry an `error_message`
            # field in its JSON body; treat that as a failed fetch.
            try:
                data_json = r.json()
            except Exception:
                data_json = None
            if isinstance(data_json, dict) and data_json.get('error_message'):
                raise ValueError(data_json['error_message'])
        # ---- Save snapshot and latest --------------------------------------
        if latest_fp.exists() and latest_fp.read_bytes() == content_bytes:
            cat.at[idx, 'last_fetched'] = stamp
            catalog_dirty = True
            print('no change')
            continue
        dated_fp.write_bytes(content_bytes)
        shutil.copyfile(dated_fp, latest_fp)

        # ---- Mark success in catalog ---------------------------------------
        cat.at[idx, "last_fetched"] = stamp
        catalog_dirty = True
        updated_rows.append(str(row["folder"]))
        print("✓ success")
    except Exception as e:
        # Per-row boundary: one failing source must not stop the others.
        print(f"✗ failed: {e}")

# --- Persist catalog if anything changed -------------------------------------
# Write whenever a timestamp moved, not only when new content was saved.
# Otherwise "no change" results are forgotten and re-fetched on every run.
if catalog_dirty:
    cat.to_csv(catalog_path, index=False)
if updated_rows:
    print("\nUpdated:", ", ".join(updated_rows))
else:
    print("Everything up to date.")


All dependencies ready.

Skipping GDPC1 – up to date
Skipping A939RX0Q048SBEA – up to date
Skipping M2REAL – up to date
Skipping UNRATE – up to date
Skipping CLVMNACSCAB1GQDE – up to date
Skipping GFDEBTN – up to date
Skipping GFDEGDQ188S – up to date
Skipping TDSP – up to date
Fetching news-us-nyt … ✓ success
Fetching news-world-nyt … ✓ success
Fetching news-africa-nyt … ✓ success
Fetching news-europe-nyt … ✓ success
Fetching news-asia-nyt … ✓ success
Fetching news-americas-nyt … ✓ success
Fetching news-middle-east-nyt … ✓ success
Fetching news-business-nyt … ✓ success
Fetching news-economy-nyt … no change
Fetching news-us-politics-nyt … ✓ success
Fetching news-world-wsj … ✓ success
Fetching news-us-wsj … ✓ success
Fetching news-business-wsj … ✓ success
Fetching news-markets-wsj … ✓ success
Fetching news-economy-wsj … ✓ success
Fetching news-us-politics-wsj … no change
Fetching news-us-politics-wapo … ✗ failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection w

In [None]:
# This cell updates the markdown index files for all the data sources
from pathlib import Path
import csv
import re

BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'
# Read the catalog as UTF-8 explicitly rather than with the locale default.
with open(BASE_DIR / 'catalog.csv', newline='', encoding='utf-8') as f:
    cat = list(csv.DictReader(f))

for row in cat:
    folder = BASE_DIR / row['category'] / row['source'] / row['folder']
    folder.mkdir(parents=True, exist_ok=True)
    filetype = row['filetype'].strip().lstrip('.')
    # Feeds are stored as JSON, so their snapshots carry a .json extension.
    output_ext = 'json' if filetype.lower() in ('rss', 'xml') else filetype
    desc = row['description'].strip()
    source = row['source'].strip()
    # DictReader yields None for fields missing from a short row.
    date = (row.get('last_fetched') or '').strip()

    # Snapshot names look like YYYY-MM-DD.<ext> or YYYY-MM-DD-HH.<ext>.
    pattern = re.compile(r"\d{4}-\d{2}-\d{2}(?:-\d{2})?\." + re.escape(output_ext) + r"$")
    dated_files = sorted(p.name for p in folder.iterdir() if pattern.match(p.name))

    lines = [
        '---',
        'layout: default',
        f'title: {source} - {desc}',
        f'date: {date}',
        '---',
        '',
        f'## {source} - {desc}',
        '',
        '<div id="data-chart"></div>',
        '<div id="data-table"></div>',
    ]

    if source == 'fred' and filetype.lower() == 'json':
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            "  ShowChart($('#data-chart'));",
            "  SourceTabler($('#data-table'));",
            "});",
            '</script>',
        ]
    else:
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            # The message contains an apostrophe, so the emitted JavaScript
            # literal is double-quoted; a single-quoted one would end at "isn".
            "  document.getElementById('data-table').textContent = \"This source isn't supported for tables yet.\";",
            "});",
            '</script>',
        ]

    lines += [
        '',
        '## File Versions:',
    ]
    links = [f'[Latest version](./latest.{output_ext})'] + [f'[{fname}](./{fname})' for fname in dated_files]
    lines.extend(f'{i}. {link}' for i, link in enumerate(links, 1))
    (folder / 'index.md').write_text("\n".join(lines) + "\n", encoding='utf-8')

print('Index files generated for', ', '.join(r['folder'] for r in cat))


Index files generated for GDPC1, A939RX0Q048SBEA, M2REAL, UNRATE, CLVMNACSCAB1GQDE, GFDEBTN, GFDEGDQ188S, TDSP, news-us-nyt, news-world-nyt, news-africa-nyt, news-europe-nyt, news-asia-nyt, news-americas-nyt, news-middle-east-nyt, news-business-nyt, news-economy-nyt, news-us-politics-nyt, news-world-wsj, news-us-wsj, news-business-wsj, news-markets-wsj, news-economy-wsj, news-us-politics-wsj, news-us-politics-wapo, news-us-wapo, news-world-wapo, news-business-wapo, latimes-business, latimes-us, latimes-us-politics, news-world-chi-tribune, news-business-chi-tribune, news-us-politics-chi-tribune, news-us-business-startribune, news-us-politics-startribune, news-us-nypost, news-world-nypost, news-us-politics-nypost, news-business-nypost, news-world-toi, news-business-toi, news-us-toi, news-middle-east-toi, news-europe-toi, news-world-cbc, news-politics-cbc, news-africa-bbc, news-asia-bbc, news-europe-bbc, news-latin-america-bbc, news-middle-east-bbc, news-us-bbc, news-world-bbc, news-busin

In [None]:

# Update outdated notebooks until none remain
import json, re, time, subprocess, sys
from pathlib import Path

# The repository root is the working directory when it contains `analysis/`,
# otherwise its parent (the notebook was started from a sub-folder).
repo_dir = Path.cwd() if (Path.cwd() / 'analysis').is_dir() else Path.cwd().parent
analysis_dir, data_dir = repo_dir / 'analysis', repo_dir / 'data'

# Matches a path-like run of characters ending in latest.csv / latest.json /
# latest.xml / latest.rss, as found in notebook source text.
pattern = re.compile(r'[A-Za-z0-9_/.-]*latest\.(?:csv|json|xml|rss)')

def mtime_str(p: Path) -> str:
    """Return the modification time of *p* as local 'YYYY-MM-DD HH:MM:SS' text."""
    modified_at = p.stat().st_mtime
    local_parts = time.localtime(modified_at)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_parts)

def build_dep_map():
    """Map each analysis notebook to the `latest.*` data files it references.

    Every ``*.ipynb`` under ``analysis_dir`` is scanned with ``pattern``. Each
    match is resolved relative to the notebook's own folder first and, if that
    file does not exist, relative to ``repo_dir`` with leading '.' and '/'
    characters removed. Only files that exist are kept.

    Returns:
        dict mapping notebook ``Path`` to a list of resolved dependency
        ``Path`` objects (possibly empty), in sorted-match order without
        duplicates.
    """
    dep_map = {}
    for nb in sorted(analysis_dir.rglob('*.ipynb')):
        # Jupyter keeps autosave copies in `.ipynb_checkpoints`; they are not
        # notebooks of their own and must not be scheduled for execution.
        if '.ipynb_checkpoints' in nb.parts:
            continue
        # Notebook files are UTF-8 JSON; do not rely on the locale encoding.
        text = nb.read_text(encoding='utf-8')
        deps = []
        for m in sorted(set(pattern.findall(text))):
            dep = (nb.parent / m).resolve()
            if not dep.exists():
                dep = (repo_dir / m.lstrip('./')).resolve()
            # Different spellings can resolve to the same file; list it once.
            if dep.exists() and dep not in deps:
                deps.append(dep)
        dep_map[nb] = deps
    return dep_map

def outdated(nb, deps):
    """Return True if any path in *deps* was modified after notebook *nb*."""
    notebook_stamp = nb.stat().st_mtime
    for dependency in deps:
        if dependency.stat().st_mtime > notebook_stamp:
            return True
    return False


def execute(nb: Path):
    """Re-run notebook *nb* in place through ``jupyter nbconvert``.

    Does nothing except print a notice when no ``jupyter`` executable is on
    PATH. Otherwise the command, its combined stdout/stderr and a
    success/failure line are printed. Nothing is returned and a failing run
    does not raise.
    """
    import shutil

    rel = str(nb.relative_to(repo_dir))
    if shutil.which('jupyter') is None:
        print(f'jupyter not available - skipping {rel}')
        return

    print(f'Running {rel} …')
    argv = [
        sys.executable, '-m', 'jupyter', 'nbconvert',
        '--to', 'notebook',
        '--inplace',
        '--execute',
        '--ExecutePreprocessor.timeout=600',
        '--debug',
        str(nb),
    ]
    print('  Command:', ' '.join(argv))
    # stderr is folded into stdout so the log appears in one ordered stream.
    result = subprocess.run(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f'  ✗ failed with exit code {result.returncode}')
    else:
        print('  ✓ success')
# Notebooks that were run but are still older than their dependencies, because
# execution failed or was skipped. Retrying them can never succeed, so they are
# excluded from later passes to keep the loop from spinning forever.
stuck = set()
while True:
    dep_map = build_dep_map()
    outdated_nbs = [nb for nb, deps in dep_map.items() if deps and outdated(nb, deps)]
    report = {'outdated_notebooks': [str(nb.relative_to(repo_dir)) for nb in outdated_nbs]}
    deps_path = repo_dir / 'dependencies.json'
    deps_path.write_text(
        json.dumps(report, indent=2) + "\n"
    )
    if not outdated_nbs:
        print('Everything up to date.')
        break
    runnable = [nb for nb in outdated_nbs if nb not in stuck]
    if not runnable:
        # Everything still outdated has already been tried without effect.
        print('Could not refresh:', ', '.join(report['outdated_notebooks']))
        break
    for nb in runnable:
        execute(nb)
        # A successful in-place run rewrites the notebook and so makes it
        # newer than its inputs; if it is still outdated the run had no effect.
        if outdated(nb, dep_map[nb]):
            stuck.add(nb)
