# Update Script
This notebook orchestrates data downloads and analysis refreshes.

In [1]:
# ========== Bootstrap: ensure required Python packages are present ==========
import importlib, subprocess, sys

def _ensure(pkg_name: str, import_name: str | None = None):
    """
    Import `import_name` (defaults to `pkg_name`); if that fails, pip‑install.
    """
    try:
        importlib.import_module(import_name or pkg_name)
    except ModuleNotFoundError:
        print(f"Package '{pkg_name}' not found — installing …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
    finally:
        globals()[import_name or pkg_name] = importlib.import_module(import_name or pkg_name)

# --- Required third‑party libraries ------------------------------------------
# Each name is both the pip package and the importable module; _ensure binds
# the imported module as a global of the same name.
for _required_pkg in ("pandas", "requests", "feedparser", "textblob"):
    _ensure(_required_pkg)
print("All dependencies ready.\n")

# --- Standard imports --------------------------------------------------------
from pathlib import Path
import datetime as dt
import os, re, shutil, json
import pandas as pd, requests, urllib.parse

# --- Helper: replace [date %Y-%m-%d] tokens -----------------------------------
def substitute_date_tokens(url: str) -> str:
    """Expand every ``[date <format>]`` token in `url` using today's date.

    The text between ``[date`` and ``]`` is stripped of surrounding whitespace
    and passed to ``datetime.date.strftime``; everything outside the tokens is
    returned unchanged. A token needs at least one whitespace character after
    ``date`` to be recognised.
    """
    token_re = re.compile(r"\[date\s+([^\]]+)\]")
    return token_re.sub(
        lambda match: dt.date.today().strftime(match.group(1).strip()),
        url,
    )

# --- Helper: append API key if specified -----------------------------------
def add_apikey(url: str, env_var: str | None) -> str:
    if env_var:
        key = os.getenv(env_var)
        if key:
            sep = '&' if '?' in url else '?'
            return f'{url}{sep}api_key={urllib.parse.quote_plus(key)}'
        else:
            print(f"Warning: environment variable '{env_var}' not set.")
    return url

# --- Cadence map (word → minimum days between fetches) ------------------------
# Minimum number of days that must pass between two fetches, keyed by the
# cadence word used in the catalog.
CADENCE_DAYS = dict(
    daily=1,
    weekly=7,
    monthly=30,
    quarterly=90,
)

# --- Resolve base directory so notebook works from repo root or data folder ---
BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'

# --- Load catalog -------------------------------------------------------------
catalog_path = BASE_DIR / 'catalog.csv'
cat = pd.read_csv(catalog_path)
cat['filetype'] = cat['filetype'].astype(str).str.strip().str.lstrip('.')
# ISO date strings are written into `last_fetched` below. If the column was
# read as all-NaN it is float64, and assigning a string into it is deprecated
# in recent pandas; an object column accepts both NaN and strings.
cat['last_fetched'] = cat['last_fetched'].astype(object)

today = dt.date.today()
today_iso = today.isoformat()
updated_rows = []                # folders whose data files were rewritten
catalog_dirty = False            # True once any `last_fetched` cell changed

for idx, row in cat.iterrows():
    folder = BASE_DIR / str(row['category']) / str(row['source']) / str(row['folder'])
    folder.mkdir(parents=True, exist_ok=True)
    filetype = str(row['filetype']).strip().lstrip('.')
    # RSS/XML feeds are converted to a JSON summary, so they are stored as .json.
    is_feed = filetype.lower() in ('rss', 'xml')
    output_ext = 'json' if is_feed else filetype
    latest_fp = folder / f'latest.{output_ext}'
    dated_fp = folder / f'{today:%Y-%m-%d}.{output_ext}'

    # A snapshot for today already exists: only make sure the catalog says so.
    if dated_fp.exists():
        if str(row['last_fetched']) != today_iso:
            cat.at[idx, 'last_fetched'] = today_iso
            catalog_dirty = True
        continue

    last_fetched = (
        pd.to_datetime(row["last_fetched"]).date()
        if pd.notna(row["last_fetched"]) else None
    )

    # ---- Determine if an update is due --------------------------------------
    cadence = str(row["cadence"]).lower().strip()
    min_age = CADENCE_DAYS.get(cadence, 30)        # default 30 days
    needs_update = (
        (not latest_fp.exists()) or
        (not last_fetched) or
        (today - last_fetched).days >= min_age
    )

    if not needs_update:
        print(f"Skipping {row['folder']} – up to date")
        continue

    # ---- Build the request URL ---------------------------------------------
    url = substitute_date_tokens(str(row["url"]))
    # An empty CSV cell is read as NaN, which is truthy; test it explicitly so
    # it is not turned into the environment-variable name "nan".
    api_key_env = row.get('api_key')
    api_key_env = str(api_key_env).strip() if pd.notna(api_key_env) else ''
    url = add_apikey(url, api_key_env or None)

    print(f"Fetching {row['folder']} …", end=" ")
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        if is_feed:
            # `feedparser` and `textblob` are bound as globals by `_ensure`.
            feed = feedparser.parse(r.content)
            entries = []
            for e in feed.entries:
                text = ' '.join(filter(None, [e.get('title'), e.get('summary')]))
                polarity = textblob.TextBlob(text).sentiment.polarity
                entries.append({'title': e.get('title'), 'link': e.get('link'),
                               'published': e.get('published'),
                               'sentiment': polarity})
            content_bytes = json.dumps({'entries': entries}, ensure_ascii=False, indent=2).encode('utf-8')
        else:
            content_bytes = r.content
        if filetype.lower() == 'json':
            # Some APIs answer HTTP 200 with an error payload; treat an
            # `error_message` field as a failed fetch. Unparseable bodies are
            # saved as-is.
            try:
                data_json = r.json()
            except ValueError:
                data_json = None
            if isinstance(data_json, dict) and data_json.get('error_message'):
                raise ValueError(data_json['error_message'])
        # ---- Save snapshot and latest --------------------------------------
        if latest_fp.exists() and latest_fp.read_bytes() == content_bytes:
            # Content is identical: no new snapshot, but the fetch date still
            # has to be recorded (and saved) so the cadence check can skip it.
            cat.at[idx, 'last_fetched'] = today_iso
            catalog_dirty = True
            print('no change')
            continue
        dated_fp.write_bytes(content_bytes)
        shutil.copyfile(dated_fp, latest_fp)

        # ---- Mark success in catalog ---------------------------------------
        cat.at[idx, "last_fetched"] = today_iso
        catalog_dirty = True
        updated_rows.append(str(row["folder"]))
        print("✓ success")
    except Exception as e:
        # Best effort: one failing source must not stop the remaining rows.
        print(f"✗ failed: {e}")

# --- Persist catalog if anything changed -------------------------------------
# Persistence follows `catalog_dirty`, not `updated_rows`, so date stamps from
# the "already fetched today" and "no change" paths are saved too.
if catalog_dirty:
    cat.to_csv(catalog_path, index=False)
if updated_rows:
    print("\nUpdated:", ", ".join(updated_rows))
else:
    print("Everything up to date.")


All dependencies ready.

Skipping GDPC1 – up to date
Skipping A939RX0Q048SBEA – up to date
Skipping M2REAL – up to date
Skipping UNRATE – up to date
Skipping CLVMNACSCAB1GQDE – up to date
Skipping GFDEBTN – up to date
Skipping GFDEGDQ188S – up to date
Skipping TDSP – up to date
Fetching news-us-nyt … ✓ success
Fetching news-world-nyt … ✓ success
Fetching news-africa-nyt … ✓ success
Fetching news-europe-nyt … ✓ success
Fetching news-asia-nyt … ✓ success
Fetching news-americas-nyt … ✓ success
Fetching news-middle-east-nyt … ✓ success
Fetching news-business-nyt … ✓ success
Fetching news-economy-nyt … ✓ success
Fetching news-us-politics-nyt … ✓ success
Fetching news-world-wsj … ✓ success
Fetching news-us-wsj … ✓ success
Fetching news-business-wsj … ✓ success
Fetching news-markets-wsj … ✓ success
Fetching news-economy-wsj … ✓ success
Fetching news-us-politics-wsj … ✓ success
Fetching news-us-politics-wapo … ✗ failed: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed o

In [3]:
# This cell updates the markdown index files for all the data sources
from pathlib import Path
import csv
import re

BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'
# The catalog is written with pandas' to_csv (UTF-8 by default), so read it
# back as UTF-8 instead of relying on the platform's locale encoding.
with open(BASE_DIR / 'catalog.csv', newline='', encoding='utf-8') as f:
    cat = list(csv.DictReader(f))

# Write one index.md per catalog row: front matter, chart/table placeholders,
# a small script block, and a numbered list of the available file versions.
for row in cat:
    folder = BASE_DIR / row['category'] / row['source'] / row['folder']
    folder.mkdir(parents=True, exist_ok=True)
    filetype = row['filetype'].strip().lstrip('.')
    # RSS/XML sources are stored as JSON by the download cell.
    output_ext = 'json' if filetype.lower() in ('rss', 'xml') else filetype
    desc = row['description'].strip()
    source = row['source'].strip()
    # DictReader yields None for fields missing from a short row.
    date = (row.get('last_fetched') or '').strip()

    # Dated snapshots are named YYYY-MM-DD.<ext>; sorting by name is chronological.
    pattern = re.compile(r'\d{4}-\d{2}-\d{2}\.' + re.escape(output_ext) + r'$')
    dated_files = sorted(p.name for p in folder.iterdir() if pattern.match(p.name))

    lines = [
        '---',
        'layout: default',
        f'title: {source} - {desc}',
        f'date: {date}',
        '---',
        '',
        f'## {source} - {desc}',
        '',
        '<div id="data-chart"></div>',
        '<div id="data-table"></div>',
    ]

    if row['source'] == 'fred' and filetype == 'json':
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            "  ShowChart($('#data-chart'));",
            "  SourceTabler($('#data-table'));",
            "});",
            '</script>',
        ]
    else:
        lines += [
            '<script>',
            "document.addEventListener('DOMContentLoaded', function(){",
            # The backslash is doubled so it reaches the generated JavaScript:
            # a lone \' in a Python string is just an apostrophe, which would
            # end the JS string literal early and break the script.
            "  document.getElementById('data-table').textContent = 'This source isn\\'t supported for tables yet.';",
            "});",
            '</script>',
        ]

    lines += [
        '',
        '## File Versions:',
    ]
    links = [f'[Latest version](./latest.{output_ext})'] + [f'[{fname}](./{fname})' for fname in dated_files]
    for i, link in enumerate(links, 1):
        lines.append(f'{i}. {link}')
    (folder / 'index.md').write_text("\n".join(lines) + "\n", encoding='utf-8')

print('Index files generated for', ', '.join(r['folder'] for r in cat))


Index files generated for GDPC1, A939RX0Q048SBEA, M2REAL, UNRATE, CLVMNACSCAB1GQDE, GFDEBTN, GFDEGDQ188S, TDSP, news-us-nyt, news-world-nyt, news-africa-nyt, news-europe-nyt, news-asia-nyt, news-americas-nyt, news-middle-east-nyt, news-business-nyt, news-economy-nyt, news-us-politics-nyt, news-world-wsj, news-us-wsj, news-business-wsj, news-markets-wsj, news-economy-wsj, news-us-politics-wsj, news-us-politics-wapo, news-us-wapo, news-world-wapo, news-business-wapo, latimes-business, latimes-us, latimes-us-politics, news-world-chi-tribune, news-business-chi-tribune, news-us-politics-chi-tribune


In [None]:
# Dependency report
# Lists notebooks under analysis/, every latest.* file under data/ and
# analysis/, and which latest.* files each notebook mentions; the result is
# printed and written to dependencies.json plus a dated copy.
from pathlib import Path
import json, re, time

# Repo root is the working directory, or its parent when the notebook is run
# from a subfolder that has no analysis/ directory. Resolved so that it
# compares cleanly with the resolved dependency paths below.
repo_dir = Path.cwd().resolve()
if not (repo_dir / 'analysis').is_dir():
    repo_dir = repo_dir.parent
analysis_dir = repo_dir / 'analysis'
data_dir = repo_dir / 'data'
# Any path-like token that ends in latest.<csv|json|xml|rss>.
pattern = re.compile(r'[A-Za-z0-9_/.-]*latest\.(?:csv|json|xml|rss)')


def mtime_str(p: Path) -> str:
    """Return the modification time of `p` as local 'YYYY-MM-DD HH:MM:SS'."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(p.stat().st_mtime))


def _rel(p: Path) -> str:
    """Return `p` relative to the repo root, always with '/' separators.

    Raises ValueError when `p` is not inside the repo root.
    """
    return p.relative_to(repo_dir).as_posix()


# --- List notebooks ---
ipynb_paths = sorted(analysis_dir.rglob('*.ipynb'))
notebooks = [[_rel(p), mtime_str(p)] for p in ipynb_paths]

# --- List latest files ---
latest_files = sorted(data_dir.rglob('latest.*')) + sorted(analysis_dir.rglob('latest.*'))
latest = [[_rel(p), mtime_str(p)] for p in latest_files]

# --- Map notebook dependencies ---
dep_map = {}
for nb in ipynb_paths:
    # Notebook files are UTF-8 JSON; do not depend on the locale encoding.
    text = nb.read_text(encoding='utf-8')
    matches = sorted(set(pattern.findall(text)))
    deps = []
    for m in matches:
        # First try the reference relative to the notebook's own folder ...
        dep = (nb.parent / m).resolve()
        if not dep.exists():
            # ... then relative to the repo root. lstrip('./') removes every
            # leading '.' and '/' character, so '../x' and '/x' are retried
            # as 'x' as well.
            dep = (repo_dir / m.lstrip('./')).resolve()
        if not dep.exists():
            continue
        try:
            dep_rel = _rel(dep)
        except ValueError:
            # The reference resolves outside the repo; it cannot be expressed
            # relative to repo_dir, so it is left out of the report.
            continue
        deps.append([dep_rel, mtime_str(dep)])
    dep_map[_rel(nb)] = {'modified': mtime_str(nb), 'deps': deps}

dep_list = [[nb, info['modified'], info['deps']] for nb, info in dep_map.items()]

# Split notebooks into those that depend only on files under data/ and the rest
# (notebooks with no detected dependencies land in `other`).
only_data = []
other = []
for nb, info in dep_map.items():
    if info['deps'] and all(d[0].startswith('data/') for d in info['deps']):
        only_data.append([nb, info['modified']])
    else:
        other.append([nb, info['modified']])

report = {
    'notebooks': notebooks,
    'latest_files': latest,
    'dependencies': dep_list,
    'only_data_notebooks': only_data,
    'other_notebooks': other,
}
json_txt = json.dumps(report, indent=2)
print(json_txt)
(repo_dir / 'dependencies.json').write_text(json_txt + '\n')
(repo_dir / f"dependencies-{time.strftime('%Y-%m-%d')}.json").write_text(json_txt + '\n')


In [None]:
# Execute outdated notebooks
import subprocess, sys

def outdated(nb, deps):
    """Return True if any dependency file is newer than notebook `nb`.

    `nb` is a path relative to `repo_dir`; `deps` is a sequence of
    two-item entries whose first item is a path relative to `repo_dir`
    (the second item is ignored). Modification times are read from disk.
    """
    nb_mtime = (repo_dir / nb).stat().st_mtime
    return any(
        (repo_dir / dep_path).stat().st_mtime > nb_mtime
        for dep_path, _ in deps
    )

def execute(nb):
    """Re-run notebook `nb` (path relative to `repo_dir`) in place via nbconvert.

    Prints a progress line. Any exception raised while launching or running
    nbconvert is caught and reported on that line instead of being raised.
    """
    target = repo_dir / nb
    command = [sys.executable, '-m', 'jupyter', 'nbconvert']
    command += ['--to', 'notebook', '--inplace', '--execute', str(target)]
    print(f'Running {nb} …', end=' ')
    try:
        subprocess.check_call(command)
        print('✓ success')
    except Exception as err:
        print(f'✗ failed: {err}')

# One pass over both lists, data-only notebooks first and the rest afterwards;
# a notebook is re-executed when any of its dependencies is newer than it.
for nb, _ in [*only_data, *other]:
    if outdated(nb, dep_map[nb]['deps']):
        execute(nb)

