# Update Script
This notebook updates the top.json file in the same directory as the notebook from https://analysis.castromedia.org/analysis/news-topics/top.json and then builds an index.md file from the updated top.json file.

Example json file;

```
[
  {
    "score":44,
    "pubdate":"2025-07-08-13-59-19 +0000",
    "source":"nyt",
    "title":"Trump\u2019s New Trade Threats Set Off Global Scramble to Avoid Tariffs",
    "link":"https:\/\/www.nytimes.com\/2025\/07\/08\/business\/economy\/trump-tariffs-talks.html"
  },
  {
    "score":26,
    "pubdate":"2025-07-07-22-06-53 +0000",
    "source":"nypost",
    "title":"White House urges Dems to \u2018tone down\u2019 anti-ICE rhetoric \u2014 as activists say lawmakers should be willing to \u2018get shot\u2019 to obstruct enforcement",
    "link":"https:\/\/nypost.com\/2025\/07\/07\/us-news\/white-house-urges-dems-to-tone-down-anti-ice-rhetoric-as-activists-say-pols-should-be-willing-to-get-shot\/"
  },
  {
    "score":22,
    "pubdate":"2025-07-07-23-24-42 +0000",
    "source":"nypost",
    "title":"EPA chief Lee Zeldin unveils Trump admin plan to give jolt to nuclear power plants, zap wind power",
    "link":"https:\/\/nypost.com\/2025\/07\/07\/us-news\/epa-chief-unveils-trump-admin-plan-to-give-jolt-to-nuclear-power-plants-zap-wind-power\/"
  },
  {
    "score":21,
    "pubdate":"2025-07-08-14-47-33 +0000",
    "source":"nypost",
    "title":"AI-powered Marco Rubio impersonator makes calls to foreign ministers, top officials: report",
    "link":"https:\/\/nypost.com\/2025\/07\/08\/us-news\/ai-powered-rubio-impersonator-makes-calls-to-foreign-ministers-other-top-officials-report\/"
  },
  {
    "score":21,
    "pubdate":"2025-07-08-16-43-22 +0000",
    "source":"nypost",
    "title":"Trump rips Putin during cabinet meeting for talking \u2018bulls\u2014\u2018: \u2018He\u2019s very nice, but it turns out to be meaningless\u2019",
    "link":"https:\/\/nypost.com\/2025\/07\/08\/us-news\/trump-rips-putin-during-cabinet-meeting-for-talking-bulls-hes-very-nice-but-it-turns-out-to-be-meaningless\/"
  },
  {
    "score":20,
    "pubdate":"2025-07-08-03-07-20 +0000",
    "source":"nypost",
    "title":"Gov. Ron DeSantis warns Elon Musk about running \u2018America Party\u2019 candidates against GOP: \u2018Democrats would win\u2019",
    "link":"https:\/\/nypost.com\/2025\/07\/07\/us-news\/desantis-warns-musk-about-running-america-party-candidates-against-gop\/"
  },
  {
    "score":18,
    "pubdate":"2025-07-07-21-12-29 +0000",
    "source":"nypost",
    "title":"Sen. Ted Cruz cuts short European vacation due to deadly Texas flooding",
    "link":"https:\/\/nypost.com\/2025\/07\/07\/us-news\/ted-cruz-cuts-short-european-vacation-due-to-deadly-texas-flooding\/"
  },
  {
    "score":17,
    "pubdate":"2025-07-08-04-00-11 +0000",
    "source":"nyt",
    "title":"Trump and Netanyahu Meet Amid Gaza Cease-Fire Negotiations",
    "link":"https:\/\/www.nytimes.com\/2025\/07\/07\/us\/politics\/trump-netanyahu-dinner-gaza-cease-fire.html"
  },
  {
    "score":16,
    "pubdate":"2025-07-07-21-23-28 +0000",
    "source":"nypost",
    "title":"Florida business group woos NYC CEOs worried about Zohran Mamdani tax plans",
    "link":"https:\/\/nypost.com\/2025\/07\/07\/us-news\/florida-business-group-woos-nyc-ceos-worried-about-zohran-mamdani-tax-plans\/"
  },
  {
    "score":14,
    "pubdate":"2025-07-08-04-34-03 +0000",
    "source":"nypost",
    "title":"Kamala Harris campaign staffers warned not to reach out to Taylor Swift: \u2018Doug Emhoff was handling it\u2019",
    "link":"https:\/\/nypost.com\/2025\/07\/08\/us-news\/kamala-harris-campaign-staffers-warned-not-to-reach-out-to-taylor-swift-doug-emhoff-was-handling-it\/"
  }
]

```

In [5]:
# ========== Bootstrap: ensure required Python packages are present ==========
import importlib, subprocess, sys
from typing import Optional


def _ensure(pkg_name: str, import_name: Optional[str] = None, required: bool = True):
    """Import a module, installing it if necessary. If installation fails and
    the package is required, the exception is raised. Optional packages may
    remain unavailable."""
    try:
        return importlib.import_module(import_name or pkg_name)
    except ModuleNotFoundError:
        print(f"Package '{pkg_name}' not found - installing ...")
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        except Exception as e:
            print(f"Failed to install {pkg_name}: {e}")
            if required:
                raise
    try:
        mod = importlib.import_module(import_name or pkg_name)
        globals()[import_name or pkg_name] = mod
        return mod
    except ModuleNotFoundError:
        if required:
            raise
        print(f"Package '{pkg_name}' is unavailable.")
        globals()[import_name or pkg_name] = None
        return None
# --- Required third-party libraries ------------------------------------------
_ensure("pandas")
_ensure("requests")
_ensure("feedparser")
_ensure("textblob")
_ensure("jupyter", required=False)
_ensure("nbconvert", required=False)
print("All dependencies ready.\n")

# --- Standard imports --------------------------------------------------------
from pathlib import Path
import datetime as dt
import os, re, shutil, json, feedparser, textblob
import pandas as pd, requests, urllib.parse

# --- Helper: replace [date %Y-%m-%d] tokens -----------------------------------
def substitute_date_tokens(url: str) -> str:
    def _replace(m):
        fmt = m.group(1).strip()
        return dt.date.today().strftime(fmt)
    return re.sub(r"\[date\s+([^\]]+)\]", _replace, url)

# --- Helper: append API key if specified -----------------------------------
def add_apikey(url: str, env_var: Optional[str]) -> str:
    if env_var and str(env_var).lower() != "nan":
        key = os.getenv(env_var)
        if key:
            sep = '&' if '?' in url else '?'
            return f'{url}{sep}api_key={urllib.parse.quote_plus(key)}'
        else:
            print(f"Warning: environment variable '{env_var}' not set.")
    return url

# --- Cadence map (word → minimum seconds between fetches) ------------------------
CADENCE_SECONDS = {
    "hourly": 3600,
    "daily": 86400,
    "weekly": 604800,
    "monthly": 2592000,
    "quarterly": 7776000,
}

# --- Resolve base directory so notebook works from repo root or data folder ---
BASE_DIR = Path.cwd() if Path('catalog.csv').exists() else Path.cwd() / 'data'


    

All dependencies ready.



## Create the index.md file

In [6]:
import json
import datetime as dt
from pathlib import Path
import requests

TOP_URL='https://analysis.castromedia.org/analysis/news-topics/top.json'

# Fetch latest top stories JSON
resp=requests.get(TOP_URL)
resp.raise_for_status()
data=resp.json()

def to_entities(obj):
    if isinstance(obj, str):
        return obj.encode('ascii', 'xmlcharrefreplace').decode('ascii')
    if isinstance(obj, list):
        return [to_entities(x) for x in obj]
    if isinstance(obj, dict):
        return {k: to_entities(v) for k, v in obj.items()}
    return obj

data_html = to_entities(data)

# Write top.json to repository root
Path('top.json').write_text(json.dumps(data_html, indent=2, ensure_ascii=False), encoding='utf-8')
print(f"Fetched {len(data_html)} records")

# Build index.md from the data
lines=[
    '---\n',
    'layout: default\n',
    'title: World News\n',
    '---\n\n',
    f'#### Updated <span class=\"datetime\">{dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC")}</span>\n\n',
    '<div markdown="0">\n<ul>\n'

]
for item in data_html:
    lines.append(f"<li><a href='{item['link']}'>{item['title']}</a><div class='byline small text-muted'>{item['source']}, <span class=\"datetime\">{item['pubdate']}</span></div></li>\n")
lines.append('</ul>\n</div>\n')
Path('index.md').write_text(''.join(lines), encoding='utf-8')
print('index.md updated')
archive_dir = Path('archive')
archive_dir.mkdir(exist_ok=True)
ts = dt.datetime.utcnow().strftime('%Y-%m-%d-%H-00-00')
archive_file = archive_dir / f"{ts}.md"
archive_file.write_text(''.join(lines), encoding='utf-8')
print(f'Archive written to {archive_file}')
archive_files = sorted(p for p in archive_dir.glob('*.md') if p.name != 'index.md')
idx_lines = ['---\n','layout: default\n','title: News Archive\n','---\n\n']
for p in archive_files:
    idx_lines.append(f'- [{p.stem}]({p.name})\n')
(archive_dir / 'index.md').write_text(''.join(idx_lines), encoding='utf-8')
print('archive index updated')


Fetched 10 records
index.md updated
Archive written to archive\2025-07-08-20-00-00.md
archive index updated


  f'#### Updated <span class=\"datetime\">{dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC")}</span>\n\n',
  ts = dt.datetime.utcnow().strftime('%Y-%m-%d-%H-00-00')
