# Run Script

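This notebook fetches the latest news-analysis JSON feeds, renders them as HTML lists, and writes the site page (`index.md`) together with an hourly archive copy.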

## Setup

In [27]:
import importlib
import subprocess
import sys
from pathlib import Path
import json
import datetime as dt

def ensure(pkg):
    """Import ``pkg``, installing it with pip first if it cannot be found.

    Parameters
    ----------
    pkg : str
        Used both as the module name to import and as the pip requirement,
        so it only works for packages whose import name matches the name
        they are installed under.

    Returns
    -------
    module
        The imported module object.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    ModuleNotFoundError
        If the module still cannot be imported after installation.
    """
    try:
        return importlib.import_module(pkg)
    except ModuleNotFoundError:
        # Install into the interpreter running this kernel, not whatever
        # ``pip`` happens to be first on PATH.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        # The import system caches directory listings; without this the
        # freshly installed package may not be visible to the retry.
        importlib.invalidate_caches()
        return importlib.import_module(pkg)

# Bind the third-party ``requests`` module; ``ensure`` pip-installs it first
# if the import fails.
requests = ensure('requests')

## Fetch latest data

In [28]:
def fetch_json(url, timeout=30):
    """Download ``url`` and return its body decoded as JSON.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled server would block the notebook
        indefinitely.

    Returns
    -------
    object
        Whatever ``resp.json()`` yields for the response body.

    Raises
    ------
    Whatever ``requests.get``, ``raise_for_status`` (non-success status) or
    ``json()`` (undecodable body) raise; nothing is caught here.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.json()

def to_entities(obj):
    """Return ``obj`` with non-ASCII characters in its strings replaced by
    numeric XML character references (``&#NNN;``).

    Lists and dicts are rebuilt recursively; only dict *values* are
    converted, keys are kept as they are. Any other type (numbers, ``None``,
    tuples, ...) is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: to_entities(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return list(map(to_entities, obj))
    if isinstance(obj, str):
        ascii_bytes = obj.encode('ascii', 'xmlcharrefreplace')
        return ascii_bytes.decode('ascii')
    return obj

# Endpoint of the "top" news-topics feed.
TOP_URL = 'https://analysis.castromedia.org/analysis/news-topics/top.json'
# Local file name -> headline-analysis endpoint it is downloaded from.
ANALYSIS_URLS = {
    '1h.json': 'https://analysis.castromedia.org/analysis/headline_analysis/1h.json',
    '24h.json': 'https://analysis.castromedia.org/analysis/headline_analysis/24h.json',
    '7d.json': 'https://analysis.castromedia.org/analysis/headline_analysis/7d.json',
}

# top.json is stored with non-ASCII characters already turned into entities.
top_payload = to_entities(fetch_json(TOP_URL))
Path('top.json').write_text(
    json.dumps(top_payload, indent=2, ensure_ascii=False),
    encoding='utf-8',
)

# The per-window feeds are stored exactly as fetched.
for name, url in ANALYSIS_URLS.items():
    data = fetch_json(url)
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    Path(name).write_text(serialized, encoding='utf-8')

## Build HTML lists

In [29]:
def load_sorted(path):
    """Read a JSON list from ``path`` and return it newest-first.

    Each item must carry a ``'pubdate'`` string in the form
    ``YYYY-MM-DD-HH-MM-SS <utc offset>``; items are ordered by that
    timestamp, descending, and then passed through ``to_entities``.
    """
    def published(entry):
        # Parse with the offset so items from different zones compare correctly.
        return dt.datetime.strptime(entry['pubdate'], '%Y-%m-%d-%H-%M-%S %z')

    records = json.loads(Path(path).read_text(encoding='utf-8'))
    records.sort(key=published, reverse=True)
    return list(map(to_entities, records))

def build_list(data, limit=None):
    """Render news items as a block of ``<li>`` elements.

    Parameters
    ----------
    data : list of dict
        Items to render. Each needs ``'link'``, ``'source'`` and
        ``'pubdate'`` keys; the link text comes from ``'title'``, then
        ``'headline'``, then the placeholder ``'Untitled'``.
    limit : int, optional
        Render only the first ``limit`` items. A falsy value (``None`` or
        ``0``) means no limit.

    Returns
    -------
    str
        The rendered rows joined with newlines; empty string for no items.

    Raises
    ------
    KeyError
        If an item lacks ``'link'``, ``'source'`` or ``'pubdate'``.

    Notes
    -----
    Values are interpolated as-is, with no HTML escaping in this function.
    """
    rows = []
    for item in data[:limit] if limit else data:
        # Chain the fallbacks with ``or`` so a key that is present but
        # empty/None falls through instead of being rendered as "None".
        title = item.get('title') or item.get('headline') or 'Untitled'
        row = (f"""
            <li><a href="{item['link']}">{title}</a>
            <div class='byline small text-muted'>
            {item['source']}, 
            <span class="datetime">{item['pubdate']}</span></div>
            </li>
        """)
        rows.append(row)
    return '\n'.join(rows)

# Full listings for each time window, newest item first.
all1h_html, all24h_html, all7d_html = [
    build_list(load_sorted(feed_file))
    for feed_file in ('1h.json', '24h.json', '7d.json')
]

# The "top" sections are currently switched off and render as empty strings.
# The expressions they were built from, kept for reference:
#   top1h_html  = build_list(load_sorted('1h.json'), limit=10)
#   top24h_html = build_list(json.loads(Path('top.json').read_text(encoding='utf-8')), limit=10)
#   top7d_html  = build_list(load_sorted('7d.json'), limit=10)
top1h_html = top24h_html = top7d_html = ""

## Compile and save pages

In [30]:
def fill(template, mapping):
    """Substitute ``{KEY}`` placeholders in ``template`` with mapped values.

    Parameters
    ----------
    template : str
        Text containing placeholders written as the key wrapped in braces.
    mapping : dict
        Placeholder key -> replacement string.

    Returns
    -------
    str
        ``template`` with every known placeholder replaced. Placeholders
        with no entry in ``mapping`` are left untouched.

    Notes
    -----
    All substitutions happen in a single pass over the original template,
    so text that arrives via a replacement value is never re-scanned. With
    one ``str.replace`` per key, a value containing ``{OTHER_KEY}`` (for
    example inside feed-derived HTML) would be expanded by a later key.
    """
    import re  # local import keeps this cell independent of the imports cell

    placeholders = {f'{{{key}}}': value for key, value in mapping.items()}
    if not placeholders:
        return template
    # Longest token first so a token that is a prefix of another cannot
    # shadow the longer one in the alternation.
    ordered = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(token) for token in ordered))
    # A callable replacement avoids backslash/group-reference processing of
    # the replacement text.
    return pattern.sub(lambda match: placeholders[match.group(0)], template)

def write_page(content):
    """Write the rendered page, an archive copy, and the archive index.

    Three files are written relative to the current working directory:

    * ``index.md`` - the page itself.
    * ``archive/<timestamp>.md`` - a copy named after the current UTC time
      truncated to the hour, so runs within the same hour overwrite the same
      archive file.
    * ``archive/index.md`` - front matter followed by one markdown link per
      archived page, sorted by file name.

    Parameters
    ----------
    content : str
        Full text of the page to write.
    """
    Path('index.md').write_text(content, encoding='utf-8')
    archive_dir = Path('archive')
    archive_dir.mkdir(exist_ok=True)
    # ``datetime.utcnow()`` is deprecated since Python 3.12; an aware UTC
    # timestamp formats to exactly the same string.
    ts = dt.datetime.now(dt.timezone.utc).strftime('%Y-%m-%d-%H-00-00')
    archive_path = archive_dir / f'{ts}.md'
    archive_path.write_text(content, encoding='utf-8')
    # The index must not list itself.
    files = sorted(p for p in archive_dir.glob('*.md') if p.name != 'index.md')
    lines = ['---\n','layout: default\n','title: News Archive\n','---\n']
    for p in files:
        lines.append(f'- [{p.stem}]({p.name})\n')
    (archive_dir/'index.md').write_text(''.join(lines), encoding='utf-8')
    print(f'Archive written to {archive_path}')

# Human-readable build time shown on the page. ``datetime.utcnow()`` is
# deprecated since Python 3.12; an aware UTC timestamp formats identically.
updated = dt.datetime.now(dt.timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
template = Path('template.md').read_text(encoding='utf-8')
# Each key below fills the ``{KEY}`` placeholder of the same name in template.md.
page = fill(template, {
    'TIME': updated,
    'TOP1H': top1h_html,
    'TOP24H': top24h_html,
    'TOP7D': top7d_html,
    'ALL1H': all1h_html,
    'ALL24H': all24h_html,
    'ALL7D': all7d_html,
})
write_page(page)

Archive written to archive\2025-07-24-00-00-00.md
