# Update Headlines

In [1]:
# ========== Bootstrap: ensure required Python packages are present =========
import importlib, subprocess, sys
from typing import Optional

def _ensure(pkg_name: str, import_name: Optional[str] = None):
    try:
        importlib.import_module(import_name or pkg_name)
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
    finally:
        globals()[import_name or pkg_name] = importlib.import_module(import_name or pkg_name)

print('No external dependencies required.\n')


No external dependencies required.



In [2]:
from pathlib import Path
import csv
import json
import xml.etree.ElementTree as ET
from datetime import datetime, timezone, timedelta
from email.utils import parsedate_to_datetime
import shutil

BASE_DIR = Path.cwd()
REPO_DIR = BASE_DIR
while not ((REPO_DIR / 'data').exists() and (REPO_DIR / 'analysis').exists()):
    if REPO_DIR.parent == REPO_DIR:
        raise FileNotFoundError('Repository root not found')
    REPO_DIR = REPO_DIR.parent
DATA_DIR = REPO_DIR / 'data'
HEADLINES_DIR = REPO_DIR / 'analysis/headlines'
HEADLINES_DIR.mkdir(parents=True, exist_ok=True)
ARCHIVE_DIR = HEADLINES_DIR / 'archive'
ARCHIVE_DIR.mkdir(exist_ok=True)

def parse_pubdate(date_str):
    """Return a timezone-aware datetime parsed from a feed string."""
    try:
        dt = parsedate_to_datetime(date_str) if date_str else None
        if dt is None:
            return None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc)
    except Exception:
        return None

def format_pubdate(dt):
    """Format a publication datetime for CSV output."""
    return dt.strftime('%Y-%m-%d-%H-%M-%S +0000') if dt else ''

def parse_feed(path: Path):
    """Parse an RSS or JSON feed file into a list of entries."""
    entries = []
    if path.suffix == '.json':
        with open(path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except Exception:
                return entries
        for item in data.get('entries', []):
            title = item.get('title')
            link = item.get('link')
            pub = parse_pubdate(item.get('published'))
            if title and link:
                entries.append((pub, title.strip(), link.strip()))
    else:
        try:
            tree = ET.parse(path)
            root = tree.getroot()
        except ET.ParseError:
            return entries
        for item in root.iter():
            if item.tag.lower().endswith(('item', 'entry')):
                title = None
                link = None
                pub = None
                for child in item:
                    tag = child.tag.lower()
                    if tag.endswith('title'):
                        title = (child.text or '').strip()
                    if tag.endswith('link'):
                        link = (child.text or '').strip() or child.attrib.get('href')
                    if tag.endswith(('pubdate', 'published', 'updated')):
                        pub = parse_pubdate((child.text or '').strip())
                if title and link:
                    entries.append((pub, title, link))
    return entries

def collect_headlines():
    """Gather headlines from all news source files."""
    all_entries = []
    feed_info = {}
    for latest_file in DATA_DIR.rglob('latest.*'):
        if latest_file.suffix not in {'.json', '.rss', '.xml'}:
            continue
        try:
            source_name = latest_file.relative_to(DATA_DIR).parts[2]
        except Exception:
            continue
        feed_entries = parse_feed(latest_file)
        if feed_entries:
            recent = max((pub for pub, _t, _l in feed_entries if pub), default=None)
            feed_info[source_name] = {'count': len(feed_entries), 'recent': recent}
        for pub, title, link in feed_entries:
            all_entries.append((pub, title, link, source_name))
    return all_entries, feed_info

def _date_key(date_str):
    try:
        return parsedate_to_datetime(date_str) if date_str else datetime.min
    except Exception:
        return datetime.min

def deduplicate_entries(entries):
    """Remove duplicate headlines by title or link."""
    deduped = []
    seen_titles = set()
    seen_links = set()
    for pub, title, link, src in entries:
        t_key = title.lower()
        l_key = link.lower()
        if t_key in seen_titles or l_key in seen_links:
            continue
        deduped.append((pub, src, title, link))
        seen_titles.add(t_key)
        seen_links.add(l_key)
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    return [r for r in deduped if r[0] and r[0] >= cutoff]

def write_csv(rows, path):
    """Write headline rows to a CSV file."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['pubdate', 'source', 'title', 'link'])
        for pub, src, title, link in rows:
            writer.writerow([format_pubdate(pub), src, title, link])

def print_summary(feed_info):
    """Display a summary of feed counts."""
    print('Feed summary:')
    print(f"{'source':<20} {'count':>5}  {'most recent'}")
    for src, info in sorted(feed_info.items()):
        print(f"{src:<20} {info['count']:5}  {format_pubdate(info['recent'])}")

def update_headlines():
    """Update headline archive from the most recent feeds."""
    timestamp = datetime.utcnow().strftime('%Y-%m-%d-%H-00')
    hourly_file = ARCHIVE_DIR / f"{timestamp}.csv"
    if hourly_file.exists():
        print(f"{hourly_file.name} already exists. Skipping update.")
        return
    entries, feed_info = collect_headlines()
    entries.sort(
        key=lambda r: r[0] or datetime.min.replace(tzinfo=timezone.utc),
        reverse=True,
    )
    deduped = deduplicate_entries(entries)
    write_csv(deduped, hourly_file)
    shutil.copy(hourly_file, HEADLINES_DIR / 'latest.csv')
    print(f"Wrote {hourly_file} and updated latest.csv")
    print()
    print_summary(feed_info)

update_headlines()

Wrote /home/runner/work/Analysis/Analysis/analysis/headlines/archive/2025-09-24-04-00.csv and updated latest.csv

Feed summary:
source               count  most recent
48hills                 10  2025-09-24-02-34-46 +0000
bbc                     29  2025-09-23-15-09-42 +0000
cbc                     19  2025-09-24-00-06-04 +0000
dw                      14  
eltecolote              10  2025-09-21-04-21-42 +0000
latimes                 98  2025-09-24-00-28-02 +0000
missionlocal            31  2025-09-23-22-04-09 +0000
nypost                  20  2025-09-24-03-35-14 +0000
nyt                     20  2025-09-23-21-45-12 +0000
startribune              8  2025-09-24-03-38-49 +0000
toi                     20  
wapo                     7  2025-09-23-21-29-49 +0000
wsj                     45  2025-09-24-03-39-00 +0000


  timestamp = datetime.utcnow().strftime('%Y-%m-%d-%H-00')
