# Data Source Dashboard

In [None]:
# ========== Bootstrap: ensure pandas is available =========
import importlib, subprocess, sys

def _ensure(pkg_name):
    """Import ``pkg_name``, installing it with pip first if it is missing.

    The imported module is bound in this module's globals under the name
    ``pkg_name``.

    Args:
        pkg_name: Name used both for the import and for ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
        ModuleNotFoundError: If the module still cannot be imported after a
            successful install.
    """
    try:
        module = importlib.import_module(pkg_name)
    except ModuleNotFoundError:
        # Install with the interpreter that is running this code, so the
        # package lands in the same environment. A failure here propagates
        # as-is instead of being hidden behind a second import error.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        # The import system caches directory listings; refresh them so the
        # package installed a moment ago can be found.
        importlib.invalidate_caches()
        module = importlib.import_module(pkg_name)
    globals()[pkg_name] = module

_ensure('pandas')
from pathlib import Path
import pandas as pd
import json

# Load the feed catalog; the path is relative to the current working directory.
catalog = pd.read_csv(Path('..','..','data','catalog.csv'))
# Build each feed's folder as ../../data/<category>/<source>/<folder>.
# A list comprehension is used instead of a row-wise DataFrame.apply because
# apply(axis=1) hands back a DataFrame (not a Series) when the catalog has no
# rows, which makes the column assignment fail.
catalog['path'] = [
    Path('..', '..', 'data', category, source, folder)
    for category, source, folder in zip(
        catalog['category'], catalog['source'], catalog['folder']
    )
]
# Independent copy holding only the columns the dashboard reports on.
dashboard = catalog[['path','filetype','last_fetched']].copy()

def count_headlines(row):
    """Return the number of entries in a feed's ``latest.<filetype>`` file.

    Args:
        row: Mapping-like record providing ``'path'`` (the feed folder; must
            support ``/`` to build a path with ``.exists()``, e.g. a
            ``pathlib.Path``) and ``'filetype'`` (the file extension).

    Returns:
        int: For ``csv`` files, the number of lines after the first one (the
        first line is treated as a header). For ``json`` files, the length of
        a top-level list, or of the ``'entries'`` / ``'observations'`` value
        of a top-level object. ``0`` when the file does not exist, a CSV file
        has no lines, the file type is neither ``csv`` nor ``json``, or the
        JSON document has none of the shapes above.

    Raises:
        json.JSONDecodeError: If a ``json`` file does not contain valid JSON.
    """
    latest = row['path'] / f"latest.{row['filetype']}"
    if not latest.exists():
        return 0

    if row['filetype'] == 'csv':
        # Only the line count matters, so undecodable bytes are replaced
        # rather than raising; the explicit encoding keeps the result
        # independent of the platform default. The context manager closes
        # the handle.
        with open(latest, encoding='utf-8', errors='replace') as f:
            line_count = sum(1 for _ in f)
        # Drop the header line; clamp so a zero-line file reports 0, not -1.
        return max(line_count - 1, 0)

    if row['filetype'] == 'json':
        # Binary mode lets the json module detect the UTF-8/16/32 encoding.
        with open(latest, 'rb') as f:
            data = json.load(f)
        if isinstance(data, list):
            return len(data)
        # Only objects can carry the known container keys; scalars and null
        # fall through to 0 instead of raising TypeError on the `in` test.
        if isinstance(data, dict):
            for key in ('entries', 'observations'):
                if key in data:
                    return len(data[key])
    return 0

# Count the entries in each feed's latest file. Rows are iterated explicitly
# because DataFrame.apply(axis=1) returns a DataFrame (not a Series) when the
# frame has no rows, which makes the column assignment fail.
dashboard['headline_count'] = [
    count_headlines(row) for _, row in dashboard.iterrows()
]
# Order by feed folder and renumber the rows from zero.
dashboard = dashboard.sort_values('path').reset_index(drop=True)
# Bare expression: rendered as the cell output in a notebook.
dashboard


In [None]:
from datetime import datetime, timezone

# Hour-resolution UTC stamp: the minutes field is the literal "00", so runs
# within the same hour write to the same file name. datetime.now(timezone.utc)
# replaces datetime.utcnow(), which is deprecated since Python 3.12; the
# formatted string is the same.
timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-00')
# One copy named after the stamp, plus latest.csv which is overwritten each run.
dashboard.to_csv(f'{timestamp}.csv', index=False)
dashboard.to_csv('latest.csv', index=False)


In [None]:
# Report the sum of the per-feed counts as a single overall figure.
print(f'Total headlines across feeds: {dashboard["headline_count"].sum()}')
