# Data Source Dashboard

In [1]:
# ========== Bootstrap: ensure pandas is available =========
import importlib, subprocess, sys

def _ensure(pkg_name):
    try:
        importlib.import_module(pkg_name)
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
    finally:
        globals()[pkg_name] = importlib.import_module(pkg_name)

_ensure('pandas')
from pathlib import Path
import pandas as pd
import json
from datetime import datetime, timezone, timedelta
from email.utils import parsedate_to_datetime

catalog = pd.read_csv(Path('..','..','data','catalog.csv'))
catalog['path'] = catalog.apply(lambda r: Path('..','..','data', r['category'], r['source'], r['folder']), axis=1)
dashboard = catalog[['path','filetype','last_fetched']].copy()

def count_headlines(row):
    ftype = str(row['filetype']).lower().strip()
    output_ext = 'json' if ftype in ('rss', 'xml') else ftype
    latest = row['path'] / f"latest.{output_ext}"
    if not latest.exists():
        return 0
    if output_ext == 'csv':
        return sum(1 for _ in open(latest, encoding="utf-8")) - 1
    elif output_ext == 'json':
        with open(latest, encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return len(data)
        if 'entries' in data:
            return len(data['entries'])
        if 'observations' in data:
            return len(data['observations'])
    return 0

def count_recent(row):
    ftype = str(row['filetype']).lower().strip()
    if ftype not in ('rss', 'xml'):
        return 0
    latest = row['path'] / 'latest.json'
    if not latest.exists():
        return 0
    with open(latest, encoding='utf-8') as f:
        data = json.load(f)
    entries = data.get('entries', []) if isinstance(data, dict) else data
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    count = 0
    for item in entries:
        pub = item.get('published')
        if not pub:
            continue
        try:
            dt = parsedate_to_datetime(pub)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            dt = dt.astimezone(timezone.utc)
            if dt >= cutoff:
                count += 1
        except Exception:
            continue
    return count

dashboard['headline_count'] = dashboard.apply(count_headlines, axis=1)
dashboard['last_24h_count'] = dashboard.apply(count_recent, axis=1)
dashboard = dashboard.sort_values('path').reset_index(drop=True)
dashboard


Unnamed: 0,path,filetype,last_fetched,headline_count
0,../../data/demographics/census/zip-demo-ca,csv,2025-06-18,0
1,../../data/economics/fred/A939RX0Q048SBEA,json,2025-07-07T16:29,313
2,../../data/economics/fred/CLVMNACSCAB1GQDE,json,2025-07-07T16:29,137
3,../../data/economics/fred/GDPC1,json,2025-07-07T16:29,313
4,../../data/economics/fred/GFDEBTN,json,2025-07-07T16:29,237
...,...,...,...,...
57,../../data/news-world/nypost/news-world-nypost,rss,2025-07-16T04:06,20
58,../../data/news-world/nyt/news-world-nyt,rss,2025-07-16T04:06,60
59,../../data/news-world/toi/news-world-toi,rss,2025-07-16T04:06,20
60,../../data/news-world/wapo/news-world-wapo,rss,2025-07-16T03:20,20


In [2]:
from datetime import datetime
timestamp = datetime.utcnow().strftime('%Y-%m-%d-%H-00')
dashboard.to_csv(f'{timestamp}.csv', index=False)
dashboard.to_csv('latest.csv', index=False)


  timestamp = datetime.utcnow().strftime('%Y-%m-%d-%H-00')


In [3]:
print(f'Total headlines across feeds: {dashboard.headline_count.sum()}')
print(f'Headlines in last 24h: {dashboard.last_24h_count.sum()}')


Total headlines across feeds: 4913


In [4]:
from datetime import datetime
from pathlib import Path
from string import Template

timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

template_str = """---
layout: default
title: Data Source Dashboard
date: $date
---

## Data Source Dashboard

A summary of all data sources and their current headline counts.

<p>Last updated: <strong>$date</strong></p>

<div id=\"dashboard-table\"></div>
<script>
function loadCsvTable(sel, csvPath){
  fetch(csvPath)
    .then(r => r.text())
    .then(text => {
      const rows = csvToObjects(text);
      const table = ArrTabler(rows);
      $(sel).html(table);
      new DataTable(sel + ' table', {
        order: [[0, 'desc']],
        columnDefs: [
          { targets: '_all', className: 'dt-head-left dt-body-left' }
        ]
      });
    })
    .catch(() => {
      $(sel).text('Unable to load data.');
    });
}

document.addEventListener('DOMContentLoaded', function(){
  loadCsvTable('#dashboard-table', './latest.csv');
});
</script>

## File Versions:
{% assign csv_files = site.static_files | where:"extname", ".csv" | where_exp:"f","f.path contains 'analysis/dashboard/'" | sort: 'name' | reverse %}
<ol>
  <li><a href=\"./latest.csv\">Latest version</a></li>
  {% for file in csv_files %}
    {% unless file.name == 'latest.csv' %}
  <li><a href=\"./{{ file.name }}\">{{ file.name }}</a></li>
    {% endunless %}
  {% endfor %}
</ol>
"""

template = Template(template_str)
Path('index.md').write_text(template.safe_substitute(date=timestamp))


  timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')


1258