# NHS Board Papers Analyser

Enter a trust name, run all cells in order, and receive structured story leads.

**You will need:**
- An [Anthropic API key](https://console.anthropic.com) (create a free account → API Keys)
- `prompt_template.txt` uploaded to the Colab files panel (folder icon, left sidebar)

**Cost per run:** approximately £0.50–£1.50 using Claude Opus. Setting `MODEL = 'claude-sonnet-4-6'` in Cell 2 makes a run roughly 5x cheaper.

---

In [None]:
# Cell 1: Suppress warnings and install dependencies
# Run this first, once per session.

import warnings, os, sys, logging
# Ignore all Python warnings in this kernel, and export the same setting
# through the PYTHONWARNINGS environment variable.
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'
# Redirect warnings into the logging system, then raise the 'py.warnings'
# logger threshold so only CRITICAL records get through.
logging.captureWarnings(True)
logging.getLogger('py.warnings').setLevel(logging.CRITICAL)

# Best-effort patch: replace jupyter_client.session.utcnow with a function
# that returns a naive datetime derived from a timezone-aware UTC "now".
# NOTE(review): presumably this avoids a datetime.utcnow() deprecation
# warning raised by the kernel on newer Python versions -- confirm.
# Any failure here (e.g. the module is missing) is deliberately ignored.
try:
    import jupyter_client.session as _jcs
    import datetime as _dt
    _jcs.utcnow = lambda: _dt.datetime.now(_dt.timezone.utc).replace(tzinfo=None)
except Exception:
    pass

# Notebook shell magic: install the third-party packages imported in Cell 3.
!pip install -q anthropic pypdfium2 requests beautifulsoup4
print('Ready.')


---
## Configure here

Edit the values below and run the cell.

> **If the search fails** (Cell 4 says 'Search failed'), paste the trust's board papers
> page URL into `MANUAL_BOARD_PAPERS_URL` in Cell 2, re-run Cell 2, and then re-run Cell 4.
>
> **Serper API key** is needed for the web-search fallback, which is used when the trust is
> not found in the built-in database. Get a free key (2,500 searches/month)
> at [serper.dev](https://serper.dev).
>
> **If the download fails**, download the PDF manually, upload it via the files panel,
> set `MANUAL_PDF_PATH` to the filename in Cell 2, then re-run Cell 2 and Cell 4 before continuing with Cell 5.


In [None]:
# Cell 2: Configuration -- edit these values

ANTHROPIC_API_KEY = 'sk-ant-...'   # Your Anthropic API key
SERPER_API_KEY    = ''              # Your Serper.dev key (free at serper.dev)
TRUST_NAME        = 'NHS Gloucestershire ICB'
MODEL             = 'claude-opus-4-6'   # or 'claude-sonnet-4-6' for cheaper runs

# If Cell 4 search fails: paste the board papers page URL here and re-run
MANUAL_BOARD_PAPERS_URL = ''

# If Cell 5 download fails: upload PDF manually, set filename here, re-run from Cell 5
MANUAL_PDF_PATH = ''

# -- Normalisation --
# Keys pasted from a web page often carry stray spaces or newlines, which
# would otherwise be sent verbatim in the request headers.
ANTHROPIC_API_KEY = ANTHROPIC_API_KEY.strip()
SERPER_API_KEY = SERPER_API_KEY.strip()

# -- Validation --
# A usable key starts with 'sk-ant-' and is not the unedited placeholder
# ('sk-ant-...'), which the previous prefix-only check let through.
ANTHROPIC_KEY_OK = (
    ANTHROPIC_API_KEY.startswith('sk-ant-')
    and not ANTHROPIC_API_KEY.endswith('...')
)
if not ANTHROPIC_KEY_OK:
    print('WARNING: Anthropic API key looks wrong -- should start with sk-ant-')
if not SERPER_API_KEY:
    print('NOTE: No Serper key set. Search will only use the built-in database.')
if not TRUST_NAME.strip():
    print('WARNING: TRUST_NAME is empty -- set the organisation name above.')
print(f'Trust:   {TRUST_NAME}')
print(f'Model:   {MODEL}')
if ANTHROPIC_KEY_OK:
    print(f'API key: set ({ANTHROPIC_API_KEY[:12]}...)')
else:
    print('API key: NOT SET (placeholder or malformed value)')


In [None]:
# Cell 3: Imports and helper functions

import os, io, re, zipfile, tempfile, warnings, logging, urllib.parse, json as _json
from pathlib import Path
from urllib.parse import urljoin
warnings.filterwarnings('ignore')
logging.captureWarnings(True)
logging.getLogger('py.warnings').setLevel(logging.CRITICAL)

import requests
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
import anthropic

# Browser-like User-Agent strings. The first entry is used for URL
# verification and as the session default; download_file tries each entry
# in turn.
FALLBACK_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
]

# Maximum number of characters kept from any single PDF page (extract_pages).
CHARS_PER_PAGE = 3000
# Maximum total number of characters of extracted text placed in the prompt
# (enforced in Cell 7).
CHAR_LIMIT     = 400_000

# ---------------------------------------------------------------------------
# Trust URL database: lookup table + Serper fallback
# ---------------------------------------------------------------------------

_TRUST_DB = None

def _load_trust_db():
    """Return the trust URL database, loading and caching it on first use.

    The database is a list of entries. It is fetched from GitHub first; if
    that fails, a local ``trust_urls.json`` (in ``/content`` or the working
    directory) is used. When nothing usable is found an empty list is cached
    and returned, so callers never receive ``None``.
    """
    global _TRUST_DB
    if _TRUST_DB is not None:
        return _TRUST_DB
    # Try to fetch latest from GitHub
    raw_url = ('https://raw.githubusercontent.com/Davewest84/'
               'nhs-board-papers-reader/main/trust_urls.json')
    try:
        r = requests.get(raw_url, timeout=10)
        if r.status_code == 200:
            remote = r.json()
            # Callers iterate the database as a list of entries, so reject
            # any other JSON shape rather than failing later.
            if isinstance(remote, list):
                _TRUST_DB = remote
                print(f'  Database loaded: {len(_TRUST_DB)} organisations')
                return _TRUST_DB
            print('  Remote database has an unexpected format -- ignoring it.')
        else:
            print(f'  Remote database unavailable (HTTP {r.status_code}).')
    except Exception as e:
        # Deliberately best-effort: any network or parsing problem falls
        # through to the local copy, but the reason is now reported.
        print(f'  Could not fetch remote database: {e}')
    # Fallback: try local copy
    for p in [Path('/content/trust_urls.json'), Path('trust_urls.json')]:
        if not p.exists():
            continue
        try:
            local = _json.loads(p.read_text(encoding='utf-8'))
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and text-decoding errors.
            print(f'  Could not read {p}: {e}')
            continue
        if isinstance(local, list):
            _TRUST_DB = local
            print(f'  Database loaded (local): {len(_TRUST_DB)} organisations')
            return _TRUST_DB
        print(f'  {p} has an unexpected format -- ignoring it.')
    _TRUST_DB = []
    print('  WARNING: Trust database not available.')
    return _TRUST_DB

def _normalize(s):
    """Reduce an organisation name to a comparable core form.

    Lower-cases the name, collapses every run of whitespace to one space,
    drops a leading ``nhs`` and strips the common organisational suffixes
    (``nhs foundation trust``, ``nhs trust``, ``icb``, ...).

    Whitespace is collapsed *before* the suffixes are tested, so names
    typed with doubled spaces or tabs still lose their suffix. ``None`` or
    an empty string yields ``''``.
    """
    s = re.sub(r'\s+', ' ', (s or '').lower()).strip()
    s = re.sub(r'^nhs\s+', '', s)
    for suffix in (' nhs foundation trust', ' nhs trust', ' foundation trust',
                   ' nhs', ' integrated care board', ' icb'):
        if s.endswith(suffix):
            s = s[:-len(suffix)].strip()
    return s

def _lookup_trust(trust_name, trust_db):
    """Find the stored URL whose names best match ``trust_name``.

    Both the query and every stored name are passed through ``_normalize``.
    An exact match returns that entry's URL at once. Otherwise the entry
    with the longest substring overlap (either direction) wins; ``None`` is
    returned when nothing overlaps.
    """
    wanted = _normalize(trust_name)
    top_url = None
    top_score = 0
    for record in trust_db:
        for alias in record['names']:
            candidate = _normalize(alias)
            if candidate == wanted:
                # Exact match: no need to look any further.
                return record['url']
            if wanted in candidate:
                overlap = len(wanted)
            elif candidate in wanted:
                overlap = len(candidate)
            else:
                overlap = None
            if overlap is not None and overlap > top_score:
                top_url = record['url']
                top_score = overlap
    return top_url

def _verify_url(url):
    """Return url if it responds HTTP < 400, else None.

    A HEAD request is tried first. If it returns an error status *or raises*
    (some servers drop HEAD connections outright), a streamed GET is tried
    before giving up. Every failure is treated as "not reachable"; this
    probe never raises.
    """
    headers = {'User-Agent': FALLBACK_UAS[0]}
    try:
        r = requests.head(url, timeout=10, allow_redirects=True, headers=headers)
        r.close()
        if r.status_code < 400:
            return url
    except Exception:
        # Fall through to GET instead of reporting the URL as broken.
        pass
    # Some servers reject HEAD -- try GET
    try:
        # stream=True avoids downloading the body; only the status matters.
        r2 = requests.get(url, timeout=10, allow_redirects=True,
                          headers=headers, stream=True)
        r2.close()
        if r2.status_code < 400:
            return url
    except Exception:
        pass
    return None

def _search_serper(trust_name, api_key):
    """Search via Serper.dev (Google search API -- works from cloud IPs).

    Runs up to two queries and returns the first organic result whose URL
    contains a board-papers keyword, or ``None`` when nothing matches or
    the API reports an error.

    Args:
        trust_name: Organisation name, quoted verbatim in the query.
        api_key: Serper.dev API key; surrounding whitespace is ignored.
    """
    import datetime

    endpoint = 'https://google.serper.dev/search'
    headers = {'X-API-KEY': api_key.strip(), 'Content-Type': 'application/json'}
    board_kws = ['board-papers', 'board-meeting', 'board-meetings', 'trust-board',
                 'board_papers', 'board-pack', 'board-meetings-and-papers',
                 'governance/board']
    # Use the current year instead of a hard-coded one so the second query
    # keeps targeting recent papers as time passes.
    year = datetime.date.today().year
    queries = [
        f'"{trust_name}" board papers site:nhs.uk',
        f'"{trust_name}" board meeting papers {year}',
    ]
    for q in queries:
        try:
            resp = requests.post(endpoint, json={'q': q, 'num': 10, 'gl': 'uk'},
                                 headers=headers, timeout=15)
            if resp.status_code != 200:
                # A rejected key or exhausted quota would otherwise look
                # exactly like "no results".
                print(f'    Serper error: HTTP {resp.status_code} '
                      '-- check SERPER_API_KEY and quota')
                break
            data = resp.json()
            for result in data.get('organic', []):
                link = result.get('link', '')
                lowered = link.lower()
                if any(kw in lowered for kw in board_kws):
                    return link
        except Exception as e:
            print(f'    Serper error: {e}')
            break
    return None

def find_board_papers_url(trust_name):
    """Locate the board papers page for ``trust_name``.

    First the trust database is consulted and any stored URL is verified.
    If there is no usable stored URL, a Serper search is attempted when
    ``SERPER_API_KEY`` is set. Returns the URL, or ``None`` if neither
    route produces one.
    """
    db = _load_trust_db()
    stored = _lookup_trust(trust_name, db) if db else None
    if stored:
        print(f'  Database match: {stored}')
        if _verify_url(stored):
            print('  URL verified OK.')
            return stored
        print('  Stored URL appears broken -- searching for updated URL...')

    key = SERPER_API_KEY
    if not (key and key.strip()):
        print('  No Serper key set -- add SERPER_API_KEY in Cell 2 for search fallback.')
        return None

    print('  Searching via Serper...')
    found = _search_serper(trust_name, key)
    if not found:
        return None
    print(f'  Found via Serper: {found}')
    print('  (If this is correct and recurring, add to trust_urls.json)')
    return found

# ---------------------------------------------------------------------------
# Fetch index page and find document links
# ---------------------------------------------------------------------------

def get_document_links(session, index_url):
    """Collect candidate document links from the board papers index page.

    A link qualifies when its URL *path* ends in ``.pdf``, ``.zip`` or
    ``.docx`` (so ``report.pdf?v=2`` counts), or when its href contains one
    of the download-style keywords. Relative hrefs are resolved against
    ``index_url`` and duplicates are dropped.

    Args:
        session: ``requests.Session`` used to fetch the page.
        index_url: URL of the page listing the board papers.

    Returns:
        A list of ``{'text': ..., 'url': ...}`` dicts in page order; an
        empty list if the page cannot be fetched.
    """
    try:
        resp = session.get(index_url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        print(f'  Could not fetch index page: {e}')
        return []
    soup = BeautifulSoup(resp.text, 'html.parser')
    links, seen = [], set()
    doc_exts = ('.pdf', '.zip', '.docx')
    doc_kws  = ['download', 'document', '/file', 'attachment', 'board-paper']
    # Link schemes that can never be a downloadable document.
    skip_prefixes = ('#', 'mailto:', 'javascript:', 'tel:')
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        h = href.lower()
        if not href or h.startswith(skip_prefixes):
            continue
        text = a.get_text(strip=True) or href
        full = href if href.startswith('http') else urljoin(index_url, href)
        # Test the extension on the path only, ignoring ?query and #fragment.
        path = urllib.parse.urlparse(full).path.lower()
        if path.endswith(doc_exts) or any(k in h for k in doc_kws):
            if full not in seen:
                seen.add(full)
                links.append({'text': text[:100], 'url': full})
    return links

def pick_best_link(links, priority=None):
    """Choose the most promising document URL from ``links``.

    Selection order:
      1. the first link whose text or URL contains any priority term;
      2. otherwise the first link whose URL contains ``.pdf``;
      3. otherwise the first link.

    Args:
        links: List of ``{'text': ..., 'url': ...}`` dicts, in page order.
        priority: Optional iterable of substrings (matched
            case-insensitively) that mark a link as preferred. Defaults to
            the built-in year/month/board-pack terms; pass your own to keep
            the years current without editing this function.

    Returns:
        The chosen URL, or ``None`` when ``links`` is empty.
    """
    if not links:
        return None
    if priority is None:
        priority = ('2026', '2025', 'january', 'february', 'march', 'november',
                    'board-pack', 'combined', 'agenda')
    terms = [t.lower() for t in priority]
    for link in links:
        haystack = (link['text'] + ' ' + link['url']).lower()
        if any(t in haystack for t in terms):
            return link['url']
    for link in links:
        if '.pdf' in link['url'].lower():
            return link['url']
    return links[0]['url']

# ---------------------------------------------------------------------------
# Download
# ---------------------------------------------------------------------------

def download_file(session, url, referer):
    """Download ``url`` and return its bytes, or ``None`` on failure.

    Each User-Agent in ``FALLBACK_UAS`` is tried in turn. A response is
    accepted only when it is HTTP 200, larger than 10,000 bytes and looks
    like a PDF or ZIP. The last check stops a large HTML "access denied"
    page from being saved as if it were the board papers.

    Args:
        session: ``requests.Session`` used for the request.
        url: Document URL to download.
        referer: Value sent in the ``Referer`` header.
    """
    for i, ua in enumerate(FALLBACK_UAS):
        headers = {'User-Agent': ua, 'Referer': referer,
                   'Accept': 'application/pdf,application/zip,*/*'}
        try:
            resp = session.get(url, headers=headers, timeout=120)
            body = resp.content
        except Exception as e:
            print(f'  Attempt {i+1} failed: {e}')
            continue
        if resp.status_code != 200:
            print(f'  Attempt {i+1}: HTTP {resp.status_code}')
        elif len(body) <= 10_000:
            print(f'  Attempt {i+1}: response too small ({len(body):,} bytes)')
        elif not (body[:2] == b'PK' or b'%PDF' in body[:1024]):
            # PDF readers tolerate a little junk before the header, so
            # search the first KB; 'PK' marks ZIP (and .docx) archives.
            ctype = resp.headers.get('Content-Type', 'unknown')
            print(f'  Attempt {i+1}: response is not a PDF or ZIP (Content-Type: {ctype})')
        else:
            return body
    return None

def save_and_unpack(data, save_dir):
    """Write downloaded bytes to ``save_dir`` and return the PDF paths.

    Data starting with the ZIP magic ``PK`` is treated as an archive and
    every ``.pdf`` member (except macOS ``__MACOSX`` metadata) is extracted
    flat into ``save_dir``. Members that share a file name in different
    folders get a numeric suffix so they no longer overwrite each other.
    Anything else is saved as ``board_papers.pdf``.

    Args:
        data: Raw bytes of the download.
        save_dir: Directory to write into; created if missing.

    Returns:
        List of written PDF paths (empty when an archive holds no PDFs or
        cannot be read).
    """
    os.makedirs(save_dir, exist_ok=True)
    if data[:2] != b'PK':
        out = os.path.join(save_dir, 'board_papers.pdf')
        with open(out, 'wb') as f:
            f.write(data)
        print(f'  Saved: board_papers.pdf ({len(data):,} bytes)')
        return [out]

    print('  ZIP detected -- extracting PDFs...')
    paths = []
    used = set()  # lower-cased file names already written
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            for name in zf.namelist():
                if not name.lower().endswith('.pdf') or name.startswith('__MACOSX'):
                    continue
                # basename() keeps extraction inside save_dir; backslashes
                # are normalised for archives created on Windows.
                base = (os.path.basename(name.replace('\\', '/'))
                        or f'file_{len(paths)}.pdf')
                stem, ext = os.path.splitext(base)
                safe, n = base, 1
                while safe.lower() in used:
                    safe = f'{stem}_{n}{ext}'
                    n += 1
                used.add(safe.lower())
                out = os.path.join(save_dir, safe)
                with open(out, 'wb') as f:
                    f.write(zf.read(name))
                paths.append(out)
                print(f'  Extracted: {safe}')
    except zipfile.BadZipFile:
        print('  ZIP extraction failed.')
    if not paths:
        print('  No PDF files found in the archive.')
    return paths

# ---------------------------------------------------------------------------
# Text extraction
# ---------------------------------------------------------------------------

def extract_pages(pdf, start, end):
    """Return the text of pages ``start`` .. ``end - 1`` as one string.

    ``end`` is clamped to the document length. Each page with non-blank
    text contributes a ``-- Page N --`` header (1-based) followed by at
    most ``CHARS_PER_PAGE`` characters. Pages that raise during extraction
    are skipped silently.
    """
    last = min(end, len(pdf))
    collected = []
    for idx in range(start, last):
        try:
            page_text = pdf[idx].get_textpage().get_text_range()
            if page_text.strip():
                collected.append(f'-- Page {idx+1} --\n{page_text[:CHARS_PER_PAGE]}')
        except Exception:
            # Best-effort: an unreadable page must not abort the whole run.
            continue
    return '\n'.join(collected)

def find_section_starts(agenda_text, total):
    """Guess the start page of key report sections from the agenda text.

    For each section, looks for its keyword followed on the same line
    (within 60 characters) by a 1-3 digit number and treats that number as
    a 1-based page reference.

    Matching details:
      * keywords must start on a word boundary, so ``equality`` does not
        count as ``quality`` and ``reciprocal`` does not count as ``ipr``;
      * the number must be a whole number, not the tail of a longer one
        (``1250`` no longer yields ``250``);
      * every keyword occurrence is tried until one gives a page in the
        range 3..``total``.

    Args:
        agenda_text: Text of the first pages of the document.
        total: Number of pages in the document.

    Returns:
        Dict mapping section name to a 0-based start page index.
    """
    patterns = {
        'ceo_report':  r'\bchief executive[^\n]{0,60}?(?<!\d)(\d{1,3})\b',
        'finance':     r'\bfinance report[^\n]{0,60}?(?<!\d)(\d{1,3})\b',
        'performance': r'\b(?:integrated performance|ipr\b)[^\n]{0,60}?(?<!\d)(\d{1,3})\b',
        'quality':     r'\bquality[^\n]{0,60}?(?<!\d)(\d{1,3})\b',
        'workforce':   r'\b(?:people committee|workforce)[^\n]{0,60}?(?<!\d)(\d{1,3})\b',
    }
    text = (agenda_text or '').lower()
    secs = {}
    for name, pat in patterns.items():
        for m in re.finditer(pat, text):
            p = int(m.group(1))
            # Pages 1-2 are taken to be cover/agenda, and anything beyond
            # the document cannot be a page reference.
            if 3 <= p <= total:
                secs[name] = p - 1
                break
    return secs

def extract_targeted_text(pdf_paths):
    """Extract the agenda and key sections from each PDF.

    For every readable PDF the first (up to) six pages are stored as
    ``<file>__agenda``. If the agenda yields section page references, up to
    30 pages are read from each section start. Otherwise the document is
    read in three chunks; the first chunk starts after the agenda pages so
    the same text is not stored twice.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        Dict mapping ``'<file>__<section>'`` to extracted text. Files that
        cannot be opened are reported and skipped.
    """
    all_secs = {}
    for pdf_path in pdf_paths:
        label = os.path.basename(pdf_path)
        print(f'  Reading: {label}')
        try:
            pdf = pdfium.PdfDocument(pdf_path)
        except Exception as e:
            print(f'  Could not open: {e}')
            continue
        try:
            total = len(pdf)
            print(f'  Pages: {total}')
            agenda_end = min(6, total)
            agenda = extract_pages(pdf, 0, agenda_end)
            all_secs[f'{label}__agenda'] = agenda
            secs = find_section_starts(agenda, total)
            if secs:
                print(f'  Sections found: {list(secs.keys())}')
                for sname, start in secs.items():
                    all_secs[f'{label}__{sname}'] = extract_pages(pdf, start, min(start+30, total))
            else:
                print('  No agenda page refs -- reading in thirds')
                chunk = max(20, total // 3)
                # Pages before agenda_end are already held under __agenda.
                all_secs[f'{label}__part_1'] = extract_pages(pdf, agenda_end, chunk)
                all_secs[f'{label}__part_2'] = extract_pages(pdf, chunk, chunk*2)
                all_secs[f'{label}__part_3'] = extract_pages(pdf, chunk*2, total)
        finally:
            # Release the document handle; a failure to close must not
            # discard text that was already extracted.
            try:
                pdf.close()
            except Exception:
                pass
    return all_secs

# ---------------------------------------------------------------------------
# Prompt template
# ---------------------------------------------------------------------------

# Look for the prompt template in the Colab upload directory first, then in
# the current working directory; the first file found wins.
PROMPT_TEMPLATE = None
_template_paths = [Path('/content/prompt_template.txt'), Path('prompt_template.txt')]
for _p in _template_paths:
    if not _p.exists():
        continue
    PROMPT_TEMPLATE = _p.read_text(encoding='utf-8')
    print('Prompt template loaded.')
    break

print('Helper functions ready.')
if not PROMPT_TEMPLATE:
    # Later cells refuse to call the model without a template.
    for _line in ('\nWARNING: prompt_template.txt not found.',
                  'Upload it via the files panel (folder icon, left sidebar).'):
        print(_line)


In [None]:
# Cell 4: Find board papers page and document links
#
# Sets pdf_paths, board_papers_url and selected_url for the later cells
# (plus session, when a download will be needed).
# Three routes:
#   * MANUAL_PDF_PATH set           -> use the uploaded file, no network.
#   * URL points at a .pdf / .zip   -> use it directly as the download.
#   * otherwise                     -> scan the page for document links.

pdf_paths = []

if MANUAL_PDF_PATH:
    print(f"Using manually provided PDF: {MANUAL_PDF_PATH}")
    if not os.path.exists(MANUAL_PDF_PATH):
        print("WARNING: file not found -- check the filename and that the upload has finished.")
    board_papers_url = MANUAL_BOARD_PAPERS_URL or "(manual upload)"
    selected_url = MANUAL_PDF_PATH
    pdf_paths = [MANUAL_PDF_PATH]
else:
    board_papers_url = MANUAL_BOARD_PAPERS_URL.strip()

    if not board_papers_url:
        print(f"Searching for: {TRUST_NAME}")
        board_papers_url = find_board_papers_url(TRUST_NAME)

    if board_papers_url:
        print(f"\nBoard papers page: {board_papers_url}")
    else:
        print("\n" + "="*60)
        print("Search failed. Action needed:")
        print("  1. Visit the trust website in your browser")
        print("  2. Find the board papers or board meetings page")
        print("  3. Copy that page URL")
        print("  4. Paste it into MANUAL_BOARD_PAPERS_URL in Cell 2")
        print("  5. Re-run Cell 2, then this cell")
        print("="*60)
        raise SystemExit("Set MANUAL_BOARD_PAPERS_URL and re-run.")

    session = requests.Session()
    session.headers["User-Agent"] = FALLBACK_UAS[0]

    # A URL whose path ends in .pdf/.zip is the document itself. Parsing it
    # as an HTML index would find no links and send the user round in a loop.
    _url_path = urllib.parse.urlparse(board_papers_url).path.lower()
    if _url_path.endswith((".pdf", ".zip")):
        selected_url = board_papers_url
        print("\nURL points directly at a document -- skipping the link scan.")
        print(f"Selected: {selected_url}")
    else:
        # Warm-up request; failures are ignored.
        # NOTE(review): presumably this picks up cookies before the real
        # fetch -- confirm it is still needed.
        try: session.get(board_papers_url, timeout=20)
        except Exception: pass

        print("\nFetching document links...")
        links = get_document_links(session, board_papers_url)

        if links:
            print(f"Found {len(links)} document link(s):")
            for i, link in enumerate(links[:15]):
                print(f"  [{i}] {link['text'][:70]}")
                print(f"       {link['url']}")
            selected_url = pick_best_link(links)
            print(f"\nAuto-selected: {selected_url}")
            print("\nTo use a different link: paste its URL into MANUAL_BOARD_PAPERS_URL")
            print("in Cell 2, then re-run Cell 2 and this cell.")
        else:
            print("\n" + "="*60)
            print("No document links found on the index page. Action needed:")
            print("  1. Visit the board papers page in your browser:")
            print(f"     {board_papers_url}")
            print("  2. Right-click the PDF and copy its URL")
            print("  3. Paste into MANUAL_BOARD_PAPERS_URL in Cell 2")
            print("  4. Re-run Cell 2, then re-run this cell")
            print("  (If the URL does not end in .pdf or .zip, download the file")
            print("   yourself, upload it, and set MANUAL_PDF_PATH in Cell 2 instead.)")
            print("="*60)
            raise SystemExit("Set MANUAL_BOARD_PAPERS_URL to the direct PDF URL and re-run.")

In [None]:
# Cell 5: Download PDF
#
# Downloads selected_url (chosen in Cell 4) unless a PDF is already
# available. A manually uploaded PDF named in MANUAL_PDF_PATH is picked up
# here too, so after a failed download it is enough to re-run Cell 2 and
# this cell.

if MANUAL_PDF_PATH and not globals().get('pdf_paths'):
    print(f"Using manually provided PDF: {MANUAL_PDF_PATH}")
    if not os.path.exists(MANUAL_PDF_PATH):
        print("WARNING: file not found -- check the filename and that the upload has finished.")
    pdf_paths = [MANUAL_PDF_PATH]

if 'pdf_paths' not in globals() or (not pdf_paths and 'selected_url' not in globals()):
    # Nothing to download and nothing uploaded: Cell 4 has not completed.
    raise SystemExit("Run Cell 4 first.")

if not pdf_paths:
    save_dir = tempfile.mkdtemp(prefix="nhspapers_")
    print(f"Downloading: {selected_url}")
    data = download_file(session, selected_url, board_papers_url)

    if data is not None:
        pdf_paths = save_and_unpack(data, save_dir)
    else:
        print()
        print("Sorry this site blocks automated downloads - if you like you can manually")
        print("upload a board paper PDF to the file panel on the left, and I will process it.")
        print()
        print("Steps:")
        print("  1. Download the PDF from the trust website in your browser")
        print("  2. Click the folder icon in Colab's left panel")
        print("  3. Click the upload icon and select your PDF")
        print("  4. Set MANUAL_PDF_PATH = 'your_filename.pdf' in Cell 2")
        print("  5. Re-run Cell 2, then re-run this cell (Cell 5)")
        raise SystemExit("Upload PDF manually and set MANUAL_PDF_PATH in Cell 2.")

if pdf_paths:
    print(f"\nReady: {[os.path.basename(p) for p in pdf_paths]}")

In [None]:
# Cell 6: Extract text from PDF(s)
#
# Runs the targeted extraction over every path in pdf_paths and reports how
# many sections and characters were collected. Results land in `extracted`.

if pdf_paths:
    print("Extracting text...")
    extracted = extract_targeted_text(pdf_paths)
    total_chars = sum(map(len, extracted.values()))
    print(f"\nDone: {len(extracted)} section(s), {total_chars:,} characters")
else:
    print("No PDFs to process. Run Cell 5 first.")

In [None]:
# Cell 7: Analyse with Claude
#
# Builds the prompt from the extracted sections (capped at CHAR_LIMIT
# characters), sends it to the configured model and stores the reply in
# `story_leads`.

if not globals().get('extracted'):
    print("No extracted text. Run Cell 6 first.")
elif PROMPT_TEMPLATE is None:
    print("ERROR: prompt_template.txt not found. Upload it via the files panel.")
else:
    parts, total_chars = [], 0
    for section, text in extracted.items():
        if not text.strip(): continue
        header = f"\n\n=== {section.upper().replace('_', ' ')} ===\n"
        room = CHAR_LIMIT - total_chars - len(header)
        if room <= 0:
            print("Character limit reached — truncating")
            break
        # Cut an oversized section down to the remaining budget instead of
        # dropping it (and every later section) entirely.
        overflow = len(text) > room
        kept = text[:room]
        parts.append(header + kept)
        total_chars += len(header) + len(kept)
        if overflow:
            print("Character limit reached — truncating")
            break

    if not parts:
        # e.g. a scanned PDF with no text layer: nothing worth sending.
        print("No readable text was extracted from the PDF(s) -- nothing to analyse.")
    else:
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

        combined_text = "".join(parts)
        print(f"Sending {total_chars:,} characters to {MODEL}...")

        prompt = (
            PROMPT_TEMPLATE
            .replace("{{TRUST_NAME}}", TRUST_NAME)
            .replace("{{BOARD_PAPERS_URL}}", str(globals().get('board_papers_url') or '(unknown)'))
            .replace("{{EXTRACTED_TEXT}}", combined_text)
        )

        try:
            message = client.messages.create(
                model=MODEL,
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt}],
            )
        except anthropic.APIError as e:
            # Show a short explanation first; the exception still
            # propagates so the full detail is available.
            print(f"Claude API request failed: {e}")
            raise

        usage = message.usage
        print(f"Tokens: {usage.input_tokens:,} in / {usage.output_tokens:,} out")
        # Join every text block rather than assuming the first block exists
        # and is text.
        story_leads = "".join(
            block.text for block in message.content
            if getattr(block, "type", None) == "text"
        )
        if message.stop_reason == "max_tokens":
            print("WARNING: the response hit the max_tokens limit and may be cut off.")

        print("\n" + "=" * 60)
        print("STORY LEADS")
        print("=" * 60 + "\n")
        print(story_leads)

In [None]:
# Cell 8: Save and download results
#
# Writes the story leads to '<trust>_leads.md' in the working directory and,
# when running in Colab, triggers a browser download of that file.

if 'story_leads' not in globals():
    print("No results yet. Run Cell 7 first.")
else:
    import re

    # Replace every character that is invalid in file names on common
    # platforms (not just '/'), so any trust name gives a writable path.
    safe_name = re.sub(r'[\\/:*?"<>|]', '-', TRUST_NAME.strip()).replace(" ", "_") or "trust"
    output_file = f"{safe_name}_leads.md"

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"# Story leads: {TRUST_NAME}\n\n")
        f.write(f"Source: {board_papers_url}\n\n---\n\n")
        f.write(story_leads)

    print(f"Saved: {output_file}")

    try:
        from google.colab import files
        files.download(output_file)
        print("Downloading to your computer...")
    except ImportError:
        print("(Running locally — file saved to current directory)")