# NHS Board Papers Analyser

Enter a trust name, run all cells in order, and receive structured story leads.

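**You will need:** An [Anthropic API key](https://console.anthropic.com) (create a free account, then go to API Keys), and the `prompt_template.txt` file uploaded to the notebook's files panel (in Colab, the `/content` directory).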

**Cost per run:** approximately £0.50–£1.50 depending on pack size (using Claude Opus). Use `claude-sonnet-4-6` in the config cell for ~5x cheaper results.

---

In [None]:
# Cell 1: Install dependencies (run once per session)
# The leading "!" is notebook shell syntax, so this line only works inside a
# Jupyter/Colab cell, not in a plain Python script.
# Packages: anthropic (Claude API client), pypdfium2 (PDF text extraction),
# duckduckgo-search (locating the board papers page), requests (HTTP) and
# beautifulsoup4 (parsing the index page for document links).
!pip install -q anthropic pypdfium2 duckduckgo-search requests beautifulsoup4
print("Dependencies installed.")

---
## Configure here

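Edit the three required values below (API key, trust name and model), then run the cell. The two `MANUAL_…` settings are optional and can be left as empty strings.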

In [None]:
# Cell 2: Configuration — edit these values

ANTHROPIC_API_KEY = "sk-ant-..."          # Your Anthropic API key
TRUST_NAME        = "Sussex Community NHS Foundation Trust"
MODEL             = "claude-opus-4-6"     # or "claude-sonnet-4-6" for cheaper runs

# Optional: paste the board papers page URL here to skip the search step
# Leave as "" to search automatically
MANUAL_BOARD_PAPERS_URL = ""

# Optional: if the download fails and you upload a PDF manually,
# set this to the filename after uploading (e.g. "board_papers.pdf")
MANUAL_PDF_PATH = ""

# ── Validation ──
def check_api_key(key):
    """Return a warning message if *key* cannot be a usable API key, else None.

    This is a cheap local sanity check only; it does not contact the API.
    It catches the three mistakes that are easy to make in this cell:
    stray whitespace from copy/paste, a value that is not a key at all,
    and the untouched "sk-ant-..." placeholder (which would otherwise pass
    the prefix check and be reported as set).
    """
    if key != key.strip():
        return "API key has leading or trailing whitespace — remove it"
    if not key.startswith("sk-"):
        return "API key does not look right — check it starts with sk-ant-"
    if key.endswith("..."):
        return "API key is still the placeholder — paste your real key above"
    return None


_key_problem = check_api_key(ANTHROPIC_API_KEY)
if _key_problem:
    print(f"WARNING: {_key_problem}")
else:
    print(f"Trust:     {TRUST_NAME}")
    print(f"Model:     {MODEL}")
    # Only the first 12 characters are echoed so the notebook output does not
    # contain the whole key.
    print(f"API key:   set ({ANTHROPIC_API_KEY[:12]}...)")

In [None]:
# Cell 3: Imports and helper functions

import os, io, re, sys, zipfile, tempfile
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
import anthropic
from duckduckgo_search import DDGS

# Browser-like User-Agent strings. They share the same engine/browser suffix
# and differ only in the platform token.
_UA_TAIL = "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
FALLBACK_UAS = [
    f"Mozilla/5.0 ({platform}) {_UA_TAIL}"
    for platform in (
        "Windows NT 10.0; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7",
    )
]

# Substrings looked for in a URL to decide it is a board papers page.
BOARD_PAPER_KWS = (
    "board-papers board-meeting board-meetings trust-board "
    "board-of-directors board_papers board-pack"
).split()

# Per-page cap on extracted characters, and overall cap on text sent for analysis.
CHARS_PER_PAGE = 3_000
CHAR_LIMIT = 400_000

# ── Search ──
def find_board_papers_url(trust_name, year=None):
    """Search DuckDuckGo for the trust's board papers page.

    Runs up to three queries (most specific first) and returns the first
    result URL that contains one of BOARD_PAPER_KWS.

    Args:
        trust_name: Full trust name; it is quoted in every query.
        year: Year to bias the search towards. Defaults to the current
            calendar year, so the queries do not go stale; the previous
            year is also included in the first query.

    Returns:
        The matching URL as a string, or None if no query produced one.
    """
    if year is None:
        from datetime import date  # local import: not in the file's import block
        year = date.today().year

    queries = [
        f'"{trust_name}" board papers {year - 1} OR {year} site:nhs.uk',
        f'"{trust_name}" board meeting papers site:nhs.uk',
        f'"{trust_name}" NHS board papers minutes {year}',
    ]
    with DDGS() as ddg:
        for query in queries:
            try:
                for r in ddg.text(query, max_results=8):
                    # A missing or None href must not abort the remaining results.
                    url = r.get("href") or ""
                    if any(kw in url.lower() for kw in BOARD_PAPER_KWS):
                        return url
            except Exception as e:
                # Best effort: a failed query should not stop the later ones.
                print(f"  Search attempt failed: {e}")
    return None

# ── Fetch index and find links ──
def get_document_links(session, index_url):
    """Fetch the index page and return links that look like downloadable documents.

    A link qualifies when its URL path ends in .pdf/.zip/.docx (query string
    and fragment are ignored, so "pack.pdf?v=2" counts) or when its href
    contains one of the download-style keywords. Only http(s) targets are
    returned, so "javascript:" and "mailto:" hrefs are skipped.

    Args:
        session: Object with a requests-style ``get(url, timeout=...)`` method.
        index_url: URL of the board papers index page; relative hrefs are
            resolved against it.

    Returns:
        A list of ``{"text": <link text, max 100 chars>, "url": <absolute URL>}``
        dicts in page order without duplicate URLs; an empty list if the page
        could not be fetched.
    """
    from urllib.parse import urlparse  # local import: the file only imports urljoin

    try:
        resp = session.get(index_url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        print(f"  Could not fetch index page: {e}")
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    doc_exts = (".pdf", ".zip", ".docx")
    doc_kws = ("download", "document", "/file", "attachment", "board-paper")
    links, seen = [], set()
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        try:
            full = urljoin(index_url, href)
            parsed = urlparse(full)
        except ValueError:
            # Malformed href (e.g. a broken bracketed host); nothing to download.
            continue
        if parsed.scheme not in ("http", "https"):
            continue
        h = href.lower()
        has_doc_ext = parsed.path.lower().endswith(doc_exts)
        has_doc_kw = any(k in h for k in doc_kws)
        if (has_doc_ext or has_doc_kw) and full not in seen:
            seen.add(full)
            text = a.get_text(strip=True) or href
            links.append({"text": text[:100], "url": full})
    return links

def pick_best_link(links):
    """Choose the URL to download from the candidate links.

    Preference order: the first link whose text or URL contains any hint
    term, then the first link whose URL contains ".pdf", then the first
    link. Returns None when there are no links.
    """
    if not links:
        return None

    hints = ("2026", "2025", "january", "february", "march", "november",
             "board-pack", "combined", "agenda")

    def mentions_hint(entry):
        haystack = f'{entry["text"]} {entry["url"]}'.lower()
        return any(term in haystack for term in hints)

    hinted = next((entry["url"] for entry in links if mentions_hint(entry)), None)
    if hinted is not None:
        return hinted

    first_pdf = next(
        (entry["url"] for entry in links if ".pdf" in entry["url"].lower()),
        None,
    )
    if first_pdf is not None:
        return first_pdf
    return links[0]["url"]

# ── Download ──
def download_file(session, url, referer):
    """Download a board pack, retrying once per fallback User-Agent.

    A response is accepted only if it is HTTP 200, larger than 10,000 bytes,
    and looks like a ZIP (starts with "PK") or a PDF ("%PDF" within the first
    1024 bytes). The last check rejects HTML "access denied" pages that some
    sites serve with status 200, which would otherwise be saved as a PDF and
    fail later at extraction.

    Args:
        session: Object with a requests-style ``get(url, headers=..., timeout=...)``.
        url: Document URL.
        referer: Value sent as the Referer header (the index page URL).

    Returns:
        The response body as bytes, or None if every attempt failed.
    """
    for i, ua in enumerate(FALLBACK_UAS):
        headers = {"User-Agent": ua, "Referer": referer,
                   "Accept": "application/pdf,application/zip,*/*"}
        try:
            resp = session.get(url, headers=headers, timeout=120)
            body = resp.content
        except Exception as e:
            print(f"  Attempt {i+1} failed: {e}")
            continue

        if resp.status_code != 200:
            print(f"  Attempt {i+1}: HTTP {resp.status_code}")
        elif len(body) <= 10_000:
            print(f"  Attempt {i+1}: HTTP 200 but only {len(body):,} bytes — not a board pack")
        elif body[:2] != b"PK" and b"%PDF" not in body[:1024]:
            print(f"  Attempt {i+1}: HTTP 200 but the response is not a PDF or ZIP")
        else:
            return body
    return None

def save_and_unpack(data, save_dir):
    """Write downloaded bytes to *save_dir* and return the resulting PDF paths.

    Data starting with "PK" is treated as a ZIP: every member ending in
    ".pdf" (except macOS "__MACOSX" resource entries) is extracted. Members
    are flattened to their base name, which also prevents paths escaping
    *save_dir*; when two members share a base name the later one gets a
    numeric suffix ("report_1.pdf") instead of overwriting the earlier one.
    Anything else is saved as "board_papers.pdf".

    Args:
        data: Raw bytes of the download.
        save_dir: Directory to write into; created if missing.

    Returns:
        List of written file paths. For a corrupt ZIP this is whatever was
        extracted before the failure (possibly an empty list).
    """
    os.makedirs(save_dir, exist_ok=True)

    if data[:2] != b"PK":
        out = os.path.join(save_dir, "board_papers.pdf")
        with open(out, "wb") as f:
            f.write(data)
        print(f"  Saved: board_papers.pdf ({len(data):,} bytes)")
        return [out]

    print("  ZIP detected — extracting PDFs...")
    paths = []
    used_names = set()  # lower-cased, so case-insensitive filesystems are safe too
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            for name in zf.namelist():
                if not name.lower().endswith(".pdf") or name.startswith("__MACOSX"):
                    continue
                base = os.path.basename(name) or f"file_{len(paths)}.pdf"
                stem, ext = os.path.splitext(base)
                safe, n = base, 1
                while safe.lower() in used_names:
                    safe = f"{stem}_{n}{ext}"
                    n += 1
                used_names.add(safe.lower())
                out = os.path.join(save_dir, safe)
                with open(out, "wb") as f:
                    f.write(zf.read(name))
                paths.append(out)
                print(f"  Extracted: {safe}")
    except zipfile.BadZipFile:
        print("  ZIP extraction failed.")
    return paths

# ── Extract text ──
def extract_pages(pdf, start, end):
    """Return the text of pages ``start`` (inclusive) to ``end`` (exclusive).

    Page indices are 0-based and ``end`` is clamped to the document length.
    Each non-blank page contributes a "-- Page N --" header (1-based) and at
    most CHARS_PER_PAGE characters of text. Pages that fail to extract are
    skipped, and a one-line summary of them is printed so missing content is
    visible instead of silently dropped.

    Args:
        pdf: Document supporting ``len()`` and indexing to page objects that
            provide ``get_textpage().get_text_range()``.
        start: First page index to read.
        end: Index after the last page to read.

    Returns:
        The page chunks joined with newlines; "" if nothing was readable.
    """
    parts = []
    skipped = []
    for i in range(start, min(end, len(pdf))):
        try:
            text = pdf[i].get_textpage().get_text_range()
        except Exception:
            # Best effort: one bad page must not abort the whole extraction.
            skipped.append(i + 1)
            continue
        if text.strip():
            parts.append(f"-- Page {i+1} --\n{text[:CHARS_PER_PAGE]}")
    if skipped:
        listed = ", ".join(str(n) for n in skipped)
        print(f"  Skipped {len(skipped)} unreadable page(s): {listed}")
    return "\n".join(parts)

def find_section_starts(agenda_text, total):
    """Find 0-based start pages of key sections from the agenda text.

    For each section, the first agenda line mentioning its title is searched
    for the first standalone 1-3 digit number within 60 characters after the
    title; that number is taken as the 1-based page reference.

    The number must not be the tail of a longer number, so a year such as
    "2025" is skipped rather than read as page 25. "quality" and "ipr" must
    start at a word boundary, so "equality" and "reciprocal" do not match.

    Args:
        agenda_text: Text of the opening (agenda) pages.
        total: Number of pages in the document.

    Returns:
        Dict mapping section name to 0-based page index, containing only
        sections whose page reference lies in ``3..total``.
    """
    # Up to 60 same-line characters, then a number not preceded by a digit.
    page_ref = r"[^\n]{0,60}?(?<!\d)(\d{1,3})\b"
    patterns = {
        "ceo_report":  r"chief executive" + page_ref,
        "finance":     r"finance report" + page_ref,
        "performance": r"(?:integrated performance|\bipr\b)" + page_ref,
        "quality":     r"\bquality" + page_ref,
        "workforce":   r"(?:people committee|workforce)" + page_ref,
    }
    haystack = agenda_text.lower()  # lower-case once, not once per pattern
    secs = {}
    for name, pat in patterns.items():
        m = re.search(pat, haystack)
        if m:
            p = int(m.group(1))
            # Pages 1-2 are assumed to be cover/agenda, so refs below 3 are ignored.
            if 3 <= p <= total:
                secs[name] = p - 1
    return secs

def extract_targeted_text(pdf_paths):
    """Extract the agenda and key sections from each PDF.

    For every file the first (up to) six pages are stored as "<file>__agenda".
    If the agenda yields section page references, up to 30 pages are read from
    each section start; otherwise the document is read in three parts of at
    least 20 pages each. Files that cannot be opened are reported and skipped.

    Returns:
        Dict mapping "<file name>__<section>" to extracted text.
    """
    collected = {}
    for path in pdf_paths:
        filename = os.path.basename(path)
        print(f"  Reading: {filename}")
        try:
            document = pdfium.PdfDocument(path)
        except Exception as err:
            print(f"  Could not open: {err}")
            continue

        page_count = len(document)
        print(f"  Pages: {page_count}")

        front_matter = extract_pages(document, 0, min(6, page_count))
        collected[f"{filename}__agenda"] = front_matter

        starts = find_section_starts(front_matter, page_count)
        if not starts:
            print("  No agenda refs found — reading in thirds")
            third = max(20, page_count // 3)
            bounds = ((0, third), (third, third * 2), (third * 2, page_count))
            for number, (first, last) in enumerate(bounds, start=1):
                collected[f"{filename}__part_{number}"] = extract_pages(document, first, last)
            continue

        print(f"  Sections: {list(starts.keys())}")
        for section, first_page in starts.items():
            last_page = min(first_page + 30, page_count)
            collected[f"{filename}__{section}"] = extract_pages(document, first_page, last_page)
    return collected

# ── Load prompt ──
# Look for the template in the working directory first, then in /content.
# PROMPT_TEMPLATE stays None when neither file exists.
PROMPT_TEMPLATE = None
for _candidate in (Path("prompt_template.txt"), Path("/content/prompt_template.txt")):
    if _candidate.exists():
        # read_text opens and closes the file itself, so no handle is leaked.
        PROMPT_TEMPLATE = _candidate.read_text(encoding="utf-8")
        break

print("Helper functions loaded.")
if PROMPT_TEMPLATE:
    print("Prompt template loaded.")
else:
    print("WARNING: prompt_template.txt not found. Upload it to the files panel or the /content directory.")

In [None]:
# Cell 4: Find board papers page and document links
#
# Sets the globals used by the later cells: board_papers_url, selected_url and
# pdf_paths (plus session and links when the automatic route is taken).

if MANUAL_PDF_PATH:
    # Manual route: a local PDF was configured, so no search or HTTP session
    # is needed. pdf_paths is non-empty, which makes Cell 5 skip the download.
    print(f"Using manually provided PDF: {MANUAL_PDF_PATH}")
    board_papers_url = MANUAL_BOARD_PAPERS_URL or "(manual upload)"
    selected_url = MANUAL_PDF_PATH
    pdf_paths = [MANUAL_PDF_PATH]
else:
    board_papers_url = MANUAL_BOARD_PAPERS_URL

    if not board_papers_url:
        print(f"Searching for board papers page: {TRUST_NAME}...")
        board_papers_url = find_board_papers_url(TRUST_NAME)
        if board_papers_url:
            print(f"Found: {board_papers_url}")
        else:
            # Search found nothing: fall back to asking the user interactively.
            print("Could not find automatically.")
            board_papers_url = input("Paste the board papers URL: ").strip()
    else:
        print(f"Using provided URL: {board_papers_url}")

    # One session is shared by this cell and the download in Cell 5.
    session = requests.Session()
    session.headers["User-Agent"] = FALLBACK_UAS[0]
    # Visit index page to pick up session cookies
    # (best effort: a failure here is ignored; get_document_links below
    # fetches the page again and reports any error itself)
    try: session.get(board_papers_url, timeout=20)
    except Exception: pass

    print("\nFetching document links...")
    links = get_document_links(session, board_papers_url)

    if links:
        print(f"\nFound {len(links)} document link(s):")
        # Only the first 15 are listed; pick_best_link still considers all of them.
        for i, link in enumerate(links[:15]):
            print(f"  [{i}] {link['text'][:70]}")
        selected_url = pick_best_link(links)
        print(f"\nAuto-selected: {selected_url}")
    else:
        print("No links found on index page.")
        selected_url = input("Paste the direct PDF URL: ").strip()
    # Empty list tells Cell 5 that the document still has to be downloaded.
    pdf_paths = []

In [None]:
# Cell 5: Download PDF
# If the download fails, the cell will tell you what to do.
#
# Reads selected_url, session and board_papers_url from Cell 4 and leaves the
# list of local PDF files in pdf_paths for Cell 6.

if not pdf_paths:  # skip if PDF already provided
    save_dir = tempfile.mkdtemp(prefix="nhspapers_")
    print(f"Downloading: {selected_url}")
    data = download_file(session, selected_url, board_papers_url)

    if data is not None:
        pdf_paths = save_and_unpack(data, save_dir)
    else:
        print("""
Sorry this site blocks automated downloads - if you like you can manually
upload a board paper PDF to the file panel on the left, and I will process it.
""")
        # Colab file upload
        try:
            from google.colab import files
        except ImportError:
            files = None  # running locally, not in Colab

        if files is not None:
            print("Use the upload button above, or run: uploaded = files.upload()")
            uploaded = files.upload()
            # Keep every uploaded file, not just the first: board packs are
            # often split into several PDFs and the later cells accept a list.
            pdf_paths = list(uploaded or {})
            for fname in pdf_paths:
                print(f"Using uploaded file: {fname}")
        else:
            path = input("Enter path to manually downloaded PDF: ").strip()
            if path and os.path.isfile(path):
                pdf_paths = [path]
            elif path:
                # Catch a mistyped path here rather than as an open error in Cell 6.
                print(f"File not found: {path}")

if pdf_paths:
    print(f"\nReady to process {len(pdf_paths)} PDF(s): {[os.path.basename(p) for p in pdf_paths]}")
else:
    print("No PDF to process. Check the download or upload a file manually.")

In [None]:
# Cell 6: Extract text from PDF(s)
# Fills `extracted` (section label -> text) from the files in pdf_paths.

if not pdf_paths:
    print("No PDFs to extract from. Run Cell 5 first.")
else:
    print("Extracting text...")
    extracted = extract_targeted_text(pdf_paths)
    total_chars = sum(map(len, extracted.values()))
    print(f"\nExtracted {len(extracted)} section(s), {total_chars:,} characters total")

In [None]:
# Cell 7: Analyse with Claude
#
# Combines the extracted sections (up to CHAR_LIMIT characters), fills in the
# prompt template and stores the model's answer in story_leads. story_leads is
# only set when the API call actually ran.

if not pdf_paths or not extracted:
    print("No extracted text. Run Cell 6 first.")
elif PROMPT_TEMPLATE is None:
    print("ERROR: prompt_template.txt not found. Upload it to the files panel.")
else:
    # Build combined text up to character limit
    parts = []
    total_chars = 0
    for section, text in extracted.items():
        if not text.strip():
            continue
        header = f"\n\n=== {section.upper().replace('_', ' ')} ===\n"
        room = CHAR_LIMIT - total_chars - len(header)
        if room <= 0:
            print("  Character limit reached — truncating")
            break
        if len(text) > room:
            # Keep the part of this section that still fits instead of
            # dropping the whole section.
            print("  Character limit reached — truncating")
            parts.append(header + text[:room])
            total_chars += len(header) + room
            break
        parts.append(header + text)
        total_chars += len(header) + len(text)

    combined_text = "".join(parts)

    if not combined_text.strip():
        # e.g. a scanned PDF without a text layer: do not pay for an API call
        # that has no source text to analyse.
        print("No readable text was extracted from the PDF(s) — nothing sent to the model.")
    else:
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        print(f"Sending {total_chars:,} characters to {MODEL}...")

        prompt = (
            PROMPT_TEMPLATE
            .replace("{{TRUST_NAME}}", TRUST_NAME)
            .replace("{{BOARD_PAPERS_URL}}", board_papers_url)
            .replace("{{EXTRACTED_TEXT}}", combined_text)
        )

        message = client.messages.create(
            model=MODEL,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
        )

        usage = message.usage
        print(f"Tokens used: {usage.input_tokens:,} in / {usage.output_tokens:,} out")
        if message.stop_reason == "max_tokens":
            print("WARNING: the response hit the max_tokens limit, so the leads below may be cut off.")

        # Join all text blocks; normally there is exactly one.
        story_leads = "".join(
            block.text for block in message.content
            if getattr(block, "type", None) == "text"
        )

        print("\n" + "=" * 60)
        print("STORY LEADS")
        print("=" * 60 + "\n")
        print(story_leads)

In [None]:
# Cell 8: Save output
# Writes the story leads to "<trust name>_leads.md" and, when running in
# Colab, also triggers a browser download of that file.

if "story_leads" not in dir():
    print("No story leads to save. Run Cell 7 first.")
else:
    safe_name = TRUST_NAME.replace(" ", "_").replace("/", "-")
    output_file = f"{safe_name}_leads.md"

    # Title, source line and the leads themselves, written in one go.
    document = "".join([
        f"# Story leads: {TRUST_NAME}\n\n",
        f"Source: {board_papers_url}\n\n---\n\n",
        story_leads,
    ])
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(document)

    print(f"Saved: {output_file}")

    # Auto-download in Colab
    try:
        from google.colab import files
        files.download(output_file)
        print("Downloading to your computer...")
    except ImportError:
        print("(Running locally — file saved to current directory)")