# Golden Set Annotation Notebook

Use this notebook to review preprocessed documents for the Phase 1 golden set and record human annotations that mirror model outputs. The flow is:

1) Load a `run_id`'s `documents.parquet` from `data/processed/{run_id}/`.
2) Preview documents and select spans/sections to annotate.
3) Add annotations to an in-memory buffer.
4) Append annotations to `data/annotations/human/annotations.parquet` (and `annotations.jsonl`) in an append-only manner, preserving run_id and document_id.

Keep the file append-only. If you need to correct an entry, append a new record with the fix and an explanatory note.


In [27]:
from pathlib import Path
from datetime import datetime
import json
import uuid
import pandas as pd

# Paths
PIPELINE_ROOT = Path(".." ).resolve()
REPO_ROOT = PIPELINE_ROOT.parent
DATA_DIR = REPO_ROOT / "data"
ANNOTATIONS_DIR = DATA_DIR / "annotations" / "human"
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

# Configure the run to review
RUN_ID = "gs-phase1-20260107-143822"  # update to the run_id you want to annotate
DOCUMENTS_PATH = DATA_DIR / "processed" / RUN_ID / "documents.parquet"
ANNOTATIONS_PARQUET = ANNOTATIONS_DIR / "annotations.parquet"
ANNOTATIONS_JSONL = ANNOTATIONS_PARQUET.with_suffix(".jsonl")

print(f"Using run: {RUN_ID}")
print(f"Documents: {DOCUMENTS_PATH}")
print(f"Annotations (parquet): {ANNOTATIONS_PARQUET}")
print(f"Annotations (jsonl):   {ANNOTATIONS_JSONL}")


Using run: gs-phase1-20260107-143822
Documents: /Users/84rt/Projects/AI Risk Observatory/data/processed/gs-phase1-20260107-143822/documents.parquet
Annotations (parquet): /Users/84rt/Projects/AI Risk Observatory/data/annotations/human/annotations.parquet
Annotations (jsonl):   /Users/84rt/Projects/AI Risk Observatory/data/annotations/human/annotations.jsonl


In [28]:
def load_documents(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Documents not found: {path}")
    df = pd.read_parquet(path)
    return df

# Load documents for the configured run
DOCS = load_documents(DOCUMENTS_PATH)
print(f"Loaded {len(DOCS)} documents")
DOCS.head(3)[[
    "document_id",
    "company_name",
    "cni_sector",
    "year",
    "source_format",
    "spans_original",
    "spans_retained",
]]

Loaded 26 documents


Unnamed: 0,document_id,company_name,cni_sector,year,source_format,spans_original,spans_retained
0,00033774-2024-chemicals-ixbrl,Johnson Matthey plc,Chemicals,2024,ixbrl,18021,18021
1,00033774-2023-chemicals-ixbrl,Johnson Matthey plc,Chemicals,2023,ixbrl,15773,15773
2,07524813-2024-civil-nuclear-ixbrl,Rolls-Royce Holdings plc,Civil Nuclear,2024,ixbrl,8404,8404


In [29]:
def preview_document(document_id: str, chars: int = 1200):
    row = DOCS.loc[DOCS.document_id == document_id]
    if row.empty:
        raise ValueError(f"document_id not found: {document_id}")
    rec = row.iloc[0].to_dict()
    print(json.dumps({
        "document_id": rec["document_id"],
        "company": rec["company_name"],
        "sector": rec["cni_sector"],
        "year": rec["year"],
        "format": rec["source_format"],
        "spans": rec["spans_original"],
    }, indent=2))
    snippet = rec["text_markdown"][:chars]
    print("\n--- Preview (truncated) ---\n")
    print(snippet)

# Example: replace with a real document_id from DOCS
# preview_document(DOCS.iloc[0].document_id)


In [30]:
annotations_buffer: list[dict] = []


def add_annotation(
    document_id: str,
    *,
    dimension: str,
    label: str,
    text_excerpt: str,
    report_section: str | None = None,
    ai_specificity: str = "specific",
    frontier_tech_flag: bool = False,
    tier_1_category: str | None = None,
    tier_2_driver: str | None = None,
    specificity_level: str | None = None,
    materiality_signal: str | None = None,
    mitigation_mentioned: bool | None = None,
    governance_maturity: str | None = None,
    severity: str | None = None,
    mitigation_score: float | None = None,
    confidence: float = 1.0,
    classifier_id: str = "human_golden",
    classifier_version: str = "v1",
) -> dict:
    """Add one annotation record to the in-memory buffer."""
    row = DOCS.loc[DOCS.document_id == document_id]
    if row.empty:
        raise ValueError(f"document_id not found: {document_id}")
    rec = row.iloc[0].to_dict()

    annotation = {
        "annotation_id": f"ann-{uuid.uuid4().hex[:12]}",
        "document_id": rec["document_id"],
        "company_number": rec["company_number"],
        "company_name": rec["company_name"],
        "ticker": rec.get("ticker"),
        "lei": rec.get("lei"),
        "cni_sector": rec.get("cni_sector"),
        "year": int(rec["year"]),
        "dimension": dimension,  # e.g., harm | adoption | risk_disclosure
        "label": label,
        "text_excerpt": text_excerpt,
        "report_section": report_section,
        "ai_specificity": ai_specificity,
        "frontier_tech_flag": frontier_tech_flag,
        "tier_1_category": tier_1_category,
        "tier_2_driver": tier_2_driver,
        "specificity_level": specificity_level,
        "materiality_signal": materiality_signal,
        "mitigation_mentioned": mitigation_mentioned,
        "governance_maturity": governance_maturity,
        "severity": severity,
        "mitigation_score": mitigation_score,
        "confidence": confidence,
        "classifier_id": classifier_id,
        "classifier_version": classifier_version,
        "model_name": None,
        "source": "human",
        "run_id": rec.get("run_id"),
        "created_at": datetime.utcnow().isoformat(),
        "raw_path": rec.get("raw_path"),
        "source_format": rec.get("source_format"),
    }
    annotations_buffer.append(annotation)
    print(f"Added annotation {annotation['annotation_id']} for {rec['company_name']} ({rec['year']})")
    return annotation


def append_annotations_buffer():
    """Append buffered annotations to parquet/jsonl (append-only)."""
    if not annotations_buffer:
        print("Buffer is empty; nothing to write.")
        return

    new_df = pd.DataFrame(annotations_buffer)

    if ANNOTATIONS_PARQUET.exists():
        existing = pd.read_parquet(ANNOTATIONS_PARQUET)
        combined = pd.concat([existing, new_df], ignore_index=True)
    else:
        combined = new_df

    combined.to_parquet(ANNOTATIONS_PARQUET, index=False)
    combined.to_json(ANNOTATIONS_JSONL, orient="records", lines=True)

    print(f"Appended {len(new_df)} annotations (total now {len(combined)})")
    annotations_buffer.clear()


def list_recent_annotations(limit: int = 5):
    if not ANNOTATIONS_PARQUET.exists():
        print("No annotations written yet.")
        return
    df = pd.read_parquet(ANNOTATIONS_PARQUET)
    display(df.tail(limit))



In [31]:
# Example workflow (replace values before running)
# doc_id = DOCS.iloc[0].document_id
# preview_document(doc_id)
# add_annotation(
#     document_id=doc_id,
#     dimension="risk_disclosure",
#     label="cybersecurity",
#     text_excerpt="Paste the exact quote you are labeling",
#     page_number=12,
#     report_section="principal_risks",
#     ai_specificity="specific",
#     specificity_level="specific",
#     mitigation_mentioned=True,
#     severity="high",
#     confidence=0.95,
#     annotator="your_name",
# )
# append_annotations_buffer()
# list_recent_annotations()



## Helper widgets for fast annotation
Use these widgets to filter documents, preview quickly, and add annotations with minimal typing. Adjust options if you add new dimensions/labels later.


In [32]:
# Generate the doc
import textwrap
from pathlib import Path

def normalize_markdown(md: str, width: int = 100) -> str:
    # Split on blank lines to keep paragraph breaks, then wrap each paragraph
    paragraphs = md.split("\n\n")
    wrapped = [
        "\n".join(textwrap.fill(line, width) for line in para.splitlines())
        for para in paragraphs
    ]
    return "\n\n".join(wrapped)

row = DOCS.loc[DOCS.document_id == doc_dd.value].iloc[0]
clean_md = normalize_markdown(row["text_markdown"], width=100)
tmp_path = Path("current_doc.md")
tmp_path.write_text(clean_md)
print(f"Wrote human-readable markdown to {tmp_path}")

Wrote human-readable markdown to current_doc.md


In [33]:
# Progress helper: counts annotations already saved for this run_id
import os

existing_ann = None
if ANNOTATIONS_PARQUET.exists():
    existing_ann = pd.read_parquet(ANNOTATIONS_PARQUET)

def show_progress(run_id: str = RUN_ID):
    total_docs = len(DOCS)
    saved = 0
    if existing_ann is not None:
        saved = existing_ann[existing_ann["run_id"] == run_id]["document_id"].nunique()
    print(f"Run {run_id}: {saved}/{total_docs} documents have at least one human annotation.")
    print(f"Buffer (unsaved) annotations: {len(annotations_buffer)}")

show_progress()


Run gs-phase1-20260107-143822: 3/26 documents have at least one human annotation.
Buffer (unsaved) annotations: 0


In [41]:
import ipywidgets as widgets
from IPython.display import display, clear_output

META_COLS = ["document_id", "company_name", "cni_sector", "year", "source_format"]
DOCS_META = DOCS[META_COLS].copy()

DIMENSION_OPTIONS = ["risk_disclosure", "harm", "adoption", "vendor"]

# Taxonomy-driven labels per dimension (from choices_report)
LABEL_OPTIONS = {
    "risk_disclosure": [
        "operational_technical",
        "cybersecurity",
        "workforce_impacts",
        "regulatory_compliance",
        "information_integrity",
        "reputational_ethical",
        "third_party_supply_chain",
        "environmental_impact",
        "national_security",
    ],
    "harm": ["mentioned", "not_mentioned"],
    "adoption": ["non_llm", "llm", "agentic_ai"],
    "vendor": [
        "internal / opensource",
        "Microsoft",
        "OpenAI",
        "Google",
        "Anthropic",
        "Mistral",
        "unspecified",
        "other",
    ],
    # Substantiveness is only used as a property of risk_disclosure
    "substantiveness": ["boilerplate", "substantive"],
}

# Dimension dropdown (added to fix NameError)
dim_dd = widgets.Dropdown(description="dimension", options=DIMENSION_OPTIONS, value=DIMENSION_OPTIONS[0])

companies = sorted(DOCS_META.company_name.unique())
years = sorted(DOCS_META.year.unique())

company_dd = widgets.Dropdown(options=["(all)"] + companies, description="Company")
year_dd = widgets.Dropdown(options=["(all)"] + [int(y) for y in years], description="Year")
doc_dd = widgets.Dropdown(options=[], description="document_id", layout=widgets.Layout(width="60%"))
preview_out = widgets.Output()

def _refresh_doc_options(*_):
    company = None if company_dd.value == "(all)" else company_dd.value
    year = None if year_dd.value == "(all)" else int(year_dd.value)
    df = DOCS_META
    if company:
        df = df[df.company_name == company]
    if year:
        df = df[df.year == year]
    df = df.sort_values(["company_name", "year", "document_id"])
    options = [(f"{r.company_name} {r.year} ({r.source_format})", r.document_id) for r in df.itertuples()]
    doc_dd.options = options
    if options:
        doc_dd.value = options[0][1]

def _on_preview(_):
    preview_out.clear_output()
    if not doc_dd.value:
        return
    with preview_out:
        preview_document(doc_dd.value, chars=1800)

company_dd.observe(_refresh_doc_options, names="value")
year_dd.observe(_refresh_doc_options, names="value")
doc_dd.observe(_on_preview, names="value")

_refresh_doc_options()

controls = widgets.VBox([
    widgets.HBox([company_dd, year_dd]),
    doc_dd,
    widgets.Button(description="Preview", button_style="info", tooltip="Preview selected document", on_click=_on_preview),
])

display(controls, preview_out)

VBox(children=(HBox(children=(Dropdown(description='Company', options=('(all)', 'AstraZeneca plc', 'BAE System…

Output()

In [35]:
# Quick add form to minimize typing (dropdowns + substantiveness confidence)
label_dd = widgets.Dropdown(description="label", options=LABEL_OPTIONS[dim_dd.value], layout=widgets.Layout(width="50%"))
substance_dd = widgets.Dropdown(description="substantiveness", options=LABEL_OPTIONS["substantiveness"], value="substantive")
confidence_input = widgets.FloatSlider(description="confidence", min=0.0, max=1.0, step=0.05, value=0.95)
substance_conf_input = widgets.FloatSlider(description="subst. conf", min=0.0, max=1.0, step=0.05, value=0.9)
excerpt_ta = widgets.Textarea(description="excerpt", layout=widgets.Layout(width="80%", height="120px"))
section_input = widgets.Text(description="section", placeholder="e.g., principal_risks")

# Show substantiveness controls only when dimension == risk_disclosure
substance_box = widgets.HBox([substance_dd, substance_conf_input])
substance_box.layout.display = "flex" if dim_dd.value == "risk_disclosure" else "none"

add_out = widgets.Output()


def _on_dimension_change(change):
    if change["name"] == "value":
        label_dd.options = LABEL_OPTIONS.get(change["new"], [])
        if label_dd.options:
            label_dd.value = label_dd.options[0]
        # Toggle substantiveness controls only for risk_disclosure
        substance_box.layout.display = "flex" if change["new"] == "risk_disclosure" else "none"

dim_dd.observe(_on_dimension_change, names="value")


def _on_add_click(_):
    add_out.clear_output()
    if not doc_dd.value:
        with add_out:
            print("Select a document first.")
        return
    with add_out:
        ann = add_annotation(
            document_id=doc_dd.value,
            dimension=dim_dd.value,
            label=label_dd.value,
            text_excerpt=excerpt_ta.value.strip(),
            report_section=section_input.value.strip() or None,
            confidence=float(confidence_input.value),
            # Map substantiveness into fields for risk_disclosure context
            tier_1_category=label_dd.value if dim_dd.value == "risk_disclosure" else None,
            specificity_level=substance_dd.value if dim_dd.value == "risk_disclosure" else None,
            mitigation_score=float(substance_conf_input.value) if dim_dd.value == "risk_disclosure" else None,
        )
        print(f"Buffered annotation {ann['annotation_id']} for {ann['document_id']}")
        # Light reset
        excerpt_ta.value = ""

add_btn = widgets.Button(description="Add annotation", button_style="success")
add_btn.on_click(_on_add_click)

form = widgets.VBox([
    widgets.HBox([dim_dd, label_dd]),
    excerpt_ta,
    widgets.HBox([section_input]),
    widgets.HBox([confidence_input]),
    substance_box,
    add_btn,
])

display(form, add_out)

VBox(children=(HBox(children=(Dropdown(description='dimension', options=('risk_disclosure', 'harm', 'adoption'…

Output()

In [None]:
# Persist buffered annotations (append-only)
append_annotations_buffer()

Buffer is empty; nothing to write.


In [42]:
# Quick sanity check: show last annotations and last JSONL lines
import pandas as pd
from pathlib import Path

if ANNOTATIONS_PARQUET.exists():
    df_check = pd.read_parquet(ANNOTATIONS_PARQUET)
    print(f"Total annotations: {len(df_check)}")
    display(df_check.tail(5))
else:
    print(f"Parquet not found: {ANNOTATIONS_PARQUET}")

if ANNOTATIONS_JSONL.exists():
    lines = Path(ANNOTATIONS_JSONL).read_text(encoding="utf-8").splitlines()
    print("\nLast JSONL entries:")
    for line in lines[-5:]:
        print(line)
else:
    print(f"JSONL not found: {ANNOTATIONS_JSONL}")


Total annotations: 0


Unnamed: 0,annotation_id,document_id,company_number,company_name,ticker,lei,cni_sector,year,dimension,label,...,classifier_id,classifier_version,model_name,source,run_id,created_at,annotator,notes,raw_path,source_format



Last JSONL entries:



In [38]:
python3 pipeline/scripts/load_golden_to_db.py --run-id gs-phase1-20260107-143822

SyntaxError: invalid syntax (1021596069.py, line 1)