ContextLab · jeremymanning · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/spec015-calibration.yml b/.github/workflows/spec015-calibration.yml
@@ -0,0 +1,149 @@
+name: spec015 — differential calibration
+
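+# Runs `scripts/run_calibration.py` against the on-disk calibration set,
+# uploads the outputs as a workflow artifact, and commits the produced
+# adjudication reports (plus `calibration-run.log`) back to the branch the
+# run was started on.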
+#
+# Triggers:
+# - `workflow_dispatch` — manual trigger from the GitHub Actions UI;
+#   the maintainer picks a stage and a domain label, and may override
+#   the per-call `max_tokens`.
+# - `schedule` (commented out by default) — weekly automatic runs once
+#   the maintainer is comfortable letting it auto-run.
+#
+# Permissions: needs `contents: write` to commit the report back to the
+# calibration reports directory.
+
+on:
+  # Manual runs only for now: the maintainer supplies the inputs below
+  # from the Actions UI.
+  workflow_dispatch:
+    inputs:
+      # Calibration stage to run; `all` is one of the offered choices.
+      stage:
+        type: choice
+        required: true
+        default: "spec"
+        description: >-
+          Which stage to calibrate
+          (spec/plan/tasks/paper_spec/paper_plan/paper_tasks/paper_implement/all)
+        options:
+          - spec
+          - plan
+          - tasks
+          - paper_spec
+          - paper_plan
+          - paper_tasks
+          - paper_implement
+          - all
+      # Free-text label forwarded to the script as `--domain`.
+      domain:
+        type: string
+        required: false
+        default: "(unspecified)"
+        description: >-
+          Domain label written into the report header
+          (or "(unspecified)")
+      # Kept as a string input; forwarded to the script as `--max-tokens`.
+      max_tokens:
+        type: string
+        required: false
+        default: "131072"
+        description: >-
+          Per-call max_tokens for the reasoning model
+          (default 131072 = 128K; qwen3.5-122b has a 256K context window
+          so this leaves ample room for input + reasoning)
+  # Uncomment to run weekly once the workflow is trusted:
+  # schedule:
+  #   - cron: '0 6 * * 0'  # 06:00 UTC Sundays
+
+jobs:
+  # Single job: set up Python, run the calibration script, then publish
+  # its outputs (artifact upload + commit) in the steps that follow.
+  calibrate:
+    runs-on: ubuntu-latest
+    # Hard cap for the whole job; comments further down in this workflow
+    # estimate a calibration run at ~25 min.
+    timeout-minutes: 90
+    permissions:
+      # Required so the job's token can push the report commit.
+      contents: write
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          # 0 = fetch full history instead of a shallow clone.
+          # NOTE(review): presumably needed so the later `git pull --rebase`
+          # has complete history to replay onto — confirm.
+          fetch-depth: 0
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps the version as a string.
+          python-version: '3.11'
+          # Reuse pip's download cache across runs.
+          cache: 'pip'
+
+      - name: Install package (with dev extras)
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+
+      # Runs the calibration script with the dispatch inputs and mirrors its
+      # combined stdout/stderr into calibration-run.log for the later
+      # artifact-upload and commit steps.
+      - name: Run differential calibration
+        env:
+          DARTMOUTH_CHAT_API_KEY: ${{ secrets.DARTMOUTH_CHAT_API_KEY }}
+          # Inputs piped via env (safer than direct ${{ ... }} expansion
+          # in run: blocks — workflow_dispatch is write-access-only but
+          # we follow the same hardening as untrusted-input workflows).
+          STAGE: ${{ inputs.stage }}
+          DOMAIN: ${{ inputs.domain }}
+          MAX_TOKENS: ${{ inputs.max_tokens }}
+        run: |
+          # The runner's default shell is `bash -e` without pipefail, so the
+          # pipeline below would otherwise report `tee`'s exit status and a
+          # failed calibration would leave this step green. With pipefail
+          # the script's non-zero exit fails the step; the later steps use
+          # `if: always()` and still run.
+          set -o pipefail
+          mkdir -p specs/015-pipeline-convergence-protocol/calibration/reports
+          python scripts/run_calibration.py \
+            --stage "$STAGE" \
+            --domain "$DOMAIN" \
+            --max-tokens "$MAX_TOKENS" \
+            2>&1 | tee calibration-run.log
+
+      # Upload the produced report (+ run log) as an artifact BEFORE
+      # attempting any git commit. Calibration runs are expensive (~25 min);
+      # a race-condition push failure shouldn't lose the output.
+      - name: Upload calibration outputs as artifact
+        # always(): run this step even when an earlier step failed, so the
+        # log and any reports written so far remain downloadable.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: spec015-calibration-output
+          # One path per line: the run log written by `tee` in the
+          # calibration step, and the reports directory that step creates.
+          path: |
+            calibration-run.log
+            specs/015-pipeline-convergence-protocol/calibration/reports/
+
+      # Commits the reports directory and the run log, then pushes the
+      # commit to the branch the workflow was dispatched on, rebasing onto
+      # any commits that landed while the calibration was running.
+      - name: Commit + push the report
+        # always(): still commit whatever was produced when an earlier step
+        # failed; if nothing was produced the script exits 0 below.
+        if: always()
+        env:
+          STAGE: ${{ inputs.stage }}
+          DOMAIN: ${{ inputs.domain }}
+          MAX_TOKENS: ${{ inputs.max_tokens }}
+        run: |
+          git config user.name 'spec015-calibration-bot'
+          git config user.email 'spec015-calibration-bot@users.noreply.github.com'
+          # Stage the two paths separately so that a missing run log cannot
+          # stop the reports from being staged (or vice versa). Failures
+          # are tolerated; the staged-diff check below decides what happens.
+          git add specs/015-pipeline-convergence-protocol/calibration/reports/ || true
+          git add calibration-run.log || true
+          if git diff --cached --quiet; then
+            echo "No new report to commit."
+            exit 0
+          fi
+          TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)"
+          git commit -m "calib(015): ${STAGE} run (${TIMESTAMP}) (#239)
+
+          Triggered via workflow_dispatch with:
+            stage=${STAGE}
+            domain=${DOMAIN}
+            max_tokens=${MAX_TOKENS}
+
+          Maintainer: review the produced report under
+          specs/015-pipeline-convergence-protocol/calibration/reports/
+          and fill in the adjudication checklist per FR-046.
+
+          Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>"
+
+          # Race-condition handling: the calibration step takes ~25 min,
+          # during which other commits may have landed on the branch. Pull
+          # --rebase to replay our single commit on top, then push. Retry
+          # up to 3 times in case multiple concurrent runs are competing.
+          #
+          # GITHUB_REF_NAME is the short ref name with any slashes intact.
+          # Stripping GITHUB_REF with `##*/` would turn `feature/foo` into
+          # `foo` and push to the wrong (or a non-existent) branch.
+          BRANCH="${GITHUB_REF_NAME}"
+          for attempt in 1 2 3; do
+            echo "::group::Push attempt ${attempt}"
+            # `git pull` fetches before rebasing, so no separate fetch is
+            # needed. Both commands sit inside the `if` condition so that a
+            # transient failure falls through to the retry path instead of
+            # aborting the script under the runner's default `bash -e`.
+            if git pull --rebase origin "${BRANCH}" \
+                && git push origin "HEAD:${BRANCH}"; then
+              echo "::endgroup::"
+              echo "Pushed on attempt ${attempt}."
+              exit 0
+            fi
+            # A rebase that stopped on a conflict leaves the repository
+            # mid-rebase, which would make every later attempt fail. Abort
+            # it; this is a harmless no-op error when no rebase is active.
+            git rebase --abort 2>/dev/null || true
+            echo "::endgroup::"
+            echo "Attempt ${attempt} failed; sleeping before retry."
+            sleep $((attempt * 5))
+          done
+          echo "::error::Could not push the calibration report after 3 attempts."
+          echo "The report artifact has been uploaded above; download from"
+          echo "the workflow's Artifacts section."
+          exit 1
diff --git a/.gitignore b/.gitignore
@@ -243,6 +243,7 @@ Temporary Items
 # transient in-progress sentinels and any local runtime caches.
 state/run-log/*/in-progress/
 state/run-log/*/.invalid/
+state/grounding-cache/
 .specify/cache/
 
 # Multi-secret env variants used by Dartmouth + HF
@@ -296,3 +297,8 @@ state/audit/pdf/*/screenshots/
 # demand keyed by sha256 of chunk bytes.
 projects/*/paper/.chunk_summaries/
 
+
+# Local agent/runtime state (not part of the repo)
+.omc/
+.summaries/
+.claude/scheduled_tasks.lock
diff --git a/.specify/feature.json b/.specify/feature.json
@@ -1 +1,3 @@
-{"feature_directory": "specs/014-phase4-plan-tasks-testing"}
+{
+  "feature_directory": "specs/015-pipeline-convergence-protocol"
+}
diff --git a/.specify/memory/constitution.md b/.specify/memory/constitution.md
@@ -1,10 +1,11 @@
 <!--
 SYNC IMPACT REPORT
 ==================
-Version change: (uninitialized template) → 1.0.0
-Rationale: Initial ratification of the llmXive constitution. MAJOR bump from
-template-only state (no prior numbered version) to first formally adopted
-governance document.
+Version change: (uninitialized template) → 1.0.0 → 1.1.0
+Rationale: 1.0.0 = initial ratification (MAJOR bump from template-only state).
+1.1.0 (2026-05-27, spec 015 / issue #239) = MINOR — added Principle VI
+(Convergent Review, NON-NEGOTIABLE) and replaced the point-based "Review
+thresholds" quality gate with unanimous-panel convergence + advisory triage.
 
 Modified principles:
   - [PRINCIPLE_1_NAME] → I. Single Source of Truth (NON-NEGOTIABLE)
@@ -170,6 +171,34 @@ of compute and API spend. Up-front validation is cheap; late failure is
 expensive. This principle protects both the user's time and the project's
 budget.
 
+### VI. Convergent Review (NON-NEGOTIABLE)
+
+Every step that produces reviewable work MUST run a disciplined
+**identify → revise → re-review** convergence cycle driven by that step's
+review panel: each panelist raises structured critical concerns (R1); the
+step's reviser addresses every concern and emits a per-concern change-log
+(R2); each panelist re-judges its own concerns against the change-log (R3).
+A step's gate is **unanimous acceptance by its LLM review panel** within a
+3-round per-step cap; on non-convergence the project is **kicked back** to the
+appropriate prior stage carrying full provenance (the unresolved concerns +
+links to all artifacts/reviews + a plain-language "why it failed to converge").
+
+There is NO accumulated point system: a panel either unanimously accepts, or
+the work is revised and re-reviewed (kickbacks allowed, with no global cap —
+each cycle is expected to improve the artifact until it converges). Human and
+simulated-personality reviews are **advisory inputs** — routed through a
+stage-aware triage (quality + safety + on-topic) to the matching panelist —
+and never directly gate advancement. Mechanical scaffolding, dispatch, and
+maintenance steps are exempt. Convergence MUST be reported honestly: a step is
+never recorded as passed/converged when its panel has not unanimously accepted.
+
+**Rationale**: Most pipeline steps historically advanced with no critique at
+all, and the one self-critique loop never honestly converged (it masked
+non-convergence as "passed"). A single SSoT convergence protocol makes quality
+a function of disciplined revision rather than accumulated points, gives a
+reasonable idea a convergent path to publication, and rejects work that cannot
+converge — always with honest, inspectable provenance.
+
 ## Additional Constraints & Operational Standards
 
 The following operational standards apply to all work in this repository:
@@ -220,10 +249,13 @@ released:
 - **Reference validation**: Before any paper or user-facing document is
   considered complete, every cited reference MUST be downloaded and reviewed
   per Principle II.
-- **Review thresholds**: Status advancement (Backlog → Ready → In Progress
-  → Done) follows the point-based review system documented in the project
-  README; LLM reviews count 0.5 points and human reviews count 1 point. The
-  documented minimums MUST be met before transition.
+- **Convergence gate (review model)**: Status advancement (Backlog → Ready →
+  In Progress → Done) is governed by the convergence protocol (Principle VI),
+  NOT a point system. A reviewable step advances iff its LLM review panel
+  unanimously accepts within the 3-round per-step cap; otherwise the project is
+  kicked back with full provenance. Human and simulated-personality reviews are
+  advisory inputs via stage-aware triage, never points. (Supersedes the prior
+  0.5/1.0-point thresholds — spec 015 / #239.)
 
 ## Governance
 
@@ -261,4 +293,4 @@ reasons. Unjustified violations block merge.
 here, contributors should consult the project `README.md` and the
 repository-level `CLAUDE.md`.
 
-**Version**: 1.0.0 | **Ratified**: 2026-04-28 | **Last Amended**: 2026-04-28
+**Version**: 1.1.0 | **Ratified**: 2026-04-28 | **Last Amended**: 2026-05-27
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -7,9 +7,9 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 llmXive is an automated system for scientific discovery driven by LLMs with occasional human input. It's structured as a project management platform with five main task categories, each linked to GitHub issues:
 
 1. **Backlog**: Brainstormed ideas requiring development
-2. **Ready**: Ideas with technical design documents reviewed by ≥10 LLMs or ≥5 human scientists
+2. **Ready**: Ideas whose technical design documents pass **unanimous LLM-panel acceptance** within the 3-round convergence cap (per spec 015 — supersedes the prior "≥10 LLMs / ≥5 humans" point-based threshold)
 3. **In Progress**: Ideas with vetted implementation plans ready for execution
-4. **Reviews**: Formal reviews of designs, implementations, papers, and code
+4. **Reviews**: Formal reviews of designs, implementations, papers, and code (advisory only — human/personality reviews route through stage-aware triage and never directly gate advancement)
 5. **Done**: Completed projects with associated papers
 
 ## Repository Architecture
@@ -29,7 +29,7 @@ Each directory contains a README.md with specific tables tracking projects, cont
 
 ### Project Status Management
 - Projects move through: Backlog → Ready → In Progress → Done
-- Each stage requires specific point thresholds (LLM reviews = 0.5 points, human reviews = 1 point)
+- Each reviewable stage runs an identify → revise → re-review cycle with its LLM panel; advancement requires **unanimous panel acceptance** within 3 rounds (otherwise the project is adaptively kicked back to the appropriate prior stage). The legacy 0.5/1.0 review-point system has been removed (spec 015).
 - Status is tracked via GitHub issue labels and project board columns
 
 ### Documentation Standards
@@ -47,7 +47,7 @@ Each directory contains a README.md with specific tables tracking projects, cont
 ### Review Process
 - Review files named as: `author__MM-DD-YYYY__type.md` (type: A=automatic, M=manual)
 - Reviews organized in subdirectories: Design/Implementation/Paper/Code
-- Minimum review thresholds must be met before status advancement
+- Advancement requires **unanimous LLM-panel acceptance** within the 3-round convergence cap (spec 015); human and simulated-personality reviews are advisory inputs only, routed through stage-aware triage before reaching a panelist
 
 ## Common Development Tasks
 
@@ -64,11 +64,11 @@ Since this is primarily a research documentation repository without traditional
 - Always use absolute paths when referencing files across directories
 - Maintain the table structures in README files when adding new entries
 - Verify all external links and references before committing
-- Follow the point-based review system for project advancement
+- Follow the convergence-based review model (unanimous LLM-panel acceptance within the 3-round cap; spec 015) for project advancement; do NOT re-introduce accumulated 0.5/1.0 review points
 - Use GitHub issues for all project tracking and communication
 
 <!-- SPECKIT START -->
 For additional context about technologies to be used, project structure,
 shell commands, and other important information, read the current plan:
-[specs/014-phase4-plan-tasks-testing/plan.md](specs/014-phase4-plan-tasks-testing/plan.md).
+[specs/015-pipeline-convergence-protocol/plan.md](specs/015-pipeline-convergence-protocol/plan.md).
 <!-- SPECKIT END -->
diff --git a/README.md b/README.md
@@ -22,10 +22,15 @@ validator) → `specified` → `clarified` → `planned` → `tasked` (+ analyze
 `in progress` (the implementer writes code, runs real tests, collects data; the
 librarian verifies citations) → `research review`.
 
-Research review needs **both** a points threshold **and** an accept verdict from
-**every** specialist reviewer in the lane — seven of them: idea quality,
-creativity, implementation correctness, completeness, code quality, data
-quality, filesystem hygiene.
+Research review (spec 015 / #239) runs as an **identify → revise → re-review**
+convergence loop driven by the 8-reviewer panel (idea quality, creativity,
+implementation correctness, completeness, code quality, data quality,
+filesystem hygiene, plus the generic research reviewer). Each panelist raises
+critical concerns; the implementer addresses every concern with a per-concern
+change-log; each panelist re-judges its own concerns. The gate is **unanimous
+panel acceptance** within the 3-round cap; otherwise the project is **kicked
+back** to the appropriate prior stage (adaptive by worst unresolved severity)
+carrying full provenance. There is no accumulated point system.
 
 ### The paper pipeline
 
@@ -41,12 +46,19 @@ specialist** (against the live artifact hash — stale reviews are ignored).
 
 Three terminal outcomes:
 
-- **All specialists accept** → `paper_accepted` → the `paper_publisher`
-  agent (spec 013) pre-reserves a Zenodo DOI, recompiles the PDF with
-  the final `\paperstatus{Auto-Reviewed | Auto-Revised | Published}`
-  byline + DOI + volume/issue, uploads to Zenodo, appends the
-  post-paper appendix (spacer + reviews + revision changelog), writes
-  `paper/publication.yaml`, and transitions to `posted`.
+- **All specialists accept** → `paper_accepted` →
+  `awaiting_publication_signoff`. The transition through to `posted`
+  requires a maintainer to record explicit approval via
+  `llmxive project publish-approve <PROJ-ID>` (spec 015 FR-054 — every
+  real Zenodo DOI mint is gated on a manual sign-off). Once approved, the
+  `paper_publisher` agent (spec 013) pre-reserves a Zenodo DOI,
+  recompiles the PDF with the final
+  `\paperstatus{Auto-Reviewed | Auto-Revised | Published}` byline +
+  DOI + volume/issue, uploads to Zenodo, appends the post-paper
+  appendix (spacer + reviews + revision changelog), writes
+  `paper/publication.yaml`, and transitions to `posted`. The graph and
+  the publisher BOTH enforce the sign-off check (defense in depth — no
+  DOI is ever minted without a recorded approval).
 - **Any `fatal` severity** → `brainstormed` (back to the backlog), with a
   rejection rationale appended to the idea record citing each fatal item.
 - **Otherwise** (writing/science items, no fatal) → `paper_revision_in_progress`,
@@ -76,7 +88,10 @@ The twelve specialist reviewers (writing quality, logical consistency,
 claim accuracy, over-reach, safety/ethics, scientific evidence,
 statistical analysis, code quality, data quality, text formatting,
 figure critic, jargon police) each emit action items in their lane.
-Human reviews count double; self-review is rejected by the schema.
+Human and simulated-personality reviews are **advisory inputs**, routed
+through a stage-aware triage (quality + safety + on-topic filters) to the
+matching LLM reviewer's lens — they inform a reviewer's verdict but never
+directly gate advancement. Self-review is rejected by the schema.
 
 arXiv-submitted papers (third-party, source frozen) skip the writing-
 revision pipeline. Instead the consolidated action items land in
@@ -142,8 +157,9 @@ run-log entry.
 
 All inference runs on free backends: Dartmouth's
 [Discovery cluster](https://rc.dartmouth.edu/ai/computing-resources/discovery-cluster/)
-(primary), [Hugging Face](https://huggingface.co/) (fallback), and local
-transformers (last resort). Long, complex tasks (planning, paper writing, deep
+(primary) and local [transformers](https://huggingface.co/docs/transformers)
+(fallback) — open-weight Hugging Face models run locally, no API token.
+Long, complex tasks (planning, paper writing, deep
 review) go to **Qwen 3.5 122B**; faster classification-shaped tasks (clarifying
 questions, triage, quick judgments) go to **Gemma 3 27B**. No paid services
 (Constitution Principle IV — free-first).
@@ -176,7 +192,7 @@ never duplicates data, it derives it.
   feedback; the `submission_intake` agent (hourly cron) triages it to the right
   pipeline step.
 - **Review existing content** — sign in with GitHub and add a verdict on a
-  project's spec, plan, code, data, or paper. Human reviews count double.
+  project's spec, plan, code, data, or paper. Human reviews are advisory inputs (triaged + routed to the matching LLM reviewer's lens, never a gate).
 - **Explore the pipeline / agent registry** — the About page's pipeline diagram
   and "Agent registry" button open in-place modals with each step's
   inputs/outputs/agents/examples and each agent's prompt + tools.
@@ -213,6 +229,8 @@ python -m llmxive preflight                 # fail-fast environment check
 python -m llmxive brainstorm -n 5           # seed 5 brainstormed ideas
 python -m llmxive run --max-tasks 5         # run one scheduled pipeline pass
 python -m llmxive submissions process       # triage open human-submission issues
+python -m llmxive project publish-approve PROJ-001 \
+    --who 'Maintainer Name' --what 'reviewed paper meets standards'  # spec 015 FR-054
 python -m llmxive agents run --agent <name> --project <PROJ-ID>
 ```
 
@@ -222,8 +240,8 @@ research/paper stages, `python -m llmxive submissions process` for the website
 intake, and `Deploy Pages` to publish `web/` → `docs/`.
 
 LLM calls need a Dartmouth Chat API key (`DARTMOUTH_CHAT_API_KEY`, or
-`python -m llmxive auth set`); without it the backends fall through to Hugging
-Face (`HF_TOKEN`) then local transformers.
+`python -m llmxive auth set`); without it the backends fall through to local
+transformers (open-weight Hugging Face models run locally; no token required).
 
 ### Audit tools (spec 010)
 
@@ -278,7 +296,7 @@ About page):
 3. **Provide feedback** — leave feedback on any artifact; it's triaged within
    the hour.
 4. **Review existing content** — add a human review on a project at a review
-   stage. Human reviews count double.
+   stage. Human reviews are advisory inputs (triaged + routed to the matching LLM reviewer's lens, never a gate).
 
 ## License