From 6b7ab0bbdf9f48f8207c7c8e12096dc910c3c6ba Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 00:46:58 -0700 Subject: [PATCH 1/6] [luv-310] fix: strip stray trailing ``` fences that break Mintlify MDX parser on RTL READMEs Streaming Sonnet runs of long pages sometimes append an unmatched ``` line to the end of the translation. The Mintlify MDX parser interprets that as opening a code block that consumes everything to EOF, including the wrapping `</div>` on RTL pages, surfacing as "Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)". New `stripStrayTrailingFence` helper in mdx-translator.ts detects the odd-fence-count case and removes only the last unmatched fence, preserving every balanced pair before it. Wired into both `translateMdxPage` and `translateReadme`. Surgical fix to the two already-broken files (`docs/i18n/README.he.md`, `docs/i18n/README.tr.md`) so Mintlify can deploy today without a full re-translate. Six-case unit test in __tests__/scripts/translate-docs/mdx-translator.test.ts. Refs: post-streaming-switch run 25542951106 Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 1 + .../translate-docs/mdx-translator.test.ts | 37 +++++++++++++++++++ docs/i18n/README.he.md | 1 - docs/i18n/README.tr.md | 1 - scripts/translate-docs/mdx-translator.ts | 29 ++++++++++++++- scripts/translate-docs/readme-translator.ts | 8 +++- 6 files changed, 72 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0deb585..b7e34296 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Unreleased ### Fixes +- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `
</div>` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; the helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from the two affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#308). - `scripts/translate-docs/translator.ts`: switch `translateContent` from `anthropic.messages.create(...)` to `anthropic.messages.stream(...).finalMessage()` so large Tier-1 (Sonnet) translations don't hit AWS Bedrock's 300 s synchronous `InvokeModel` ceiling. The LiteLLM proxy at `models.aikin.club` routes `claude-sonnet-4-6` weighted 1:1 across `anthropic/claude-sonnet-4-6` and `bedrock/us.anthropic.claude-sonnet-4-6`; under translate-docs load (4 jobs × 4 in-flight = 16 concurrent) any request that lands on Bedrock and runs >300 s is severed by Bedrock and surfaces to the SDK as `APIConnectionError ("Connection error.")` — exactly the symptom that survived #306 (SDK retry bump) and the platform-side `request_timeout: 300 → 600` lift in `exospherehost/platform#345`. Two consecutive matrix runs post-platform-fix ([25540656053](https://github.com/exospherehost/failproofai/actions/runs/25540656053), [25541614351](https://github.com/exospherehost/failproofai/actions/runs/25541614351)) showed the same deterministic failure cohort: the 4 largest pages (`built-in-policies`, `architecture`, `configuration`, `custom-policies`, plus `README`) failing at ~317 s for in-flight slot 1/2 and ~367 s for slots 3/4 — both below the new 600 s ceiling, so the wall isn't ours. 
`messages.stream(...).finalMessage()` returns the same `Message` shape so the function's public return type is unchanged; Bedrock falls back to `InvokeModelWithResponseStream` (no 300 s wall) and Anthropic-direct supports streaming for the full 10-minute non-streaming budget. SDK `maxRetries: 5`, per-job `MAX_CONCURRENT: 4`, and the platform `request_timeout: 600 s` ceiling all stay as the correct safety bounds; the actual unblock was on the client-side request shape (#307). - `scripts/translate-docs`: bump SDK `maxRetries` from the Anthropic default of 2 to 5 in `translator.ts:getClient` and raise per-job `MAX_CONCURRENT` from 2 to 4 in `cli.ts`, both now env-overridable via `TRANSLATE_MAX_RETRIES` and `TRANSLATE_MAX_CONCURRENT`. The LiteLLM proxy behind `ANTHROPIC_BASE_URL` has been horizontally scaled, so the previous cap of 2 (set in #300 to dodge the gateway's connection-drop cliff at ~2 in flight) now leaves capacity on the floor. The errors that *do* still surface are no longer load-induced — they are per-request transient failures (cold replicas, LB hashing landing on an unhealthy pod, idle-socket TCP resets) where the SDK's default 2-retry budget runs out before the LB can route a retry to a healthy replica, and `Anthropic.APIConnectionError ("Connection error.")` bubbles up. Empirically observed: a `--languages zh --force` re-run (Tier-1 Sonnet, 5 uncached MDX pages) returned 2 successes and 2 `Connection error.` lines under the prior 2/2 setting. Bumping to 5 retries (≈0.5+1+2+4+8 ≈ 15 s of jittered backoff per request, 6 connection attempts total per page) absorbs the transient failures; bumping concurrency to 4 takes back the throughput the prior cap forfeited. CI matrix `max-parallel: 4` is unchanged — the new global ceiling of 4×4 = 16 in flight is still half the failure-mode threshold of 28 from #305 even before accounting for the scale-out, so no workflow change needed (#306). 
- `.github/workflows/translate-docs.yml`: cap the `translate` matrix at `max-parallel: 4` so the 14-language fan-out can't burst past the LiteLLM proxy's connection-drop knee point. The previous `MAX_CONCURRENT = 2` cap in `scripts/translate-docs/cli.ts` (#300) limited per-job concurrency but not cross-job, so under push-to-main the proxy at `ANTHROPIC_BASE_URL` saw up to 14 jobs × 2 = 28 simultaneous requests and returned `APIConnectionError ("Connection error.")` on most of them — surfaced as a workflow-wide failure on run 25532970192 where all 14 matrix jobs errored. With the cap set to 4, the proxy sees at most 8 in-flight; wall-clock cost is bounded since each job is 4–9 minutes and 14 langs in batches of 4 still completes well inside the workflow's existing footprint. Tier-1 (Sonnet, 7 langs sharing one upstream model_name) is the cohort that hit the cliff hardest; Tier-2/3 (Haiku) had headroom and only the single largest doc page consistently errored (#305). diff --git a/__tests__/scripts/translate-docs/mdx-translator.test.ts b/__tests__/scripts/translate-docs/mdx-translator.test.ts index dc805bd6..2262f4ab 100644 --- a/__tests__/scripts/translate-docs/mdx-translator.test.ts +++ b/__tests__/scripts/translate-docs/mdx-translator.test.ts @@ -3,6 +3,7 @@ import { describe, it, expect } from "vitest"; import { rewriteInternalLinks, sanitizeJsxAttributes, + stripStrayTrailingFence, } from "@/scripts/translate-docs/mdx-translator"; describe("rewriteInternalLinks", () => { @@ -129,3 +130,39 @@ describe("sanitizeJsxAttributes", () => { expect(result).toBe(``); }); }); + +describe("stripStrayTrailingFence", () => { + it("leaves balanced fences untouched", () => { + const input = "intro\n\n```ts\nconst x = 1;\n```\n\noutro\n"; + expect(stripStrayTrailingFence(input)).toBe(input); + }); + + it("returns input unchanged when no fences", () => { + const input = "Just some prose with `inline code` and no fences.\n"; + expect(stripStrayTrailingFence(input)).toBe(input); + 
}); + + it("strips a stray trailing fence after a balanced pair", () => { + const input = "intro\n\n```ts\nconst x = 1;\n```\n\noutro\n```\n"; + const expected = "intro\n\n```ts\nconst x = 1;\n```\n\noutro\n"; + expect(stripStrayTrailingFence(input)).toBe(expected); + }); + + it("strips the lone fence when there's only one (odd count of one)", () => { + const input = "preamble\n\n```\nuncertain\nepilogue\n"; + expect(stripStrayTrailingFence(input)).toBe( + "preamble\n\nuncertain\nepilogue\n", + ); + }); + + it("only matches fence markers at start of line", () => { + // An inline ``` mid-line is not a fence marker; should not be counted. + const input = "text with embedded ```not-a-fence``` mid-line\n"; + expect(stripStrayTrailingFence(input)).toBe(input); + }); + + it("preserves balanced pairs with language tags", () => { + const input = "```ts\nfoo\n```\n\n```bash\nbar\n```\n"; + expect(stripStrayTrailingFence(input)).toBe(input); + }); +}); diff --git a/docs/i18n/README.he.md b/docs/i18n/README.he.md index 354ed54c..e56449cc 100644 --- a/docs/i18n/README.he.md +++ b/docs/i18n/README.he.md @@ -364,7 +364,6 @@ failproofai policies --install --scope project --- בנוי ותחזוקה מ-**ExosphereHost: Reliability Research Lab for Your Agents**. אנחנו עוזרים לחברות וסטארטאפים לשפר את אמינות סוכני AI שלהם דרך הסוכנים שלנו, תוכנה ומומחיות. למידע נוסף ב-[exosphere.host](https://exosphere.host). -```
\ No newline at end of file diff --git a/docs/i18n/README.tr.md b/docs/i18n/README.tr.md index 275ee3d4..96ad73c9 100644 --- a/docs/i18n/README.tr.md +++ b/docs/i18n/README.tr.md @@ -362,4 +362,3 @@ failproofai policies --install --scope project --- **ExosphereHost: Reliability Research Lab for Your Agents** tarafından oluşturulmuş ve yönetilmektedir. Kurumsal ve başlangıç şirketlerinin kendi aracıları, yazılımı ve uzmanlığı aracılığıyla AI aracılarının güvenilirliğini geliştirmelerine yardımcı oluruz. [exosphere.host](https://exosphere.host) adresinden daha fazla bilgi edinin. -``` diff --git a/scripts/translate-docs/mdx-translator.ts b/scripts/translate-docs/mdx-translator.ts index 65b010f7..7cbf5aae 100644 --- a/scripts/translate-docs/mdx-translator.ts +++ b/scripts/translate-docs/mdx-translator.ts @@ -67,6 +67,29 @@ export function sanitizeJsxAttributes(content: string): string { ); } +/** + * Drop a stray trailing code-fence line that the model sometimes appends to + * the very end of long translations (empirically observed on streamed Sonnet + * runs of large pages, e.g. README.he.md / README.tr.md after the streaming + * switch). Only fires when the total count of fence-lines is odd — the last + * unmatched fence is stripped, preserving every balanced pair before it. + * + * The Mintlify MDX parser interprets an unmatched ``` as opening a code + * block that consumes everything to EOF, including the wrapper `` for + * RTL pages, which surfaces as `Expected a closing tag for
`. + */ +export function stripStrayTrailingFence(content: string): string { + const lines = content.split("\n"); + const fenceLineIndices: number[] = []; + for (let i = 0; i < lines.length; i++) { + if (/^```/.test(lines[i])) fenceLineIndices.push(i); + } + if (fenceLineIndices.length % 2 === 0) return content; + const dropIdx = fenceLineIndices[fenceLineIndices.length - 1]; + lines.splice(dropIdx, 1); + return lines.join("\n"); +} + /** * Rewrite internal doc links to include the language prefix. * e.g. href="/built-in-policies" -> href="/es/built-in-policies" @@ -147,8 +170,10 @@ export async function translateMdxPage( options.model, ); - // Strip stray quote artifacts from JSX attribute values, then rewrite links - const sanitized = sanitizeJsxAttributes(translated); + // Strip stray quote artifacts from JSX attribute values, drop any + // unmatched trailing code fence the model sometimes hallucinates, then + // rewrite links. + const sanitized = stripStrayTrailingFence(sanitizeJsxAttributes(translated)); const withLinks = rewriteInternalLinks(sanitized, lang); // Write output diff --git a/scripts/translate-docs/readme-translator.ts b/scripts/translate-docs/readme-translator.ts index 38bc0b5b..38bc955d 100644 --- a/scripts/translate-docs/readme-translator.ts +++ b/scripts/translate-docs/readme-translator.ts @@ -3,6 +3,7 @@ import { dirname, join } from "node:path"; import { fileURLToPath } from "node:url"; import { LANGUAGES, getLanguageByCode } from "./config"; import { translateContent } from "./translator"; +import { stripStrayTrailingFence } from "./mdx-translator"; import { readCache, writeCache, isCached, setCacheEntry } from "./cache"; import type { TranslationResult, TranslationCache } from "./types"; @@ -102,7 +103,12 @@ export async function translateReadme( const rtlOpen = langConfig.rtl ? `
<div dir="rtl">\n\n` : ""; const rtlClose = langConfig.rtl ? `\n\n</div>
` : ""; - const output = `${disclaimer}\n\n${langSelector}\n\n---\n${rtlOpen}\n${translated}\n${rtlClose}`; + // Drop any stray trailing fence the model hallucinated — would otherwise + // open an unclosed code block that swallows the wrapping `
` for RTL + // pages and break Mintlify's MDX parser. + const cleaned = stripStrayTrailingFence(translated); + + const output = `${disclaimer}\n\n${langSelector}\n\n---\n${rtlOpen}\n${cleaned}\n${rtlClose}`; // Write output mkdirSync(I18N_DIR, { recursive: true }); From 467e97276fe7ac5ed3f621dae3294e7c1b2a552e Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 00:54:23 -0700 Subject: [PATCH 2/6] chore: retrigger CI (theme-toggle.test.tsx flake on previous run, unrelated to this PR) Co-Authored-By: Claude Opus 4.7 From 06f9bc65276da359dfa3618deed30336cf97d1e1 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 00:56:42 -0700 Subject: [PATCH 3/6] fix: correct CHANGELOG PR reference 308 -> 311 Address CodeRabbit comment on PR #311. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7e34296..b7b05227 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## Unreleased ### Fixes -- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; the helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from the two affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#308). +- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `
</div>` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; the helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from the two affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#311). - `scripts/translate-docs/translator.ts`: switch `translateContent` from `anthropic.messages.create(...)` to `anthropic.messages.stream(...).finalMessage()` so large Tier-1 (Sonnet) translations don't hit AWS Bedrock's 300 s synchronous `InvokeModel` ceiling. The LiteLLM proxy at `models.aikin.club` routes `claude-sonnet-4-6` weighted 1:1 across `anthropic/claude-sonnet-4-6` and `bedrock/us.anthropic.claude-sonnet-4-6`; under translate-docs load (4 jobs × 4 in-flight = 16 concurrent) any request that lands on Bedrock and runs >300 s is severed by Bedrock and surfaces to the SDK as `APIConnectionError ("Connection error.")` — exactly the symptom that survived #306 (SDK retry bump) and the platform-side `request_timeout: 300 → 600` lift in `exospherehost/platform#345`. Two consecutive matrix runs post-platform-fix ([25540656053](https://github.com/exospherehost/failproofai/actions/runs/25540656053), [25541614351](https://github.com/exospherehost/failproofai/actions/runs/25541614351)) showed the same deterministic failure cohort: the 4 largest pages (`built-in-policies`, `architecture`, `configuration`, `custom-policies`, plus `README`) failing at ~317 s for in-flight slot 1/2 and ~367 s for slots 3/4 — both below the new 600 s ceiling, so the wall isn't ours. 
`messages.stream(...).finalMessage()` returns the same `Message` shape so the function's public return type is unchanged; Bedrock falls back to `InvokeModelWithResponseStream` (no 300 s wall) and Anthropic-direct supports streaming for the full 10-minute non-streaming budget. SDK `maxRetries: 5`, per-job `MAX_CONCURRENT: 4`, and the platform `request_timeout: 600 s` ceiling all stay as the correct safety bounds; the actual unblock was on the client-side request shape (#307). - `scripts/translate-docs`: bump SDK `maxRetries` from the Anthropic default of 2 to 5 in `translator.ts:getClient` and raise per-job `MAX_CONCURRENT` from 2 to 4 in `cli.ts`, both now env-overridable via `TRANSLATE_MAX_RETRIES` and `TRANSLATE_MAX_CONCURRENT`. The LiteLLM proxy behind `ANTHROPIC_BASE_URL` has been horizontally scaled, so the previous cap of 2 (set in #300 to dodge the gateway's connection-drop cliff at ~2 in flight) now leaves capacity on the floor. The errors that *do* still surface are no longer load-induced — they are per-request transient failures (cold replicas, LB hashing landing on an unhealthy pod, idle-socket TCP resets) where the SDK's default 2-retry budget runs out before the LB can route a retry to a healthy replica, and `Anthropic.APIConnectionError ("Connection error.")` bubbles up. Empirically observed: a `--languages zh --force` re-run (Tier-1 Sonnet, 5 uncached MDX pages) returned 2 successes and 2 `Connection error.` lines under the prior 2/2 setting. Bumping to 5 retries (≈0.5+1+2+4+8 ≈ 15 s of jittered backoff per request, 6 connection attempts total per page) absorbs the transient failures; bumping concurrency to 4 takes back the throughput the prior cap forfeited. CI matrix `max-parallel: 4` is unchanged — the new global ceiling of 4×4 = 16 in flight is still half the failure-mode threshold of 28 from #305 even before accounting for the scale-out, so no workflow change needed (#306). 
- `.github/workflows/translate-docs.yml`: cap the `translate` matrix at `max-parallel: 4` so the 14-language fan-out can't burst past the LiteLLM proxy's connection-drop knee point. The previous `MAX_CONCURRENT = 2` cap in `scripts/translate-docs/cli.ts` (#300) limited per-job concurrency but not cross-job, so under push-to-main the proxy at `ANTHROPIC_BASE_URL` saw up to 14 jobs × 2 = 28 simultaneous requests and returned `APIConnectionError ("Connection error.")` on most of them — surfaced as a workflow-wide failure on run 25532970192 where all 14 matrix jobs errored. With the cap set to 4, the proxy sees at most 8 in-flight; wall-clock cost is bounded since each job is 4–9 minutes and 14 langs in batches of 4 still completes well inside the workflow's existing footprint. Tier-1 (Sonnet, 7 langs sharing one upstream model_name) is the cohort that hit the cliff hardest; Tier-2/3 (Haiku) had headroom and only the single largest doc page consistently errored (#305). From d85b7c818fa500f58fbca0e445c844081d3713f5 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 01:04:39 -0700 Subject: [PATCH 4/6] fix: also strip stray ``` from README.ar.md (regenerated post-streaming on main) The auto-translate run on main (#312) regenerated docs/i18n/README.ar.md with the same trailing-fence bug observed on he/tr. Drop the stray fence so Mintlify can parse this file too. Sanitizer in the script will prevent recurrence on future runs. Co-Authored-By: Claude Opus 4.7 --- docs/i18n/README.ar.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/i18n/README.ar.md b/docs/i18n/README.ar.md index 1243956a..d9d29ee2 100644 --- a/docs/i18n/README.ar.md +++ b/docs/i18n/README.ar.md @@ -364,7 +364,6 @@ failproofai policies --install --scope project --- تم بناؤها والحفاظ عليها بواسطة **ExosphereHost: مختبر أبحاث الموثوقية لوكلائك**. نحن نساعد الشركات والشركات الناشئة على تحسين موثوقية وكلائهم الذكيين من خلال وكلائنا والبرمجيات والخبرة. 
تعرف على المزيد في [exosphere.host](https://exosphere.host). -```
\ No newline at end of file From 2461a8fa757d6950f803f9e2c82cae167728a5a0 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 01:04:55 -0700 Subject: [PATCH 5/6] docs: extend CHANGELOG note to cover ar README too --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7b05227..5198ce22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## Unreleased ### Fixes -- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; the helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from the two affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#311). +- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `
</div>` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; a subsequent rebase against main found `docs/i18n/README.ar.md` regenerated by the auto-translate workflow (#312) with the same bug. The helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from all three affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#311). - `scripts/translate-docs/translator.ts`: switch `translateContent` from `anthropic.messages.create(...)` to `anthropic.messages.stream(...).finalMessage()` so large Tier-1 (Sonnet) translations don't hit AWS Bedrock's 300 s synchronous `InvokeModel` ceiling. The LiteLLM proxy at `models.aikin.club` routes `claude-sonnet-4-6` weighted 1:1 across `anthropic/claude-sonnet-4-6` and `bedrock/us.anthropic.claude-sonnet-4-6`; under translate-docs load (4 jobs × 4 in-flight = 16 concurrent) any request that lands on Bedrock and runs >300 s is severed by Bedrock and surfaces to the SDK as `APIConnectionError ("Connection error.")` — exactly the symptom that survived #306 (SDK retry bump) and the platform-side `request_timeout: 300 → 600` lift in `exospherehost/platform#345`. 
Two consecutive matrix runs post-platform-fix ([25540656053](https://github.com/exospherehost/failproofai/actions/runs/25540656053), [25541614351](https://github.com/exospherehost/failproofai/actions/runs/25541614351)) showed the same deterministic failure cohort: the 4 largest pages (`built-in-policies`, `architecture`, `configuration`, `custom-policies`, plus `README`) failing at ~317 s for in-flight slot 1/2 and ~367 s for slots 3/4 — both below the new 600 s ceiling, so the wall isn't ours. `messages.stream(...).finalMessage()` returns the same `Message` shape so the function's public return type is unchanged; Bedrock falls back to `InvokeModelWithResponseStream` (no 300 s wall) and Anthropic-direct supports streaming for the full 10-minute non-streaming budget. SDK `maxRetries: 5`, per-job `MAX_CONCURRENT: 4`, and the platform `request_timeout: 600 s` ceiling all stay as the correct safety bounds; the actual unblock was on the client-side request shape (#307). - `scripts/translate-docs`: bump SDK `maxRetries` from the Anthropic default of 2 to 5 in `translator.ts:getClient` and raise per-job `MAX_CONCURRENT` from 2 to 4 in `cli.ts`, both now env-overridable via `TRANSLATE_MAX_RETRIES` and `TRANSLATE_MAX_CONCURRENT`. The LiteLLM proxy behind `ANTHROPIC_BASE_URL` has been horizontally scaled, so the previous cap of 2 (set in #300 to dodge the gateway's connection-drop cliff at ~2 in flight) now leaves capacity on the floor. The errors that *do* still surface are no longer load-induced — they are per-request transient failures (cold replicas, LB hashing landing on an unhealthy pod, idle-socket TCP resets) where the SDK's default 2-retry budget runs out before the LB can route a retry to a healthy replica, and `Anthropic.APIConnectionError ("Connection error.")` bubbles up. Empirically observed: a `--languages zh --force` re-run (Tier-1 Sonnet, 5 uncached MDX pages) returned 2 successes and 2 `Connection error.` lines under the prior 2/2 setting. 
Bumping to 5 retries (≈0.5+1+2+4+8 ≈ 15 s of jittered backoff per request, 6 connection attempts total per page) absorbs the transient failures; bumping concurrency to 4 takes back the throughput the prior cap forfeited. CI matrix `max-parallel: 4` is unchanged — the new global ceiling of 4×4 = 16 in flight is still half the failure-mode threshold of 28 from #305 even before accounting for the scale-out, so no workflow change needed (#306). - `.github/workflows/translate-docs.yml`: cap the `translate` matrix at `max-parallel: 4` so the 14-language fan-out can't burst past the LiteLLM proxy's connection-drop knee point. The previous `MAX_CONCURRENT = 2` cap in `scripts/translate-docs/cli.ts` (#300) limited per-job concurrency but not cross-job, so under push-to-main the proxy at `ANTHROPIC_BASE_URL` saw up to 14 jobs × 2 = 28 simultaneous requests and returned `APIConnectionError ("Connection error.")` on most of them — surfaced as a workflow-wide failure on run 25532970192 where all 14 matrix jobs errored. With the cap set to 4, the proxy sees at most 8 in-flight; wall-clock cost is bounded since each job is 4–9 minutes and 14 langs in batches of 4 still completes well inside the workflow's existing footprint. Tier-1 (Sonnet, 7 langs sharing one upstream model_name) is the cohort that hit the cliff hardest; Tier-2/3 (Haiku) had headroom and only the single largest doc page consistently errored (#305). From 5a6813e262d6a3aa990d6c06882d4b221682f651 Mon Sep 17 00:00:00 2001 From: NiveditJain Date: Fri, 8 May 2026 01:07:52 -0700 Subject: [PATCH 6/6] fix: tighten fence regex to exactly 3 backticks; correct CHANGELOG PR ref to #313 Address CodeRabbit findings on PR #313: 1. /^```/ matched 4-tick fence markers too, which would miscount inner ``` content inside a quad-tick block as a fence marker. Tightened to /^```(?!`)/ so only exactly-three-backtick lines are counted. 2. CHANGELOG entry referenced #311 (the predecessor of this PR); updated to #313. 
Added regression test for the quad-tick edge case. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 2 +- __tests__/scripts/translate-docs/mdx-translator.test.ts | 7 +++++++ scripts/translate-docs/mdx-translator.ts | 5 ++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5198ce22..3f24e68d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## Unreleased ### Fixes -- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `
</div>` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; a subsequent rebase against main found `docs/i18n/README.ar.md` regenerated by the auto-translate workflow (#312) with the same bug. The helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from all three affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#311). +- `scripts/translate-docs/mdx-translator.ts`: new `stripStrayTrailingFence` helper, wired into both `translateMdxPage` and `translateReadme`, drops a stray trailing ` ``` ` line that streamed Sonnet runs of long pages sometimes append. The unmatched fence opens a code block that consumes everything to EOF — including the wrapping `
</div>` for RTL READMEs — and surfaces in Mintlify as `Failed to parse page content at path i18n/README.he.md: Expected a closing tag for <div>
(6:1-6:16)`. Empirically observed on run 25542951106 (post-streaming-switch #307): `docs/i18n/README.he.md` and `docs/i18n/README.tr.md` both ended with 31 fence-line markers (one stray) instead of the canonical 30; a subsequent rebase against main found `docs/i18n/README.ar.md` regenerated by the auto-translate workflow (#312) with the same bug. The helper detects the odd-count case and removes only the last unmatched fence, preserving every balanced pair before it. Also strips the stray trailing fence from all three affected files in this commit so Mintlify can deploy without a re-translate. Six-case unit test covers balanced-unchanged, no-fence-unchanged, stray-trailing-after-balanced-pair, lone-fence, embedded-non-fence-mid-line, and language-tagged pairs (#313). - `scripts/translate-docs/translator.ts`: switch `translateContent` from `anthropic.messages.create(...)` to `anthropic.messages.stream(...).finalMessage()` so large Tier-1 (Sonnet) translations don't hit AWS Bedrock's 300 s synchronous `InvokeModel` ceiling. The LiteLLM proxy at `models.aikin.club` routes `claude-sonnet-4-6` weighted 1:1 across `anthropic/claude-sonnet-4-6` and `bedrock/us.anthropic.claude-sonnet-4-6`; under translate-docs load (4 jobs × 4 in-flight = 16 concurrent) any request that lands on Bedrock and runs >300 s is severed by Bedrock and surfaces to the SDK as `APIConnectionError ("Connection error.")` — exactly the symptom that survived #306 (SDK retry bump) and the platform-side `request_timeout: 300 → 600` lift in `exospherehost/platform#345`. 
Two consecutive matrix runs post-platform-fix ([25540656053](https://github.com/exospherehost/failproofai/actions/runs/25540656053), [25541614351](https://github.com/exospherehost/failproofai/actions/runs/25541614351)) showed the same deterministic failure cohort: the 4 largest pages (`built-in-policies`, `architecture`, `configuration`, `custom-policies`, plus `README`) failing at ~317 s for in-flight slot 1/2 and ~367 s for slots 3/4 — both below the new 600 s ceiling, so the wall isn't ours. `messages.stream(...).finalMessage()` returns the same `Message` shape so the function's public return type is unchanged; Bedrock falls back to `InvokeModelWithResponseStream` (no 300 s wall) and Anthropic-direct supports streaming for the full 10-minute non-streaming budget. SDK `maxRetries: 5`, per-job `MAX_CONCURRENT: 4`, and the platform `request_timeout: 600 s` ceiling all stay as the correct safety bounds; the actual unblock was on the client-side request shape (#307). - `scripts/translate-docs`: bump SDK `maxRetries` from the Anthropic default of 2 to 5 in `translator.ts:getClient` and raise per-job `MAX_CONCURRENT` from 2 to 4 in `cli.ts`, both now env-overridable via `TRANSLATE_MAX_RETRIES` and `TRANSLATE_MAX_CONCURRENT`. The LiteLLM proxy behind `ANTHROPIC_BASE_URL` has been horizontally scaled, so the previous cap of 2 (set in #300 to dodge the gateway's connection-drop cliff at ~2 in flight) now leaves capacity on the floor. The errors that *do* still surface are no longer load-induced — they are per-request transient failures (cold replicas, LB hashing landing on an unhealthy pod, idle-socket TCP resets) where the SDK's default 2-retry budget runs out before the LB can route a retry to a healthy replica, and `Anthropic.APIConnectionError ("Connection error.")` bubbles up. Empirically observed: a `--languages zh --force` re-run (Tier-1 Sonnet, 5 uncached MDX pages) returned 2 successes and 2 `Connection error.` lines under the prior 2/2 setting. 
Bumping to 5 retries (≈0.5+1+2+4+8 ≈ 15 s of jittered backoff per request, 6 connection attempts total per page) absorbs the transient failures; bumping concurrency to 4 takes back the throughput the prior cap forfeited. CI matrix `max-parallel: 4` is unchanged — the new global ceiling of 4×4 = 16 in flight is still half the failure-mode threshold of 28 from #305 even before accounting for the scale-out, so no workflow change needed (#306). - `.github/workflows/translate-docs.yml`: cap the `translate` matrix at `max-parallel: 4` so the 14-language fan-out can't burst past the LiteLLM proxy's connection-drop knee point. The previous `MAX_CONCURRENT = 2` cap in `scripts/translate-docs/cli.ts` (#300) limited per-job concurrency but not cross-job, so under push-to-main the proxy at `ANTHROPIC_BASE_URL` saw up to 14 jobs × 2 = 28 simultaneous requests and returned `APIConnectionError ("Connection error.")` on most of them — surfaced as a workflow-wide failure on run 25532970192 where all 14 matrix jobs errored. With the cap set to 4, the proxy sees at most 8 in-flight; wall-clock cost is bounded since each job is 4–9 minutes and 14 langs in batches of 4 still completes well inside the workflow's existing footprint. Tier-1 (Sonnet, 7 langs sharing one upstream model_name) is the cohort that hit the cliff hardest; Tier-2/3 (Haiku) had headroom and only the single largest doc page consistently errored (#305). diff --git a/__tests__/scripts/translate-docs/mdx-translator.test.ts b/__tests__/scripts/translate-docs/mdx-translator.test.ts index 2262f4ab..59bce5f3 100644 --- a/__tests__/scripts/translate-docs/mdx-translator.test.ts +++ b/__tests__/scripts/translate-docs/mdx-translator.test.ts @@ -165,4 +165,11 @@ describe("stripStrayTrailingFence", () => { const input = "```ts\nfoo\n```\n\n```bash\nbar\n```\n"; expect(stripStrayTrailingFence(input)).toBe(input); }); + + it("does not count quad-tick fence markers as triple-tick fences", () => { + // 4-tick block ```` ... 
```` legally contains ``` as content; only the + // 4-tick lines are real markers and should not be counted by the helper. + const input = "````\ninner ``` content\n````\n"; + expect(stripStrayTrailingFence(input)).toBe(input); + }); }); diff --git a/scripts/translate-docs/mdx-translator.ts b/scripts/translate-docs/mdx-translator.ts index 7cbf5aae..25bb1d18 100644 --- a/scripts/translate-docs/mdx-translator.ts +++ b/scripts/translate-docs/mdx-translator.ts @@ -82,7 +82,10 @@ export function stripStrayTrailingFence(content: string): string { const lines = content.split("\n"); const fenceLineIndices: number[] = []; for (let i = 0; i < lines.length; i++) { - if (/^```/.test(lines[i])) fenceLineIndices.push(i); + // Match exactly three backticks at start of line — `^```(?!`)` excludes + // longer-fence markers (```` etc.) so an inner ``` *inside* a quad-tick + // block isn't miscounted as a marker. + if (/^```(?!`)/.test(lines[i])) fenceLineIndices.push(i); } if (fenceLineIndices.length % 2 === 0) return content; const dropIdx = fenceLineIndices[fenceLineIndices.length - 1];