From ba6cce260ff13958433a77650ac45461b90fa8ef Mon Sep 17 00:00:00 2001 From: anandgupta42 Date: Wed, 3 Jun 2026 16:27:34 -0700 Subject: [PATCH 1/2] fix: two tests flaky under parallel CI load (S27 + trace snapshot) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both pass locally but fail consistently in CI's heavy parallel run (9474 tests / 378 files) — the repo's "no flaky tests under resource contention" case. Neither is caused by any feature change; they fail identically on unrelated PRs (#854/#858/#863), blocking all of them. - `real-tool-simulation` S27: the progressive-suggestion dedup state is a module-global Set. The test's `beforeEach` reset used a dynamic `await import`, which under parallel CI can resolve to a different module instance than the tool's static import — so the real Set is never reset and accumulates `sql_analyze` from S25/S26 → S27 sees no suggestion. Fix: import `PostConnectSuggestions` statically (same instance the tools use); reset in S27 too. - `tracing-adversarial-snapshot` "shows 'running' status": waited a fixed 50ms for a debounced async snapshot write, too short under CI load → read a stale snapshot. Fix: poll the on-disk status until expected (timeout 4s) instead of a fixed sleep. 
Closes #879 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../tracing-adversarial-snapshot.test.ts | 32 ++++++++++++++----- .../test/session/real-tool-simulation.test.ts | 7 +++- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts b/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts index cfe61840f6..02922b50b2 100644 --- a/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts +++ b/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts @@ -37,6 +37,26 @@ const ZERO_STEP = { tokens: { input: 0, output: 0, reasoning: 0, cache: { read: 0, write: 0 } }, } +// Poll the on-disk snapshot until it reaches `expected` status, instead of a +// single fixed sleep. Snapshot writes are debounced/async, so a hardcoded delay +// is too short under heavy parallel CI load (the snapshot hasn't flushed yet) → +// flaky reads of a stale status. Polling is robust regardless of machine load. +async function pollStatus(tracer: { getTracePath(): string | undefined }, expected: string, timeoutMs = 4000) { + const start = Date.now() + let last = "" + while (Date.now() - start < timeoutMs) { + try { + const snap = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile + last = snap.summary.status + if (last === expected) return snap + } catch { + /* file mid-write or not yet created — keep polling */ + } + await new Promise((r) => setTimeout(r, 25)) + } + throw new Error(`timed out after ${timeoutMs}ms waiting for status '${expected}' (last seen '${last}')`) +} + // --------------------------------------------------------------------------- // 1. 
buildTraceFile — snapshot isolation from mutations // --------------------------------------------------------------------------- @@ -110,8 +130,6 @@ describe("buildTraceFile — snapshot isolation", () => { test("buildTraceFile shows 'running' status during active generation", async () => { const tracer = Recap.withExporters([new FileExporter(tmpDir)]) tracer.startTrace("s-running", { prompt: "test" }) - // Wait for initial snapshot to complete - await new Promise((r) => setTimeout(r, 50)) tracer.logStepStart({ id: "1" }) tracer.logToolCall({ @@ -120,15 +138,13 @@ describe("buildTraceFile — snapshot isolation", () => { state: { status: "completed", input: {}, output: "ok", time: { start: 1, end: 2 } }, }) - // Wait for snapshot — should show "running" since generation is in progress - await new Promise((r) => setTimeout(r, 50)) - const snap = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile + // Snapshot should show "running" since generation is in progress. + const snap = await pollStatus(tracer, "running") expect(snap.summary.status).toBe("running") - // After finishing generation, should show "completed" + // After finishing generation, should show "completed". 
tracer.logStepFinish(ZERO_STEP) - await new Promise((r) => setTimeout(r, 50)) - const snap2 = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile + const snap2 = await pollStatus(tracer, "completed") expect(snap2.summary.status).toBe("completed") await tracer.endTrace() diff --git a/packages/opencode/test/session/real-tool-simulation.test.ts b/packages/opencode/test/session/real-tool-simulation.test.ts index 3b21cc2e23..1523b8b861 100644 --- a/packages/opencode/test/session/real-tool-simulation.test.ts +++ b/packages/opencode/test/session/real-tool-simulation.test.ts @@ -12,6 +12,11 @@ import { describe, expect, test, beforeEach, mock } from "bun:test" import { Dispatcher } from "../../src/altimate/native" import { Log } from "../../src/util/log" +// Static import so resetShownSuggestions() targets the SAME module instance that +// the tools (sql-analyze, schema-inspect) use. A dynamic `await import` here can +// resolve to a different instance under parallel CI, leaving the real dedup Set +// un-reset → flaky cross-test pollution of the progressive-suggestion state. 
+import { PostConnectSuggestions } from "../../src/altimate/tools/post-connect-suggestions" Log.init({ print: false }) @@ -37,7 +42,6 @@ function makeCtx(agent = "builder") { // --------------------------------------------------------------------------- beforeEach(async () => { Dispatcher.reset() - const { PostConnectSuggestions } = await import("../../src/altimate/tools/post-connect-suggestions") PostConnectSuggestions.resetShownSuggestions() }) @@ -291,6 +295,7 @@ describe("REAL EXEC: sql_analyze tool", () => { test("S27: sql_analyze with parse error — no suggestion", async () => { Dispatcher.reset() + PostConnectSuggestions.resetShownSuggestions() Dispatcher.register("sql.analyze", async () => ({ success: true, issues: [], issue_count: 0, confidence: "none", confidence_factors: [], error: "Parse error at line 1", From 79df91a34c221c5c3ae1301f96f17cd189148238 Mon Sep 17 00:00:00 2001 From: anandgupta42 Date: Wed, 3 Jun 2026 16:39:21 -0700 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20raise=20CI=20test=20timeout=2030s?= =?UTF-8?q?=E2=86=9290s=20to=20kill=20resource-contention=20flakiness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "TypeScript" job runs all 9500+ tests in one parallel bun process. Under CPU contention a few slower tests (real fs/spawn/git-bootstrap) get starved and exceed the 30s per-test timeout NON-deterministically — different tests each run (observed: 32s and 51s timeouts). This blocks every PR with failures unrelated to the diff. 90s is 3x the old limit (~1.8x the worst observed), removing the flakiness without masking genuinely-hung tests. Part of #879. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fdb63be6b6..a9085dd023 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,8 +98,13 @@ jobs: # Cloud E2E tests (Snowflake, BigQuery, Databricks) auto-skip when # ALTIMATE_CODE_CONN_* env vars are not set. Docker E2E tests auto-skip # when Docker is not available. No exclusion needed — skipIf handles it. - # --timeout 30000: matches package.json "test" script; prevents 5s default - # from cutting off tests that run bun install or bootstrap git instances. + # --timeout 90000: the full suite (9500+ tests across 379 files) runs in + # one parallel bun process. Under CPU contention a handful of slower tests + # (real fs/spawn/git-bootstrap work) get starved and exceed a tight timeout + # NON-deterministically — different tests each run — failing CI with + # "timed out after Nms" (observed 32s/51s at the old 30s limit). 90s is + # 3x the old limit (~1.8x the worst observed), killing the resource-contention + # flakiness without masking genuinely-hung tests. # # Bun 1.3.x has a known segfault during process cleanup after all tests # pass (exit code 143/SIGTERM or 134/SIGABRT). We capture test output and @@ -108,7 +113,7 @@ jobs: run: | # Redirect bun output to file, then cat it for CI visibility. # This avoids tee/pipe issues where SIGTERM kills tee before flush. - bun test --timeout 30000 > /tmp/test-output.txt 2>&1 || true + bun test --timeout 90000 > /tmp/test-output.txt 2>&1 || true cat /tmp/test-output.txt # Extract pass/fail counts from Bun test summary (e.g., " 5362 pass")