From ba6cce260ff13958433a77650ac45461b90fa8ef Mon Sep 17 00:00:00 2001
From: anandgupta42 <anand@altimate.ai>
Date: Wed, 3 Jun 2026 16:27:34 -0700
Subject: [PATCH 1/2] fix: two tests flaky under parallel CI load (S27 + trace
 snapshot)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both pass locally but fail consistently in CI's heavy parallel run (9474
tests / 378 files) — the repo's "no flaky tests under resource contention"
case. Neither is caused by any feature change; they fail identically on
unrelated PRs (#854/#858/#863), blocking all of them.

- `real-tool-simulation` S27: the progressive-suggestion dedup state is a
  module-global Set. The test's `beforeEach` reset used a dynamic
  `await import`, which under parallel CI can resolve to a different module
  instance than the tool's static import — so the real Set is never reset and
  accumulates `sql_analyze` from S25/S26 → S27 sees no suggestion. Fix: import
  `PostConnectSuggestions` statically (same instance the tools use); reset in
  S27 too.
- `tracing-adversarial-snapshot` "shows 'running' status": waited a fixed 50ms
  for a debounced async snapshot write, too short under CI load → read a stale
  snapshot. Fix: poll the on-disk status until expected (timeout 4s) instead
  of a fixed sleep.

Closes #879

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../tracing-adversarial-snapshot.test.ts      | 32 ++++++++++++++-----
 .../test/session/real-tool-simulation.test.ts |  7 +++-
 2 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts b/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts
index cfe61840f6..02922b50b2 100644
--- a/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts
+++ b/packages/opencode/test/altimate/tracing-adversarial-snapshot.test.ts
@@ -37,6 +37,26 @@ const ZERO_STEP = {
   tokens: { input: 0, output: 0, reasoning: 0, cache: { read: 0, write: 0 } },
 }
 
+// Poll the on-disk snapshot until its status is `expected`, then return it; throws on timeout.
+// Snapshot writes are debounced/async, so a fixed sleep reads a stale status under CI load.
+async function pollStatus(tracer: { getTracePath(): string | undefined }, expected: string, timeoutMs = 4000) {
+  const deadline = Date.now() + timeoutMs
+  let last = "<none>"
+  for (;;) {
+    const expired = Date.now() >= deadline // sampled pre-read: the last pass still reads once
+    try {
+      const snap = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile
+      last = snap.summary.status
+      if (last === expected) return snap
+    } catch {
+      /* file mid-write or not yet created — keep polling */
+    }
+    if (expired) break
+    await new Promise((r) => setTimeout(r, 25))
+  }
+  throw new Error(`timed out after ${timeoutMs}ms waiting for status '${expected}' (last seen '${last}')`)
+}
+
 // ---------------------------------------------------------------------------
 // 1. buildTraceFile — snapshot isolation from mutations
 // ---------------------------------------------------------------------------
@@ -110,8 +130,6 @@ describe("buildTraceFile — snapshot isolation", () => {
   test("buildTraceFile shows 'running' status during active generation", async () => {
     const tracer = Recap.withExporters([new FileExporter(tmpDir)])
     tracer.startTrace("s-running", { prompt: "test" })
-    // Wait for initial snapshot to complete
-    await new Promise((r) => setTimeout(r, 50))
 
     tracer.logStepStart({ id: "1" })
     tracer.logToolCall({
@@ -120,15 +138,13 @@ describe("buildTraceFile — snapshot isolation", () => {
       state: { status: "completed", input: {}, output: "ok", time: { start: 1, end: 2 } },
     })
 
-    // Wait for snapshot — should show "running" since generation is in progress
-    await new Promise((r) => setTimeout(r, 50))
-    const snap = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile
+    // Snapshot should show "running" since generation is in progress.
+    const snap = await pollStatus(tracer, "running")
     expect(snap.summary.status).toBe("running")
 
-    // After finishing generation, should show "completed"
+    // After finishing generation, should show "completed".
     tracer.logStepFinish(ZERO_STEP)
-    await new Promise((r) => setTimeout(r, 50))
-    const snap2 = JSON.parse(await fs.readFile(tracer.getTracePath()!, "utf-8")) as TraceFile
+    const snap2 = await pollStatus(tracer, "completed")
     expect(snap2.summary.status).toBe("completed")
 
     await tracer.endTrace()
diff --git a/packages/opencode/test/session/real-tool-simulation.test.ts b/packages/opencode/test/session/real-tool-simulation.test.ts
index 3b21cc2e23..1523b8b861 100644
--- a/packages/opencode/test/session/real-tool-simulation.test.ts
+++ b/packages/opencode/test/session/real-tool-simulation.test.ts
@@ -12,6 +12,11 @@
 import { describe, expect, test, beforeEach, mock } from "bun:test"
 import { Dispatcher } from "../../src/altimate/native"
 import { Log } from "../../src/util/log"
+// Static import so resetShownSuggestions() targets the SAME module instance that
+// the tools (sql-analyze, schema-inspect) use. A dynamic `await import` here can
+// resolve to a different instance under parallel CI, leaving the real dedup Set
+// un-reset → flaky cross-test pollution of the progressive-suggestion state.
+import { PostConnectSuggestions } from "../../src/altimate/tools/post-connect-suggestions"
 
 Log.init({ print: false })
 
@@ -37,7 +42,6 @@ function makeCtx(agent = "builder") {
 // ---------------------------------------------------------------------------
 beforeEach(async () => {
   Dispatcher.reset()
-  const { PostConnectSuggestions } = await import("../../src/altimate/tools/post-connect-suggestions")
   PostConnectSuggestions.resetShownSuggestions()
 })
 
@@ -291,6 +295,7 @@ describe("REAL EXEC: sql_analyze tool", () => {
 
   test("S27: sql_analyze with parse error — no suggestion", async () => {
     Dispatcher.reset()
+    PostConnectSuggestions.resetShownSuggestions()
     Dispatcher.register("sql.analyze", async () => ({
       success: true, issues: [], issue_count: 0, confidence: "none",
       confidence_factors: [], error: "Parse error at line 1",

From 79df91a34c221c5c3ae1301f96f17cd189148238 Mon Sep 17 00:00:00 2001
From: anandgupta42 <anand@altimate.ai>
Date: Wed, 3 Jun 2026 16:39:21 -0700
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20raise=20CI=20test=20timeout=2030s?=
 =?UTF-8?q?=E2=86=9290s=20to=20kill=20resource-contention=20flakiness?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "TypeScript" job runs all 9500+ tests in one parallel bun process. Under
CPU contention a few slower tests (real fs/spawn/git-bootstrap) get starved and
exceed the 30s per-test timeout NON-deterministically — different tests each run
(observed: 32s and 51s timeouts). This blocks every PR with failures unrelated
to the diff. 90s is 3x the old limit and ~1.8x the worst observed (51s),
removing the flakiness without masking genuinely-hung tests.

Part of #879.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fdb63be6b6..a9085dd023 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -98,8 +98,13 @@ jobs:
         # Cloud E2E tests (Snowflake, BigQuery, Databricks) auto-skip when
         # ALTIMATE_CODE_CONN_* env vars are not set. Docker E2E tests auto-skip
         # when Docker is not available. No exclusion needed — skipIf handles it.
-        # --timeout 30000: matches package.json "test" script; prevents 5s default
-        # from cutting off tests that run bun install or bootstrap git instances.
+        # --timeout 90000: the full suite (9500+ tests across 379 files) runs in
+        # one parallel bun process. Under CPU contention a handful of slower tests
+        # (real fs/spawn/git-bootstrap work) get starved and exceed a tight timeout
+        # NON-deterministically — different tests each run — failing CI with
+        # "timed out after Nms" (observed 32s/51s at the old 30s limit). 90s is 3x
+        # the old limit and ~1.8x the worst observed (51s), killing the
+        # resource-contention flakiness without masking genuinely-hung tests.
         #
         # Bun 1.3.x has a known segfault during process cleanup after all tests
         # pass (exit code 143/SIGTERM or 134/SIGABRT). We capture test output and
@@ -108,7 +113,7 @@ jobs:
         run: |
           # Redirect bun output to file, then cat it for CI visibility.
           # This avoids tee/pipe issues where SIGTERM kills tee before flush.
-          bun test --timeout 30000 > /tmp/test-output.txt 2>&1 || true
+          bun test --timeout 90000 > /tmp/test-output.txt 2>&1 || true
           cat /tmp/test-output.txt
 
           # Extract pass/fail counts from Bun test summary (e.g., " 5362 pass")