AltimateAI · anandgupta42 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · cubic-dev-ai
diff --git a/packages/opencode/src/altimate/observability/tracing.ts b/packages/opencode/src/altimate/observability/tracing.ts
@@ -47,6 +47,14 @@ export interface TraceSpan {
   endTime?: number
   status: "ok" | "error"
   statusMessage?: string
+  /**
+   * True when this span was force-closed by trace reconstruction (worker
+   * restart / cache eviction) rather than by a real error. The span keeps
+   * `status: "error"` so its boundary stays visible, but consumers (the viewer,
+   * error aggregations) should treat it as "incomplete (reconstructed)" — not a
+   * genuine agent/tool failure.
+   */
+  interrupted?: boolean
 
   // --- LLM / generation fields (populated for kind=generation) ---
   model?: {
@@ -349,6 +357,62 @@ function formatDurationShort(ms: number): string {
 // Exported so the viewer's chat-tab dedupe can compare against the same boundary
 // (otherwise it'd silently drift if either side changes the magic number).
 export const USER_MESSAGE_INPUT_MAX_CHARS = 4000
+
+/**
+ * Upper bound on the number of spans serialized into a single `ses_<id>.json`.
+ * `snapshot()` rewrites the entire spans array on every event, so an unbounded
+ * long-lived session would grow the file without limit and pay O(n) per write
+ * (O(n²) over the session). When a trace exceeds this, serialization keeps the
+ * head (early context: prompt + first tools) and the tail (most recent
+ * activity) and elides the middle with a single marker span — bounding both
+ * file size and per-event write cost. In-memory spans are untouched; only the
+ * on-disk projection is capped. Override with `ALTIMATE_TRACE_MAX_SPANS`.
+ */
+export const MAX_SERIALIZED_SPANS = ((fallback: number): number => {
+  const parsed = Number.parseInt(process.env["ALTIMATE_TRACE_MAX_SPANS"] ?? "", 10)
+  return !Number.isFinite(parsed) || parsed <= 0 ? fallback : parsed
+})(5000)
+
+/**
+ * Bound the spans written to disk to at most `cap` entries while preserving
+ * the most useful context: keep the head (root span, prompt, first tool calls)
+ * and the tail (most recent activity), and replace the elided middle with one
+ * marker span.
+ *
+ * A cap of 1 or 2 leaves no room for head + marker + tail, so no marker is
+ * emitted: cap 1 returns the root span alone (or the first span when there is
+ * no root), and cap 2 returns the root plus the most recent non-root span (or
+ * the last two spans when there is no root).
+ *
+ * @param spans Spans to project. The input array is never mutated.
+ * @param cap Maximum number of spans to return. Defaults to `MAX_SERIALIZED_SPANS`.
+ * @returns `spans` itself when `cap <= 0` or the input already fits within
+ *   `cap`; otherwise a new, shorter array.
+ */
+export function capSpansForSerialization(spans: TraceSpan[], cap: number = MAX_SERIALIZED_SPANS): TraceSpan[] {
+  if (cap <= 0 || spans.length <= cap) return spans
+  // Look up the structural root (session) span once; the tiny-cap branches,
+  // the head guarantee and the marker's parent all use it.
+  const rootSpan = spans.find((s) => s.parentSpanId === null) ?? null
+  if (cap === 1) {
+    // spans.length > cap here, so the slice always holds exactly one span.
+    return rootSpan ? [rootSpan] : spans.slice(0, 1)
+  }
+  if (cap === 2) {
+    if (!rootSpan) return spans.slice(-2)
+    // Search a reversed copy so the caller's array keeps its order.
+    const lastNonRoot = [...spans].reverse().find((s) => s.spanId !== rootSpan.spanId)
+    return lastNonRoot ? [rootSpan, lastNonRoot] : [rootSpan]
+  }
+  const headCount = Math.max(1, Math.floor(cap * 0.3))
+  const tailCount = Math.max(1, cap - headCount - 1) // reserve one slot for the marker
+  // Only elide if the result is actually smaller than the input (+1 for the
+  // marker we'd add) — otherwise there's nothing to gain.
+  if (headCount + tailCount + 1 >= spans.length) return spans
+  let head = spans.slice(0, headCount)
+  const tail = spans.slice(spans.length - tailCount)
+  // Guarantee the structural root (session) span survives the cut even if it
+  // isn't in the head slice — rehydrate and the viewer's tree both require it,
+  // and the elision marker is parented to it. In practice the root is index 0
+  // (pushed first), so this is defensive, but it makes the invariant explicit
+  // instead of silently depending on span ordering.
+  if (rootSpan && !head.some((s) => s.spanId === rootSpan.spanId) && !tail.some((s) => s.spanId === rootSpan.spanId)) {
+    head = [rootSpan, ...head.slice(0, headCount - 1)]
+  }
+  const elided = spans.length - head.length - tail.length
+  const rootId = rootSpan?.spanId ?? null
+  // Give the marker a zero-length interval at the end of the last kept head
+  // span (falling back to that span's start, then 0).
+  const anchor = head[head.length - 1]
+  const anchorTime = anchor?.endTime ?? anchor?.startTime ?? 0
+  const marker: TraceSpan = {
+    spanId: `elided-${head.length}-${tail.length}-of-${spans.length}`,
+    parentSpanId: rootId,
+    name: `… ${elided} spans elided (trace exceeded ${cap} spans) …`,
+    kind: "span",
+    startTime: anchorTime,
+    endTime: anchorTime,
+    status: "ok",
+    attributes: { elided, totalSpans: spans.length },
+  }
+  return [...head, marker, ...tail]
+}
 // altimate_change end
 
 export class Trace {
@@ -592,6 +656,9 @@ export class Trace {
         s.endTime = now
         s.status = "error"
         s.statusMessage = "interrupted — altimate-code restarted before this step finished recording; not an agent failure"
+        // Distinguish a recorder restart from a real failure so the viewer and
+        // error aggregations don't paint this red or count it as an incident.
+        s.interrupted = true
       }
     }
     this.endTraceStarted = false
@@ -927,6 +994,8 @@ export class Trace {
       snapshotMetadata = this.metadata
     }
 
+    snapshotSpans = capSpansForSerialization(snapshotSpans)
+
     return {
       version: 2,
       traceId: this.traceId,

diff --git a/packages/opencode/src/altimate/observability/viewer.ts b/packages/opencode/src/altimate/observability/viewer.ts
@@ -196,6 +196,7 @@ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Ar
 .wf-preview .pv-tag.model { background: rgba(77,142,255,0.12); color: var(--secondary); }
 .wf-preview .pv-tag.tok { background: rgba(74,222,128,0.12); color: var(--green); }
 .wf-preview .pv-tag.err { background: rgba(248,113,113,0.12); color: var(--red); }
+.wf-preview .pv-tag.warn { background: rgba(251,191,36,0.12); color: var(--orange); }
 .wf-bar-c { flex: 1; height: 18px; position: relative; overflow: hidden; }
 .wf-bar { position: absolute; height: 100%; border-radius: 3px; min-width: 3px; opacity: 0.85; display: flex; align-items: center; padding-left: 4px; }
 .wf-bar.generation { background: var(--secondary); }
@@ -222,6 +223,7 @@ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Ar
 .tree-preview .pv-tag.model { background: rgba(77,142,255,0.12); color: var(--secondary); }
 .tree-preview .pv-tag.tok { background: rgba(74,222,128,0.12); color: var(--green); }
 .tree-preview .pv-tag.err { background: rgba(248,113,113,0.12); color: var(--red); }
+.tree-preview .pv-tag.warn { background: rgba(251,191,36,0.12); color: var(--orange); }
 .tree-detail { margin-top: 8px; padding: 8px; background: var(--bg); border: 1px solid var(--border); border-radius: 6px; font-size: 12px; display: none; }
 .tree-detail.open { display: block; }
 
@@ -443,7 +445,9 @@ var icons = { session: '\\u25A0', generation: '\\u2B50', tool: '\\u2692', text:
 function getPreview(span) {
   var parts = [];
   if (span.status === 'error' && span.statusMessage) {
-    return '<span class="pv-tag err">\\u2718</span>' + e((span.statusMessage || '').slice(0, 120));
+    // Interrupted = recorder restart, not a real failure: amber warn, not red.
+    var tag = span.interrupted ? '<span class="pv-tag warn">\\u26A0</span>' : '<span class="pv-tag err">\\u2718</span>';
+    return tag + e((span.statusMessage || '').slice(0, 120));
   }
   if (span.kind === 'tool') {
     var inp = span.input;
@@ -467,7 +471,7 @@ function getPreview(span) {
         }
       }
     }
-    if (span.status === 'error') parts.unshift('<span class="pv-tag err">\\u2718</span>');
+    if (span.status === 'error') parts.unshift(span.interrupted ? '<span class="pv-tag warn">\\u26A0</span>' : '<span class="pv-tag err">\\u2718</span>');
   } else if (span.kind === 'generation') {
     if (span.model && span.model.modelId) parts.push('<span class="pv-tag model">' + e(span.model.modelId) + '</span>');
     if (span.tokens && span.tokens.total) parts.push('<span class="pv-tag tok">' + Number(span.tokens.total).toLocaleString() + ' tok</span>');
@@ -487,8 +491,9 @@ function showDetail(span) {
   var dur = (span.endTime || Date.now()) - (span.startTime || 0);
   var h = '<div class="detail-panel"><h3>' + e(span.name) + '</h3><dl class="dg">';
   h += '<dt>Kind</dt><dd>' + e(span.kind||'') + '</dd>';
-  h += '<dt>Status</dt><dd' + (span.status==='error'?' style="color:var(--red)"':'') + '>' + e(span.status||'') + '</dd>';
-  if (span.statusMessage) h += '<dt>Error</dt><dd style="color:var(--red)">' + e(span.statusMessage) + '</dd>';
+  var statusColor = span.interrupted ? 'var(--orange)' : (span.status==='error' ? 'var(--red)' : '');
+  h += '<dt>Status</dt><dd' + (statusColor?' style="color:'+statusColor+'"':'') + '>' + e(span.interrupted ? 'interrupted' : (span.status||'')) + '</dd>';
+  if (span.statusMessage) h += '<dt>' + (span.interrupted ? 'Interrupted' : 'Error') + '</dt><dd style="color:' + (span.interrupted ? 'var(--orange)' : 'var(--red)') + '">' + e(span.statusMessage) + '</dd>';
   h += '<dt>Duration</dt><dd>' + fd(dur) + '</dd>';
   if (span.model) {
     if (span.model.modelId) h += '<dt>Model</dt><dd>' + e(span.model.modelId) + '</dd>';
@@ -565,7 +570,10 @@ function showDetail(span) {
   // --- Classify all tool spans upfront ---
   var toolSpans = nonSession.filter(function(sp) { return sp.kind === 'tool'; });
   var genSpans = nonSession.filter(function(sp) { return sp.kind === 'generation'; });
-  var errSpans = nonSession.filter(function(sp) { return sp.status === 'error'; });
+  // Reconstructed (interrupted) spans keep status:'error' for boundary
+  // visibility, but they reflect a recorder restart — exclude them from the
+  // session error count so a clean session isn't reported as failed.
+  var errSpans = nonSession.filter(function(sp) { return sp.status === 'error' && !sp.interrupted; });
 
   // Categorize files: changed (edit/write) vs read
   var changedFiles = {};
@@ -1239,12 +1247,12 @@ function showDetail(span) {
     var dur = (span.endTime || Date.now()) - (span.startTime||0);
     var left = (st / tTotal * 100).toFixed(2);
     var width = Math.max(0.5, dur / tTotal * 100).toFixed(2);
-    var cls = span.status === 'error' ? 'error' : e(span.kind);
+    var cls = (span.status === 'error' && !span.interrupted) ? 'error' : e(span.kind);
     var row = document.createElement('div');
     row.className = 'wf-row';
     row.setAttribute('data-idx', String(idx));
     if (span.spanId) row.setAttribute('data-span-id', span.spanId);
-    var iconCls = span.status === 'error' ? 'error' : e(span.kind);
+    var iconCls = (span.status === 'error' && !span.interrupted) ? 'error' : e(span.kind);
     var pv = getPreview(span);
     row.innerHTML = '<div class="wf-icon ' + iconCls + '">' + (icons[span.kind]||'\\u2022') + '</div>' +
       '<div class="wf-info"><div class="wf-name">' + e(span.name) + '</div>' + (pv ? '<div class="wf-preview">' + pv + '</div>' : '') + '</div>' +
@@ -1278,7 +1286,8 @@ function showDetail(span) {
       meta.push(fd(dur));
       if (span.tokens) meta.push(Number(span.tokens.total||0) + ' tok');
       if (span.cost) meta.push(fc(span.cost));
-      if (span.status === 'error') meta.push('<span style="color:var(--red)">error</span>');
+      if (span.interrupted) meta.push('<span style="color:var(--orange)">interrupted</span>');
+      else if (span.status === 'error') meta.push('<span style="color:var(--red)">error</span>');
       html += '<div class="tree-node"><div class="tree-item" data-idx="' + idx + '">';
       html += '<div class="tree-head">';
       html += '<span class="tree-type ' + e(span.kind) + '">' + e(span.kind) + '</span>';
@@ -1390,7 +1399,7 @@ function showDetail(span) {
     if (span.kind === 'session') return;
     var idx = spans.indexOf(span);
     var ts = span.startTime ? new Date(span.startTime).toISOString().slice(11,23) : '';
-    var kindCls = span.status === 'error' ? 'error' : e(span.kind);
+    var kindCls = (span.status === 'error' && !span.interrupted) ? 'error' : e(span.kind);
     html += '<div class="log-entry" data-idx="' + idx + '">';
     html += '<span class="log-ts">' + ts + '</span>';
     var logIcon = span.kind === 'generation' ? '\\u2B50' : span.kind === 'tool' ? '\\u2692' : '\\u25A0';
@@ -1400,7 +1409,8 @@ function showDetail(span) {
     if (span.tokens) html += ' <span style="color:var(--dim);font-size:11px">' + Number(span.tokens.total||0) + ' tok</span>';
     if (span.cost) html += ' <span style="color:var(--orange);font-size:11px">' + fc(span.cost) + '</span>';
     if (span.tool && span.tool.durationMs != null) html += ' <span style="color:var(--dim);font-size:11px">' + fd(span.tool.durationMs) + '</span>';
-    if (span.status === 'error') html += ' <span style="color:var(--red);font-size:11px">\\u2718 ' + e((span.statusMessage||'').slice(0,100)) + '</span>';
+    if (span.interrupted) html += ' <span style="color:var(--orange);font-size:11px">\\u26A0 ' + e((span.statusMessage||'').slice(0,100)) + '</span>';
+    else if (span.status === 'error') html += ' <span style="color:var(--red);font-size:11px">\\u2718 ' + e((span.statusMessage||'').slice(0,100)) + '</span>';
     if (span.kind === 'tool' && span.input) {
       var logPv = getPreview(span);
       if (logPv) html += '<div class="log-data" style="color:var(--cyan);opacity:0.7;max-height:none">' + logPv + '</div>';

diff --git a/packages/opencode/src/cli/cmd/tui/worker.ts b/packages/opencode/src/cli/cmd/tui/worker.ts
@@ -47,6 +47,14 @@ const eventStream = {
   abort: undefined as AbortController | undefined,
 }
 
+// altimate_change start — trace: monotonic stream generation. Bumped on every
+// startEventStream() so an in-flight getOrCreateTrace() can detect that its
+// owning stream was torn down while it was suspended at an await. Keyed on a
+// counter rather than the AbortController's object identity so the guard does
+// not silently depend on startEventStream always allocating a fresh controller.
+let streamGeneration = 0
+// altimate_change end
+
 // altimate_change start — trace: per-session traces
 const sessionTraces = new Map<string, Trace>()
 const sessionUserMsgIds = new Map<string, Set<string>>() // Per-session user message IDs (cleaned up on session end)
@@ -83,6 +91,13 @@ async function loadTracingConfig() {
 async function getOrCreateTrace(sessionID: string): Promise<Trace | null> {
   if (!sessionID || !tracingEnabled) return null
   if (sessionTraces.has(sessionID)) return sessionTraces.get(sessionID)!
+  // altimate_change start — capture the stream generation that owns this call so
+  // we can detect a concurrent startEventStream() (e.g. setWorkspace) that
+  // aborted us and cleared the cache while we were suspended at the rehydrate
+  // await below. A counter (not AbortController identity) so we don't depend on
+  // startEventStream's allocation strategy.
+  const generationAtEntry = streamGeneration
+  // altimate_change end
   try {
     if (sessionTraces.size >= MAX_TRACES) {
       const oldest = sessionTraces.keys().next().value
@@ -106,9 +121,21 @@ async function getOrCreateTrace(sessionID: string): Promise<Trace | null> {
       trace.startTrace(sessionID, {})
     }
     // altimate_change end
+    // altimate_change start — if a new stream replaced ours while we were
+    // awaiting rehydrate, this Trace belongs to a stream that's already been
+    // aborted and its cache cleared. Inserting it now would resurrect an orphan
+    // writer into the freshly-cleared map. Discard it and defer to whatever the
+    // live stream has. The check and the set below run in the same synchronous
+    // turn (no await between them), so the insert can't race a later
+    // startEventStream — this closes the suspend-at-await hole specifically.
+    if (streamGeneration !== generationAtEntry) {
+      void trace.endTrace().catch(() => {})
+      return sessionTraces.get(sessionID) ?? null
+    }
     Trace.setActive(trace)
     sessionTraces.set(sessionID, trace)
     return trace
+    // altimate_change end
   } catch {
     return null
   }
@@ -117,6 +144,10 @@ async function getOrCreateTrace(sessionID: string): Promise<Trace | null> {
 
 const startEventStream = (input: { directory: string; workspaceID?: string }) => {
   if (eventStream.abort) eventStream.abort.abort()
+  // altimate_change start — new stream generation; invalidates any in-flight
+  // getOrCreateTrace() suspended at its rehydrate await (see generationAtEntry).
+  streamGeneration++
+  // altimate_change end
   // Clear stale per-stream trace state before starting a new stream instance
   for (const [, trace] of sessionTraces) {
     void trace.endTrace().catch(() => {})