From 81ba30fae5308d9858f00c5a6cebaf5a8d991112 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 11:22:11 +0200 Subject: [PATCH 01/14] smoke: add LLM-driven Phase 1 driver for Android emulator canary Replaces the brittle bash assertion logic of Phase 0 with an LLM runner that takes plain-text test cases (numbered English steps with optional `expect:` postconditions) and uses Claude Sonnet to figure out the right agent-device CLI calls. A committed replay cache at tests/smoke/cache/<name>.json keeps the happy path deterministic and ~$0 in API spend; cache misses fall back to the LLM, and final-tier failures fall back to a Phase-0-style bash recipe so an Anthropic outage doesn't fail the build. Phase 0 stays untouched. Phase 1 ships as `smokeAndroidLLM.yml` with `continue-on-error: true` for the first 2 weeks; flip to required once flake rate is at parity. Files added: - .github/scripts/agent-device-cli.ts (typed wrapper around the CLI) - .github/scripts/agent-device-snapshot-signature.ts (structural cache key) - .github/scripts/agent-device-expect.ts (postcondition DSL) - .github/scripts/agent-device-replay-cache.ts (cache load/lookup/diff) - .github/scripts/agent-device-llm-client.ts (Anthropic /v1/messages with prompt cache + backoff) - .github/scripts/agent-device-llm-driver.ts (orchestrator) - .github/workflows/smokeAndroidLLM.yml (PR + dispatch trigger) - tests/smoke/android-signin.testcase.txt (4 numbered steps for SignIn flow) - package.json: smoke:android:llm script See plan: ~/.claude/plans/buzzing-mixing-dusk.md Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-cli.ts | 191 +++ .github/scripts/agent-device-expect.ts | 85 ++ .github/scripts/agent-device-llm-client.ts | 207 +++ .github/scripts/agent-device-llm-driver.ts | 1240 +++++++++++++++++ .github/scripts/agent-device-replay-cache.ts | 120 ++ .../agent-device-snapshot-signature.ts | 77 + 
.github/workflows/smokeAndroidLLM.yml | 156 +++ package.json | 1 + tests/smoke/android-signin.testcase.txt | 24 + 9 files changed, 2101 insertions(+) create mode 100644 .github/scripts/agent-device-cli.ts create mode 100644 .github/scripts/agent-device-expect.ts create mode 100644 .github/scripts/agent-device-llm-client.ts create mode 100644 .github/scripts/agent-device-llm-driver.ts create mode 100644 .github/scripts/agent-device-replay-cache.ts create mode 100644 .github/scripts/agent-device-snapshot-signature.ts create mode 100644 .github/workflows/smokeAndroidLLM.yml create mode 100644 tests/smoke/android-signin.testcase.txt diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts new file mode 100644 index 000000000000..d062b12f64f5 --- /dev/null +++ b/.github/scripts/agent-device-cli.ts @@ -0,0 +1,191 @@ +// Thin TypeScript wrapper around the `agent-device` CLI. +// +// Why this exists: the CLI emits accessibility-tree snapshots as +// human-readable text (`@e4 [text-field] "Phone or email," [editable]`). +// That format is fine for humans grepping artifacts but bad for an LLM +// because: +// 1. The LLM has to re-tokenize the structure on every turn — wasteful. +// 2. Subtle whitespace/quoting differences across platforms (Android's +// trailing comma vs iOS's no comma) leak into the LLM's reasoning. +// 3. Phantom hallucinated refs are harder to detect against free text. +// +// We parse once here, hand the LLM a typed JSON array, and keep the raw +// text in the artifact for post-mortem. + +import { execFileSync } from "child_process"; + +/** + * One element in the parsed accessibility tree. The optional fields are + * absent when the underlying line lacked them; do NOT default to empty + * strings — the LLM uses presence/absence as a signal (e.g. a button with + * no text label is suspicious). 
+ */ +export type SnapshotNode = { + ref: string; + kind: string; + text?: string; + editable: boolean; + enabled: boolean; + scrollable: boolean; +}; + +export type Snapshot = { + page?: string; + app?: string; + nodes: SnapshotNode[]; + nodeCount: number; + raw: string; +}; + +export type AppState = { + foregroundApp?: string; + activity?: string; + raw: string; +}; + +const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; + +// Bound every CLI invocation so a hung emulator can't wedge the smoke. +// 30s is generous for read-only commands (snapshot/screenshot) and +// effectively a "this should have completed already" tripwire. +const CLI_TIMEOUT_MS = 30_000; + +function run(args: string[]): string { + return execFileSync("agent-device", args, { + encoding: "utf8", + timeout: CLI_TIMEOUT_MS, + maxBuffer: 8 * 1024 * 1024, + }); +} + +function tryRun(args: string[]): { + stdout: string; + ok: boolean; + error?: Error; +} { + try { + return { stdout: run(args), ok: true }; + } catch (e) { + return { stdout: "", ok: false, error: e as Error }; + } +} + +/** + * Parse a single snapshot line of the form: + * `@e4 [text-field] "Phone or email," [editable]` + * `@e5 [button] "Continue"` + * `@e2 [scroll-area] [scrollable]` + * + * The `agent-device` CLI's text format isn't a stable contract, so this + * parser is deliberately permissive: anything that doesn't fit the shape + * is dropped (and counted in nodeCount via the header line, not by + * counting parsed children — so we don't quietly hide drift). 
+ */
+function parseNodeLine(line: string): SnapshotNode | null {
+  const refMatch = line.match(/^@(e\d+)\s+\[([a-z-]+)\]/);
+  if (!refMatch) {
+    return null;
+  }
+  const [, refIndex, kind] = refMatch;
+  const after = line.slice(refMatch[0].length).trim();
+
+  let text: string | undefined;
+  // Flags are read only from what FOLLOWS the quoted label. Scanning the
+  // whole remainder would let a label such as "Show [disabled] items"
+  // mark an enabled node as disabled (same for [editable]/[scrollable]).
+  let flagSource = after;
+  const textMatch = after.match(/^"((?:[^"\\]|\\.)*)"/);
+  if (textMatch) {
+    // Strip the trailing comma some labels carry inside the quotes
+    // (e.g. `"Phone or email,"`).
+    // NOTE(review): backslash escapes accepted by the pattern are kept
+    // verbatim in `text` — confirm whether the CLI ever emits them.
+    text = textMatch[1].replace(/,$/, "");
+    flagSource = after.slice(textMatch[0].length);
+  }
+
+  const flags = flagSource.toLowerCase();
+  return {
+    ref: `@${refIndex}`,
+    kind,
+    text,
+    editable: flags.includes("[editable]"),
+    enabled: !flags.includes("[disabled]"),
+    scrollable: flags.includes("[scrollable]"),
+  };
+}
+
+/**
+ * Parse the raw text of a CLI snapshot. `Page:` and `App:` lines fill the
+ * header fields, a `Snapshot: <n>` line supplies `nodeCount`, and every
+ * remaining line is offered to `parseNodeLine`; lines it rejects are
+ * ignored. `nodeCount` is the number reported by the CLI (0 when no such
+ * line is present), not `nodes.length`.
+ */
+export function parseSnapshot(raw: string): Snapshot {
+  const lines = raw.split("\n");
+  const nodes: SnapshotNode[] = [];
+  let page: string | undefined;
+  let app: string | undefined;
+  let nodeCount = 0;
+
+  for (const line of lines) {
+    if (line.startsWith("Page:")) {
+      page = line.slice("Page:".length).trim();
+      continue;
+    }
+    if (line.startsWith("App:")) {
+      app = line.slice("App:".length).trim();
+      continue;
+    }
+    const countMatch = line.match(/^Snapshot:\s*(\d+)/);
+    if (countMatch) {
+      nodeCount = Number(countMatch[1]);
+      continue;
+    }
+    const node = parseNodeLine(line.trim());
+    if (node) {
+      nodes.push(node);
+    }
+  }
+  return { page, app, nodes, nodeCount, raw };
+}
+
+/**
+ * Extract the first whitespace-delimited token after `Foreground app:` and
+ * after `Activity:` from the raw `appstate` output. Either field is
+ * undefined when its label is not present.
+ */
+export function parseAppState(raw: string): AppState {
+  const fg = raw.match(/Foreground app:\s*(\S+)/);
+  const act = raw.match(/Activity:\s*(\S+)/);
+  return { foregroundApp: fg?.[1], activity: act?.[1], raw };
+}
+
+// ---- public surface used by the runner -------------------------------
+
+/** Run `agent-device snapshot -i` for the configured session and parse it. */
+export function snapshot(): Snapshot {
+  return parseSnapshot(run(["snapshot", "-i", "--session", SESSION]));
+}
+
+export function screenshotBase64(path: string): string {
+  run(["screenshot", path, "--session", SESSION]);
+  // The CLI writes to disk; the runner reads + base64-encodes itself
+  // (we keep this wrapper free of fs to keep the signatures simple).
+ return path; +} + +export function appstate(): AppState { + return parseAppState(run(["appstate", "--session", SESSION])); +} + +export function fill(ref: string, text: string): void { + run(["fill", ref, text, "--session", SESSION]); +} + +export function press(ref: string): void { + run(["press", ref, "--session", SESSION]); +} + +export function closeSession(): void { + // Idempotent — if there's no session, this is a no-op. + tryRun(["close", "--session", SESSION]); +} + +export function adbKey(keyEvent: number): void { + // Used by the LLM's `back()` and `dismiss_keyboard()` tools. We + // shell out to adb directly rather than agent-device because the + // CLI doesn't expose a keyevent primitive. + execFileSync("adb", ["shell", "input", "keyevent", String(keyEvent)], { + timeout: CLI_TIMEOUT_MS, + encoding: "utf8", + }); +} + +/** + * Find nodes whose text contains the given substring (case-insensitive). + * Side-effect-free; operates on a snapshot already in memory. + */ +export function findInSnapshot(snap: Snapshot, needle: string): SnapshotNode[] { + const n = needle.toLowerCase(); + return snap.nodes.filter((node) => node.text?.toLowerCase().includes(n)); +} diff --git a/.github/scripts/agent-device-expect.ts b/.github/scripts/agent-device-expect.ts new file mode 100644 index 000000000000..95754e89fe49 --- /dev/null +++ b/.github/scripts/agent-device-expect.ts @@ -0,0 +1,85 @@ +// `expect:` DSL — machine-checked postcondition for each test step. +// +// Why a tiny DSL instead of letting the LLM self-report success: +// `step_complete(rationale)` is an LLM claim, not evidence. A canary +// that trusts an LLM's claim is a canary the LLM can lie to. The +// `expect:` clause is evaluated by deterministic TypeScript code +// against the post-state snapshot/appstate. The step fails red if +// `expect:` fails, regardless of what the LLM said. 
+//
+// Grammar (intentionally small — extend only when a real test step
+// can't be expressed):
+// snapshot.contains_text("...")
+// snapshot.field_with_text("...").exists
+// appstate.foreground == "..."
+//
+// String literal: double-quoted, backslash-escapable. No interpolation,
+// no regex, no boolean ops. If a step needs more, write a second step.
+
+import type { AppState, Snapshot } from "./agent-device-cli";
+
+export type ExpectResult = { ok: true } | { ok: false; reason: string };
+
+const STR = String.raw`"((?:[^"\\]|\\.)*)"`;
+
+/**
+ * Decode the captured body of a STR literal. A backslash makes the next
+ * character literal, so `\"` becomes `"` and `\\` becomes `\`. There are
+ * no C-style escapes: `\n` decodes to the letter `n`.
+ *
+ * Without this step the capture keeps its backslashes and a clause such as
+ * `snapshot.contains_text("say \"hi\"")` could never match on-screen text.
+ */
+function unescapeLiteral(body: string): string {
+  return body.replace(/\\(.)/g, "$1");
+}
+
+// Each entry pairs an anchored regex for one clause shape with the
+// evaluator that checks it. `m[1]` is always the still-escaped body of
+// the clause's string literal.
+const PATTERNS: Array<{
+  re: RegExp;
+  eval: (m: RegExpMatchArray, snap: Snapshot, app: AppState) => ExpectResult;
+}> = [
+  {
+    // Case-insensitive substring match against any node's text.
+    re: new RegExp(`^snapshot\\.contains_text\\(${STR}\\)$`),
+    eval: (m, snap) => {
+      const wanted = unescapeLiteral(m[1]);
+      const needle = wanted.toLowerCase();
+      const hit = snap.nodes.some((n) =>
+        n.text?.toLowerCase().includes(needle),
+      );
+      return hit
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `no node contains text ${JSON.stringify(wanted)} (snapshot has ${snap.nodes.length} nodes)`,
+          };
+    },
+  },
+  {
+    // Same match, restricted to nodes flagged editable.
+    re: new RegExp(`^snapshot\\.field_with_text\\(${STR}\\)\\.exists$`),
+    eval: (m, snap) => {
+      const wanted = unescapeLiteral(m[1]);
+      const needle = wanted.toLowerCase();
+      const hit = snap.nodes.some(
+        (n) => n.editable && n.text?.toLowerCase().includes(needle),
+      );
+      return hit
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `no editable field contains text ${JSON.stringify(wanted)}`,
+          };
+    },
+  },
+  {
+    // Exact, case-sensitive comparison with the parsed foreground app.
+    re: new RegExp(`^appstate\\.foreground\\s*==\\s*${STR}$`),
+    eval: (m, _snap, app) => {
+      const wanted = unescapeLiteral(m[1]);
+      return app.foregroundApp === wanted
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `foreground app is ${app.foregroundApp ??
"(unknown)"}, expected ${m[1]}`, + }; + }, + }, +]; + +export function evaluateExpect( + clause: string, + snap: Snapshot, + app: AppState, +): ExpectResult { + const trimmed = clause.trim(); + for (const p of PATTERNS) { + const m = trimmed.match(p.re); + if (m) { + return p.eval(m, snap, app); + } + } + return { ok: false, reason: `unrecognized expect clause: ${clause}` }; +} diff --git a/.github/scripts/agent-device-llm-client.ts b/.github/scripts/agent-device-llm-client.ts new file mode 100644 index 000000000000..72e2e2816cb1 --- /dev/null +++ b/.github/scripts/agent-device-llm-client.ts @@ -0,0 +1,207 @@ +// Thin client for the Anthropic /v1/messages endpoint. +// +// Decisions baked in: +// - Direct `fetch` instead of `@anthropic-ai/sdk` to avoid a new +// dependency on a CI-only path. Node 20 has fetch built in. +// - Prompt caching (`cache_control: {type: "ephemeral"}`) on the +// system message and the last tool definition. The system + tool +// surface is static across the run, so cache hit rate after step 1 +// is ~100%, cutting per-call cost by 5-10x. The 5-minute TTL fits +// a single CI run with margin. +// - Bounded exponential backoff with jitter for 429/500/502/503/529. +// The runner's caller decides what to do on final failure (typically +// fall back to a deterministic bash-style assertion); this client +// never silently degrades. +// - Token budget kill-switch: total input+output tokens accumulated +// across the run; throw if exceeded. Bounds runaway spend if a +// prompt or tool design accidentally explodes context. 
+ +export type AnthropicTool = { + name: string; + description: string; + input_schema: Record; + cache_control?: { type: "ephemeral" }; +}; + +export type AnthropicMessage = { + role: "user" | "assistant"; + content: Array< + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: "image/png"; data: string }; + } + | { + type: "tool_use"; + id: string; + name: string; + input: Record; + } + | { + type: "tool_result"; + tool_use_id: string; + content: string; + is_error?: boolean; + } + >; +}; + +export type AnthropicResponse = { + id: string; + stop_reason: + | "end_turn" + | "tool_use" + | "max_tokens" + | "stop_sequence" + | string; + content: Array< + | { type: "text"; text: string } + | { + type: "tool_use"; + id: string; + name: string; + input: Record; + } + >; + usage: { + input_tokens: number; + output_tokens: number; + cache_read_input_tokens?: number; + cache_creation_input_tokens?: number; + }; +}; + +export type ClientOptions = { + apiKey: string; + model: string; + tokenBudget: number; + /** Prefix written to artifacts/llm-trace.jsonl for post-mortem. 
*/ + traceWriter?: (entry: Record) => void; +}; + +const ANTHROPIC_VERSION = "2023-06-01"; +const RETRY_DELAYS_MS = [1_000, 3_000, 9_000]; +const RETRYABLE_STATUS = new Set([429, 500, 502, 503, 529]); + +export class TokenBudgetExceededError extends Error { + constructor(used: number, budget: number) { + super(`token budget exceeded: ${used} > ${budget}`); + } +} + +export class AnthropicCallFailedError extends Error { + constructor( + public readonly status: number, + public readonly body: string, + ) { + super(`Anthropic API failed with status ${status}: ${body.slice(0, 200)}`); + } +} + +export class AnthropicClient { + private tokensUsed = 0; + + constructor(private readonly opts: ClientOptions) {} + + getTokensUsed(): number { + return this.tokensUsed; + } + + async call(args: { + system: string; + tools: AnthropicTool[]; + messages: AnthropicMessage[]; + maxTokens?: number; + }): Promise { + // Mark system + last tool as cacheable. Anthropic caches the + // contiguous prefix UP TO each `cache_control` marker, so two + // markers means "cache through end of system" and "cache + // through end of tools" as separate cached prefixes. + const cachedTools = args.tools.map((t, i) => + i === args.tools.length - 1 + ? { ...t, cache_control: { type: "ephemeral" as const } } + : t, + ); + + const body = { + model: this.opts.model, + max_tokens: args.maxTokens ?? 
1024, + temperature: 0, + system: [ + { + type: "text", + text: args.system, + cache_control: { type: "ephemeral" }, + }, + ], + tools: cachedTools, + messages: args.messages, + }; + + let lastError: Error | undefined; + for (let attempt = 0; attempt <= RETRY_DELAYS_MS.length; attempt++) { + try { + const response = await this.callOnce(body); + this.accountForUsage(response.usage); + this.opts.traceWriter?.({ + type: "response", + attempt, + stop_reason: response.stop_reason, + usage: response.usage, + }); + return response; + } catch (e) { + lastError = e as Error; + if (e instanceof TokenBudgetExceededError) { + throw e; + } + const retryable = + e instanceof AnthropicCallFailedError && + RETRYABLE_STATUS.has(e.status); + if (!retryable || attempt >= RETRY_DELAYS_MS.length) { + throw e; + } + const base = RETRY_DELAYS_MS[attempt]; + const jitter = base * 0.3 * (Math.random() * 2 - 1); + const wait = Math.max(0, Math.round(base + jitter)); + this.opts.traceWriter?.({ + type: "retry", + attempt, + status: (e as AnthropicCallFailedError).status, + waitMs: wait, + }); + await new Promise((r) => setTimeout(r, wait)); + } + } + throw lastError ?? new Error("unreachable"); + } + + private async callOnce(body: object): Promise { + const res = await fetch("https://api.anthropic.com/v1/messages", { + method: "POST", + headers: { + "content-type": "application/json", + "x-api-key": this.opts.apiKey, + "anthropic-version": ANTHROPIC_VERSION, + }, + body: JSON.stringify(body), + }); + if (!res.ok) { + throw new AnthropicCallFailedError(res.status, await res.text()); + } + return (await res.json()) as AnthropicResponse; + } + + private accountForUsage(usage: AnthropicResponse["usage"]): void { + // Cache reads cost roughly 10% of normal input tokens, but for + // budget-protection purposes we count them at face value — + // budgets are about runaway prompt design, not pricing. 
+    // The API reports prompt tokens written to or read from the prompt
+    // cache in their own usage fields; `input_tokens` covers only the
+    // uncached remainder. With caching enabled on system + tools, summing
+    // just input + output would leave most of each request uncounted and
+    // the kill-switch would trip far too late.
+    this.tokensUsed +=
+      usage.input_tokens +
+      (usage.cache_creation_input_tokens ?? 0) +
+      (usage.cache_read_input_tokens ?? 0) +
+      usage.output_tokens;
+    if (this.tokensUsed > this.opts.tokenBudget) {
+      throw new TokenBudgetExceededError(
+        this.tokensUsed,
+        this.opts.tokenBudget,
+      );
+    }
+  }
+}
diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts
new file mode 100644
index 000000000000..84e2ce7ab1fe
--- /dev/null
+++ b/.github/scripts/agent-device-llm-driver.ts
@@ -0,0 +1,1240 @@
+// Phase-1 LLM-driven Android smoke runner.
+//
+// Lifecycle inside the workflow's emulator-runner `script:` block:
+//
+// 1. Boot dance (deterministic, NOT LLM-driven):
+// - close any stale agent-device session
+// - locate dev APK from android/app/build/outputs/...
+// - adb install
+// - adb reverse tcp:8081 tcp:8081 (Metro reachable from emulator)
+// - npm start & (Metro background)
+// - poll /status until packager-status:running
+// - agent-device open --relaunch (cold start)
+//
+// 2. Test-case execution:
+// - parse test case (numbered steps + optional `expect:` lines)
+// - per step: cache-first / LLM-fallback / bash-fallback ladder
+// - assert post-state via `expect:` evaluator
+// - write artifacts (screenshots, snapshots, llm-trace, cache-diff)
+//
+// 3. Cleanup (always — even on signal/error):
+// - dump logcat once
+// - close agent-device session (so re-runs aren't tripped by the
+// "session already bound" guard)
+// - kill background jobs (Metro)
+//
+// Why a TS runner instead of Python or Bash:
+// - The repo already runs ts-node in CI (precedent: createDocsRoutes.ts).
+// - Reusing the snapshot parser + signature + expect DSL across
+// replay / LLM / bash paths means one source of truth for what
+// "the SignIn screen is on screen" means — a divergence between
+// "what bash sees" and "what the LLM sees" would be a class of
+// bugs we don't want.
+
+import { execFileSync, spawn } from "child_process";
+import fs from "fs";
+import path from "path";
+import * as adCli from "./agent-device-cli";
+import type { Snapshot, AppState } from "./agent-device-cli";
+import {
+  snapshotSignature,
+  refToLocator,
+  locatorToRef,
+} from "./agent-device-snapshot-signature";
+import { evaluateExpect } from "./agent-device-expect";
+import * as cache from "./agent-device-replay-cache";
+import type { CachedAction, CacheV1 } from "./agent-device-replay-cache";
+import {
+  AnthropicClient,
+  TokenBudgetExceededError,
+  AnthropicCallFailedError,
+} from "./agent-device-llm-client";
+import type {
+  AnthropicTool,
+  AnthropicMessage,
+} from "./agent-device-llm-client";
+
+// ---- config -----------------------------------------------------------
+
+/**
+ * Read the token budget from its environment value.
+ *
+ * An unset or blank value yields `fallback`. `??` alone does not cover the
+ * blank case: an empty string converts to 0, so the first LLM response
+ * would already exceed the budget. Any other value must convert to a
+ * number greater than zero. NaN is rejected because `used > NaN` is
+ * always false, which would switch the budget check off without notice.
+ *
+ * @param raw the environment value, if any
+ * @param fallback budget used when `raw` is unset or blank
+ * @returns the budget in tokens
+ * @throws Error when `raw` is set but is not a positive number
+ */
+function parseTokenBudget(raw: string | undefined, fallback = 200_000): number {
+  if (raw === undefined || raw.trim() === "") {
+    return fallback;
+  }
+  const budget = Number(raw);
+  if (Number.isNaN(budget) || budget <= 0) {
+    throw new Error(
+      `LLM_TOKEN_BUDGET must be a positive number, got ${JSON.stringify(raw)}`,
+    );
+  }
+  return budget;
+}
+
+const MODEL = process.env.ANTHROPIC_MODEL ?? "claude-sonnet-4-6";
+const TOKEN_BUDGET = parseTokenBudget(process.env.LLM_TOKEN_BUDGET);
+const APP_PACKAGE = process.env.APP_PACKAGE ?? "com.expensify.chat.dev";
+const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci";
+const ARTIFACTS_DIR = process.env.ARTIFACTS_DIR ?? "artifacts";
+const TEST_CASE_PATH =
+  process.argv[2] ?? "tests/smoke/android-signin.testcase.txt";
+const CACHE_PATH =
+  process.env.LLM_CACHE_PATH ??
deriveCachePath(TEST_CASE_PATH); +const APK_GLOB = "android/app/build/outputs/apk/development/debug"; +const METRO_READY_TIMEOUT_MS = 120_000; +const SIGNIN_LOAD_TIMEOUT_MS = 360_000; +const STEP_WALL_CLOCK_BUDGET_MS = 60_000; +const MAX_STATE_CHANGING_ACTIONS = 4; +const SCREENSHOT_BUDGET_PER_RUN = 2; +const TEXT_LENGTH_CAP = 200; + +// ---- types ------------------------------------------------------------ + +type Step = { + number: number; + text: string; + expect: string | null; + raw: string; +}; + +type ToolResultBlock = { + type: "tool_result"; + tool_use_id: string; + content: string; + is_error?: boolean; +}; +type ContentBlock = + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: "image/png"; data: string }; + } + | ToolResultBlock; + +type ExecutedAction = CachedAction & { ref?: string }; + +// ---- entry point ------------------------------------------------------ + +async function main(): Promise { + fs.mkdirSync(ARTIFACTS_DIR, { recursive: true }); + registerCleanup(); + + log( + `runner=${MODEL} test_case=${TEST_CASE_PATH} cache=${CACHE_PATH} budget=${TOKEN_BUDGET}`, + ); + + const testCaseRaw = fs.readFileSync(TEST_CASE_PATH, "utf8"); + const testCaseHash = cache.hashText(testCaseRaw); + const steps = parseTestCase(testCaseRaw); + if (!steps.length) { + fail("test case has no steps"); + } + + const committed = cache.loadCache(CACHE_PATH, MODEL, testCaseHash); + const recorded: CacheV1 = { + version: 1, + model: MODEL, + testCaseHash, + steps: [], + }; + + await bootApp(); + + const apiKey = process.env.ANTHROPIC_API_KEY; + const llm = apiKey + ? 
new AnthropicClient({ + apiKey, + model: MODEL, + tokenBudget: TOKEN_BUDGET, + traceWriter: (e) => + fs.appendFileSync( + path.join(ARTIFACTS_DIR, "llm-trace.jsonl"), + `${JSON.stringify(e)}\n`, + ), + }) + : null; + if (!llm) { + log( + "::warning::ANTHROPIC_API_KEY missing — every step will use bash fallback", + ); + } + + let cacheHits = 0; + let llmRuns = 0; + let bashRuns = 0; + + for (const step of steps) { + const result = await executeStep(step, { + committed, + testCaseHash, + llm, + recorded, + stats: { + onCacheHit: () => cacheHits++, + onLLMRun: () => llmRuns++, + onBashRun: () => bashRuns++, + }, + }); + if (!result.ok) { + fail(`step ${step.number} failed: ${result.reason}`); + } + } + + // Always write the recorded cache diff, even if it's identical. + // Reviewers want to see a clean (no-op) diff to know the canary + // ran end-to-end without UI drift. + const diffText = cache.diff(committed, recorded); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "cache-diff.txt"), + `${diffText || "(no drift — cache up to date)"}\n`, + ); + cache.writeCache(path.join(ARTIFACTS_DIR, "cache-recorded.json"), recorded); + + log( + `::notice::smoke OK — cache_hits=${cacheHits} llm_runs=${llmRuns} bash_runs=${bashRuns} tokens=${llm?.getTokensUsed() ?? 
0}`, + ); + + if (diffText) { + log( + "::warning::cache drift detected — copy artifacts/cache-recorded.json to tests/smoke/cache/.json and commit", + ); + } +} + +// ---- test case parser ------------------------------------------------- + +function parseTestCase(raw: string): Step[] { + const steps: Step[] = []; + let cur: Step | null = null; + for (const lineRaw of raw.split("\n")) { + const line = lineRaw.trimEnd(); + if (!line.trim() || line.trim().startsWith("#")) { + continue; + } + const m = line.match(/^(\d+)\.\s+(.*)$/); + if (m) { + if (cur) { + steps.push(cur); + } + cur = { number: Number(m[1]), text: m[2], expect: null, raw: line }; + continue; + } + const ex = line.match(/^\s*expect:\s*(.+)$/); + if (ex && cur) { + cur.expect = ex[1]; + cur.raw += `\n${line}`; + } + } + if (cur) { + steps.push(cur); + } + return steps; +} + +// ---- boot dance (matches Phase 0's bash) ------------------------------ + +async function bootApp(): Promise { + log("boot: closing stale session"); + adCli.closeSession(); + + log("boot: locating APK"); + const apkDir = APK_GLOB; + const files = fs.existsSync(apkDir) + ? 
fs.readdirSync(apkDir).filter((f) => f.endsWith(".apk")) + : []; + if (!files.length) { + fail(`no APK found under ${apkDir} — Rock build step likely failed`); + } + const apk = path.join(apkDir, files[0]); + log(`boot: installing ${apk}`); + execFileSync("adb", ["install", "-r", "-d", "-t", apk], { stdio: "inherit" }); + + log("boot: adb reverse 8081"); + execFileSync("adb", ["reverse", "tcp:8081", "tcp:8081"], { + stdio: "inherit", + }); + + log("boot: starting Metro"); + const metroLog = fs.openSync(path.join(ARTIFACTS_DIR, "metro.log"), "a"); + const metro = spawn("npm", ["start"], { + stdio: ["ignore", metroLog, metroLog], + detached: true, + }); + metro.unref(); + backgroundPids.push(metro.pid!); + + await waitForMetro(); + + log("boot: agent-device open --relaunch"); + const serial = execFileSync("adb", ["get-serialno"], { + encoding: "utf8", + }).trim(); + execFileSync( + "agent-device", + [ + "open", + APP_PACKAGE, + "--platform", + "android", + "--serial", + serial, + "--session", + SESSION, + "--relaunch", + ], + { + stdio: "inherit", + }, + ); + + // Bounded wait for the SignIn UI to hydrate. The LLM can technically + // poll for it itself in step 1, but on slow runners (~290s observed) + // that would burn LLM budget on what's effectively boot-blocking + // emulator wait time. Better to gate the LLM on a known-ready UI. 
+ log("boot: waiting for SignIn UI"); + const start = Date.now(); + while (Date.now() - start < SIGNIN_LOAD_TIMEOUT_MS) { + const snap = adCli.snapshot(); + if ( + snap.nodes.some((n) => n.text?.toLowerCase().includes("phone or email")) + ) { + log( + `boot: SignIn ready after ${Math.round((Date.now() - start) / 1000)}s`, + ); + return; + } + await sleep(6_000); + } + fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`); +} + +async function waitForMetro(): Promise { + const start = Date.now(); + while (Date.now() - start < METRO_READY_TIMEOUT_MS) { + try { + const out = execFileSync( + "curl", + ["-sf", "http://localhost:8081/status"], + { encoding: "utf8" }, + ); + if (out.includes("packager-status:running")) { + log( + `boot: Metro ready after ${Math.round((Date.now() - start) / 1000)}s`, + ); + return; + } + } catch { + // Metro not up yet + } + await sleep(2_000); + } + fail( + `Metro did not reach packager-status:running within ${METRO_READY_TIMEOUT_MS / 1000}s`, + ); +} + +// ---- per-step orchestration ------------------------------------------- + +type StepCtx = { + committed: CacheV1; + testCaseHash: string; + llm: AnthropicClient | null; + recorded: CacheV1; + stats: { + onCacheHit: () => void; + onLLMRun: () => void; + onBashRun: () => void; + }; +}; + +async function executeStep( + step: Step, + ctx: StepCtx, +): Promise<{ ok: true } | { ok: false; reason: string }> { + const preSnap = adCli.snapshot(); + const preSig = snapshotSignature(preSnap); + const stepKey = `step ${step.number}`; + log(`::group::${stepKey} — ${step.text}`); + + fs.writeFileSync( + path.join(ARTIFACTS_DIR, `step-${step.number}-pre.txt`), + preSnap.raw, + ); + + const cached = cache.lookup(ctx.committed, step.number, preSig); + if (cached) { + log(`${stepKey}: cache hit (pre_sig=${preSig})`); + const replay = await replayCachedActions(cached.actions); + if (replay.ok) { + const post = await verifyPostState(step, cached.postSignature); + if (post.ok) { + 
+        ctx.stats.onCacheHit();
+        ctx.recorded.steps.push(cached);
+        fs.writeFileSync(
+          path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`),
+          post.snap.raw,
+        );
+        log(`::endgroup::`);
+        return { ok: true };
+      }
+      log(`${stepKey}: cache drift — ${post.reason}; falling through to LLM`);
+    } else {
+      log(
+        `${stepKey}: replay failed — ${replay.reason}; falling through to LLM`,
+      );
+    }
+  }
+
+  let actions: ExecutedAction[] = [];
+  // Tracks whether the LLM tier finished the step. An empty `actions`
+  // list is not a failure signal: a step the LLM completed without
+  // recording any action must not also run the bash recipe, or the step
+  // would be driven twice and counted as both an LLM run and a bash run.
+  let llmSucceeded = false;
+  if (ctx.llm) {
+    try {
+      const llmResult = await runLLMStep(step, ctx.llm);
+      if (!llmResult.ok) {
+        log(
+          `${stepKey}: LLM gave up — ${llmResult.reason}; trying bash fallback`,
+        );
+      } else {
+        ctx.stats.onLLMRun();
+        actions = llmResult.actions;
+        llmSucceeded = true;
+      }
+    } catch (e) {
+      if (e instanceof TokenBudgetExceededError) {
+        // Budget exhaustion fails the step outright (no bash fallback).
+        // Close the log group opened at the top of the step, as every
+        // other return path does.
+        log(`::endgroup::`);
+        return { ok: false, reason: e.message };
+      }
+      log(
+        `${stepKey}: LLM call failed (${(e as Error).message}); trying bash fallback`,
+      );
+    }
+  }
+
+  // Final tier: runs only when there is no LLM client, the LLM gave up,
+  // or the LLM call failed.
+  if (!llmSucceeded) {
+    const bashResult = await runBashFallback(step);
+    if (!bashResult.ok) {
+      log(`::endgroup::`);
+      return { ok: false, reason: bashResult.reason };
+    }
+    ctx.stats.onBashRun();
+    actions = bashResult.actions;
+  }
+
+  // Whichever tier ran, the step passes only if the post-state check
+  // does. No recorded signature is compared here (null): this run is the
+  // one producing the new recording.
+  const post = await verifyPostState(step, null);
+  if (!post.ok) {
+    log(`::endgroup::`);
+    return { ok: false, reason: post.reason };
+  }
+  fs.writeFileSync(
+    path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`),
+    post.snap.raw,
+  );
+
+  ctx.recorded.steps.push({
+    stepNumber: step.number,
+    stepTextHash: cache.hashText(step.text),
+    preSignature: preSig,
+    postSignature: snapshotSignature(post.snap),
+    actions: actions.map(stripExecutedRef),
+    expect: step.expect,
+    recordedAt: new Date().toISOString(),
+    runId: process.env.GITHUB_RUN_ID ??
"local", + }); + log(`::endgroup::`); + return { ok: true }; +} + +function stripExecutedRef(a: ExecutedAction): CachedAction { + const { ref, ...rest } = a as ExecutedAction & { ref?: string }; + return rest; +} + +async function verifyPostState( + step: Step, + expectedSignature: string | null, +): Promise<{ ok: true; snap: Snapshot } | { ok: false; reason: string }> { + const snap = adCli.snapshot(); + const app = adCli.appstate(); + if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { + return { + ok: false, + reason: `post-state signature drift (recorded ${expectedSignature}, observed ${snapshotSignature(snap)})`, + }; + } + if (step.expect) { + const ev = evaluateExpect(step.expect, snap, app); + if (!ev.ok) { + return { ok: false, reason: `expect failed: ${ev.reason}` }; + } + } + return { ok: true, snap }; +} + +// ---- cache replay ----------------------------------------------------- + +async function replayCachedActions( + actions: CachedAction[], +): Promise<{ ok: true } | { ok: false; reason: string }> { + for (const action of actions) { + const ok = await dispatchCachedAction(action); + if (!ok.ok) { + return ok; + } + // Tiny settle gap — even on warm runners, fill→press in + // immediate succession occasionally lands the press before + // React has propagated the fill. 
+ await sleep(150); + } + return { ok: true }; +} + +async function dispatchCachedAction( + action: CachedAction, +): Promise<{ ok: true } | { ok: false; reason: string }> { + if (action.tool === "wait") { + await sleep(action.ms); + return { ok: true }; + } + if (action.tool === "wait_for") { + return await runWaitFor(action.predicate, action.timeoutMs); + } + if (action.tool === "back") { + adCli.adbKey(4); + return { ok: true }; + } + if (action.tool === "dismiss_keyboard") { + adCli.adbKey(111); + return { ok: true }; + } + const snap = adCli.snapshot(); + const ref = locatorToRef(snap, action.locator); + if (!ref) { + return { + ok: false, + reason: `cached locator did not resolve: ${JSON.stringify(action.locator)}`, + }; + } + if (action.tool === "fill") { + adCli.fill(ref, action.text); + return { ok: true }; + } + if (action.tool === "press") { + adCli.press(ref); + return { ok: true }; + } + return { + ok: false, + reason: `unknown cached tool: ${(action as { tool: string }).tool}`, + }; +} + +async function runWaitFor( + predicate: string, + timeoutMs: number, +): Promise<{ ok: true } | { ok: false; reason: string }> { + const start = Date.now(); + while (Date.now() - start < timeoutMs) { + const snap = adCli.snapshot(); + const app = adCli.appstate(); + const ev = evaluateExpect(predicate, snap, app); + if (ev.ok) { + return { ok: true }; + } + await sleep(250); + } + return { + ok: false, + reason: `wait_for timed out after ${timeoutMs}ms (predicate: ${predicate})`, + }; +} + +// ---- LLM step --------------------------------------------------------- + +const SYSTEM_PROMPT = [ + "You are an autonomous mobile UI test runner driving the Expensify Android app via the agent-device CLI.", + "You receive: the current step description in plain English, an accessibility snapshot of the live UI, and a history of your tool calls within this step.", + "", + "Snapshot format: a JSON array of `{ref, kind, text, editable, enabled, scrollable}` nodes. 
Each ref is a stable handle for that node within this snapshot only — re-snapshot before reusing refs from a prior turn.", + "", + "Rules:", + "- Never invent a ref. Always pick refs from the most recent snapshot's `nodes` array.", + "- After any state-changing action (fill, press, back, dismiss_keyboard, wait), call snapshot to refresh before asserting.", + "- Use `assert` to prove a step succeeded — `step_complete` without an `assert` first is suspicious.", + "- Prefer `wait_for(predicate)` over `wait(ms)`. The bare wait is a last resort; the runner logs a warning each time it is used.", + "- Treat label text as advisory; it may be localized. Match by intent and element kind.", + "- If after 2-3 unique attempts you cannot make progress, call `step_failed` with a precise reason.", +].join("\n"); + +const TOOLS: AnthropicTool[] = [ + { + name: "snapshot", + description: + "Capture a fresh accessibility tree. Returns {nodes: [...], node_count: number}. Call this after any state-changing action and before using a ref from a previous turn.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "screenshot", + description: + "Capture a PNG screenshot. Rate-limited to 2 calls per run; the runner may auto-attach a screenshot when a snapshot returns 0 nodes. Use this only when the snapshot is genuinely empty or when you've addressed phantom refs twice.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "find", + description: + "Search the most recent snapshot for nodes whose `text` contains the given substring (case-insensitive). 
Side-effect-free.", + input_schema: { + type: "object", + properties: { needle: { type: "string" } }, + required: ["needle"], + additionalProperties: false, + }, + }, + { + name: "fill", + description: "Type text into the editable text-field at the given ref.", + input_schema: { + type: "object", + properties: { ref: { type: "string" }, text: { type: "string" } }, + required: ["ref", "text"], + additionalProperties: false, + }, + }, + { + name: "press", + description: "Tap the pressable element at the given ref.", + input_schema: { + type: "object", + properties: { ref: { type: "string" } }, + required: ["ref"], + additionalProperties: false, + }, + }, + { + name: "wait_for", + description: + 'Poll snapshots until `predicate` is satisfied or `timeout_ms` elapses. Predicates: snapshot.contains_text("..."), snapshot.field_with_text("...").exists, appstate.foreground == "...".', + input_schema: { + type: "object", + properties: { + predicate: { type: "string" }, + timeout_ms: { type: "integer", maximum: 10_000 }, + }, + required: ["predicate"], + additionalProperties: false, + }, + }, + { + name: "wait", + description: + "Sleep for the given number of milliseconds (max 2000). Last resort — prefer wait_for. The runner logs a warning each call.", + input_schema: { + type: "object", + properties: { ms: { type: "integer", minimum: 1, maximum: 2_000 } }, + required: ["ms"], + additionalProperties: false, + }, + }, + { + name: "back", + description: + "Press Android back. Use to recover from an unintended screen.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "dismiss_keyboard", + description: "Dismiss the soft keyboard.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "assert", + description: + "Verify a postcondition. Returns {ok: bool, reason?: string}. 
Predicates as in wait_for.", + input_schema: { + type: "object", + properties: { predicate: { type: "string" } }, + required: ["predicate"], + additionalProperties: false, + }, + }, + { + name: "appstate", + description: "Return {foreground_app, activity}.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "step_complete", + description: + "Mark the current step as passed. Must include a brief rationale describing what was observed (mention the assert you ran).", + input_schema: { + type: "object", + properties: { rationale: { type: "string" } }, + required: ["rationale"], + additionalProperties: false, + }, + }, + { + name: "step_failed", + description: + "Mark the current step as failed. Use when 2-3 unique attempts have not produced progress, or when the screen does not match what the step expects.", + input_schema: { + type: "object", + properties: { reason: { type: "string" } }, + required: ["reason"], + additionalProperties: false, + }, + }, +]; + +async function runLLMStep( + step: Step, + llm: AnthropicClient, +): Promise< + { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string } +> { + const startedAt = Date.now(); + let snap = adCli.snapshot(); + let app = adCli.appstate(); + let stateChanging = 0; + let phantomStreak = 0; + let attachScreenshotNext = false; + let screenshotsUsed = 0; + const seen = new Set(); + const messages: AnthropicMessage[] = []; + const executed: ExecutedAction[] = []; + + while ( + Date.now() - startedAt < STEP_WALL_CLOCK_BUDGET_MS && + stateChanging <= MAX_STATE_CHANGING_ACTIONS + ) { + if (snap.nodeCount === 0 && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) { + attachScreenshotNext = true; + } + + const userBlocks: ContentBlock[] = []; + if (attachScreenshotNext && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) { + const png = takeScreenshot( + `step-${step.number}-shot-${screenshotsUsed}.png`, + ); + screenshotsUsed++; + attachScreenshotNext = false; + 
userBlocks.push({ + type: "image", + source: { type: "base64", media_type: "image/png", data: png }, + }); + } + userBlocks.push({ + type: "text", + text: buildUserText(step, snap, app, executed), + }); + messages.push({ role: "user", content: userBlocks }); + + const response = await llm.call({ + system: SYSTEM_PROMPT, + tools: TOOLS, + messages, + }); + const assistantContent = response.content as AnthropicMessage["content"]; + messages.push({ role: "assistant", content: assistantContent }); + + const toolUses = assistantContent.filter( + ( + b, + ): b is Extract< + (typeof assistantContent)[number], + { type: "tool_use" } + > => b.type === "tool_use", + ); + if (!toolUses.length) { + return { ok: false, reason: "LLM returned no tool calls" }; + } + + const toolResults: ToolResultBlock[] = []; + for (const tu of toolUses) { + const sigKey = `${tu.name}:${JSON.stringify(tu.input)}:${snapshotSignature(snap)}`; + if (seen.has(sigKey)) { + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: + "You already performed this exact action against this exact UI state and it produced no observable change. Try a different approach or call step_failed.", + is_error: true, + }); + continue; + } + seen.add(sigKey); + + try { + const out = await dispatchTool(tu.name, tu.input, { + snap, + app, + onSnap: (s) => { + snap = s; + }, + onApp: (a) => { + app = a; + }, + executed, + stepNumber: step.number, + onPhantom: () => { + phantomStreak++; + if ( + phantomStreak >= 2 && + screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN + ) { + attachScreenshotNext = true; + } + }, + resetPhantom: () => { + phantomStreak = 0; + }, + }); + if (isStateChangingTool(tu.name)) { + stateChanging++; + } + if (out.terminal === "complete") { + return { ok: true, actions: executed }; + } + if (out.terminal === "failed") { + return { + ok: false, + reason: out.reason ?? 
"step_failed without reason", + }; + } + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: out.content, + is_error: out.isError, + }); + } catch (e) { + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: `tool error: ${(e as Error).message}`, + is_error: true, + }); + } + } + + messages.push({ role: "user", content: toolResults }); + } + + return { + ok: false, + reason: "wall-clock or distinct-action budget exhausted", + }; +} + +function isStateChangingTool(name: string): boolean { + return [ + "fill", + "press", + "back", + "dismiss_keyboard", + "wait", + "wait_for", + ].includes(name); +} + +function buildUserText( + step: Step, + snap: Snapshot, + app: AppState, + history: ExecutedAction[], +): string { + const lines: string[] = []; + lines.push(`Current step: ${step.number}. ${step.text}`); + if (step.expect) { + lines.push( + `Postcondition the runner will check (NOT for you to call directly): ${step.expect}`, + ); + } + if (history.length) { + const tail = history.slice(-3).map((h) => describeExecutedAction(h)); + lines.push(`Recent actions you took: ${tail.join("; ")}`); + } + lines.push( + `appstate.foreground=${app.foregroundApp ?? "(unknown)"} activity=${app.activity ?? "(unknown)"}`, + ); + lines.push(`snapshot.node_count=${snap.nodeCount}`); + lines.push("snapshot.nodes:"); + lines.push(JSON.stringify(snap.nodes.map(scrubNodeForPrompt), null, 0)); + return lines.join("\n"); +} + +function scrubNodeForPrompt( + n: Snapshot["nodes"][number], +): Record { + const text = n.text + ? 
sanitizeText(n.text).slice(0, TEXT_LENGTH_CAP) + : undefined; + return { + ref: n.ref, + kind: n.kind, + text, + editable: n.editable, + enabled: n.enabled, + scrollable: n.scrollable, + }; +} + +function sanitizeText(s: string): string { + let out = ""; + for (const ch of s) { + const c = ch.charCodeAt(0); + if (c >= 0x20 || c === 0x09 || c === 0x0a) { + out += ch; + } + } + return out; +} + +function describeExecutedAction(a: ExecutedAction): string { + if (a.tool === "fill") { + return `fill(${JSON.stringify(a.locator)}, "${a.text.slice(0, 30)}…")`; + } + if (a.tool === "press") { + return `press(${JSON.stringify(a.locator)})`; + } + if (a.tool === "wait_for") { + return `wait_for(${a.predicate}, ${a.timeoutMs}ms)`; + } + if (a.tool === "wait") { + return `wait(${a.ms}ms)`; + } + return a.tool; +} + +// ---- LLM tool dispatch ------------------------------------------------ + +type DispatchCtx = { + snap: Snapshot; + app: AppState; + onSnap: (s: Snapshot) => void; + onApp: (a: AppState) => void; + executed: ExecutedAction[]; + stepNumber: number; + onPhantom: () => void; + resetPhantom: () => void; +}; + +type DispatchResult = { + content: string; + isError?: boolean; + terminal?: "complete" | "failed"; + reason?: string; +}; + +async function dispatchTool( + name: string, + input: Record, + ctx: DispatchCtx, +): Promise { + switch (name) { + case "snapshot": { + const s = adCli.snapshot(); + ctx.onSnap(s); + return { + content: JSON.stringify({ + node_count: s.nodeCount, + nodes: s.nodes.map(scrubNodeForPrompt), + }), + }; + } + case "screenshot": { + const file = `step-${ctx.stepNumber}-llm-shot.png`; + const data = takeScreenshot(file); + return { + content: `screenshot saved at ${file} (${data.length} bytes base64). Re-snapshot to keep working with refs.`, + }; + } + case "find": { + const needle = String(input.needle ?? 
""); + const matches = adCli.findInSnapshot(ctx.snap, needle).map((n) => ({ + ref: n.ref, + kind: n.kind, + text: n.text, + editable: n.editable, + })); + return { content: JSON.stringify({ matches, count: matches.length }) }; + } + case "fill": { + const ref = String(input.ref ?? ""); + const text = String(input.text ?? ""); + const node = ctx.snap.nodes.find((n) => n.ref === ref); + if (!node) { + ctx.onPhantom(); + return { + content: `phantom ref ${ref} not in current snapshot`, + isError: true, + }; + } + ctx.resetPhantom(); + adCli.fill(ref, text); + const loc = refToLocator(ctx.snap, ref); + if (loc) { + ctx.executed.push({ tool: "fill", locator: loc, text, ref }); + } + return { content: `filled ${ref}` }; + } + case "press": { + const ref = String(input.ref ?? ""); + const node = ctx.snap.nodes.find((n) => n.ref === ref); + if (!node) { + ctx.onPhantom(); + return { + content: `phantom ref ${ref} not in current snapshot`, + isError: true, + }; + } + ctx.resetPhantom(); + adCli.press(ref); + const loc = refToLocator(ctx.snap, ref); + if (loc) { + ctx.executed.push({ tool: "press", locator: loc, ref }); + } + return { content: `pressed ${ref}` }; + } + case "wait": { + const ms = Math.min(2_000, Math.max(1, Number(input.ms ?? 0))); + log(`::warning::LLM used wait(${ms}) — prefer wait_for`); + await sleep(ms); + ctx.executed.push({ tool: "wait", ms }); + return { content: `slept ${ms}ms` }; + } + case "wait_for": { + const predicate = String(input.predicate ?? ""); + const timeoutMs = Math.min( + 10_000, + Math.max(250, Number(input.timeout_ms ?? 5_000)), + ); + const r = await runWaitFor(predicate, timeoutMs); + ctx.executed.push({ tool: "wait_for", predicate, timeoutMs }); + ctx.onSnap(adCli.snapshot()); + ctx.onApp(adCli.appstate()); + return { + content: r.ok + ? 
"predicate satisfied" + : `wait_for timed out: ${r.reason}`, + isError: !r.ok, + }; + } + case "back": + adCli.adbKey(4); + ctx.executed.push({ tool: "back" }); + ctx.onSnap(adCli.snapshot()); + return { content: "back pressed" }; + case "dismiss_keyboard": + adCli.adbKey(111); + ctx.executed.push({ tool: "dismiss_keyboard" }); + ctx.onSnap(adCli.snapshot()); + return { content: "keyboard dismissed" }; + case "assert": { + const predicate = String(input.predicate ?? ""); + const ev = evaluateExpect(predicate, ctx.snap, ctx.app); + return { content: JSON.stringify(ev), isError: !ev.ok }; + } + case "appstate": { + const a = adCli.appstate(); + ctx.onApp(a); + return { content: JSON.stringify(a) }; + } + case "step_complete": + return { content: "step accepted by runner", terminal: "complete" }; + case "step_failed": + return { + content: "step rejected by LLM", + terminal: "failed", + reason: String(input.reason ?? "no reason given"), + }; + default: + return { content: `unknown tool: ${name}`, isError: true }; + } +} + +function takeScreenshot(filename: string): string { + const p = path.join(ARTIFACTS_DIR, filename); + adCli.screenshotBase64(p); + return fs.readFileSync(p).toString("base64"); +} + +// ---- bash fallback ---------------------------------------------------- + +// Mirrors Phase 0's bash logic for the SignIn flow. Used when: +// - ANTHROPIC_API_KEY is missing +// - The Anthropic API exhausts retries with HTTP errors +// - The LLM gives up via step_failed (rare; mostly defensive) +// +// Only the SignIn-flow steps are covered. Adding a new test case +// without LLM access requires extending this map. That's intentional: +// the bash fallback is a safety net for known flows, not a generic +// drop-in for the LLM. 
+ +async function runBashFallback( + step: Step, +): Promise< + { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string } +> { + const text = step.text.toLowerCase(); + + if (text.includes("wait") && text.includes("signin")) { + // Boot dance already gated on this; an instant pass is fine. + return { ok: true, actions: [] }; + } + + if (text.includes("enter") && text.includes("email")) { + const m = step.text.match(/"([^"]+)"/); + if (!m) { + return { + ok: false, + reason: "bash fallback could not extract email from step text", + }; + } + const snap = adCli.snapshot(); + const field = snap.nodes.find( + (n) => + n.editable && + (n.kind === "text-field" || + (n.text?.toLowerCase().includes("phone") ?? false)), + ); + if (!field) { + return { + ok: false, + reason: "bash fallback: no editable text-field for email entry", + }; + } + adCli.fill(field.ref, m[1]); + const loc = refToLocator(snap, field.ref); + return { + ok: true, + actions: loc ? [{ tool: "fill", locator: loc, text: m[1] }] : [], + }; + } + + if (text.includes("press") && text.includes("continue")) { + const snap = adCli.snapshot(); + const btn = snap.nodes.find( + (n) => n.kind === "button" && n.text?.toLowerCase().includes("continue"), + ); + if (!btn) { + return { ok: false, reason: "bash fallback: no Continue button found" }; + } + adCli.press(btn.ref); + const loc = refToLocator(snap, btn.ref); + return { ok: true, actions: loc ? 
[{ tool: "press", locator: loc }] : [] }; + } + + if (text.includes("magic")) { + const start = Date.now(); + while (Date.now() - start < 60_000) { + const snap = adCli.snapshot(); + if ( + snap.nodes.some((n) => n.text?.toLowerCase().includes("magic code")) + ) { + return { + ok: true, + actions: [ + { + tool: "wait_for", + predicate: 'snapshot.contains_text("Magic code")', + timeoutMs: 60_000, + }, + ], + }; + } + await sleep(2_000); + } + return { + ok: false, + reason: "bash fallback: magic-code screen never appeared", + }; + } + + return { + ok: false, + reason: `bash fallback has no recipe for step text: ${step.text}`, + }; +} + +// ---- cleanup ---------------------------------------------------------- + +const backgroundPids: number[] = []; +let cleanedUp = false; + +function registerCleanup(): void { + const handler = (): void => { + if (cleanedUp) { + return; + } + cleanedUp = true; + try { + execFileSync( + "adb", + [ + "logcat", + "-d", + "-v", + "time", + "*:W", + "ReactNativeJS:V", + "ReactNative:V", + ], + { + stdio: [ + "ignore", + fs.openSync(path.join(ARTIFACTS_DIR, "logcat.txt"), "w"), + "ignore", + ], + }, + ); + } catch { + // best effort + } + adCli.closeSession(); + for (const pid of backgroundPids) { + try { + process.kill(-pid, "SIGTERM"); + } catch { + // already gone + } + } + }; + process.on("exit", handler); + process.on("SIGINT", () => { + handler(); + process.exit(130); + }); + process.on("SIGTERM", () => { + handler(); + process.exit(143); + }); +} + +// ---- helpers ---------------------------------------------------------- + +function deriveCachePath(testCasePath: string): string { + const base = path.basename(testCasePath, path.extname(testCasePath)); + return path.join("tests", "smoke", "cache", `${base}.json`); +} + +function log(msg: string): void { + process.stdout.write(`${msg}\n`); +} + +function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +function fail(reason: string): never { + 
log(`::error::${reason}`); + process.exit(1); +} + +main().catch((e: unknown) => { + if (e instanceof AnthropicCallFailedError) { + fail(`anthropic API failed: ${e.status} ${e.body.slice(0, 200)}`); + } + fail(`runner crashed: ${(e as Error).stack ?? String(e)}`); +}); diff --git a/.github/scripts/agent-device-replay-cache.ts b/.github/scripts/agent-device-replay-cache.ts new file mode 100644 index 000000000000..bf33b24cf5aa --- /dev/null +++ b/.github/scripts/agent-device-replay-cache.ts @@ -0,0 +1,120 @@ +// Replay cache for the LLM-driven smoke. +// +// Without this cache, every PR run pays the LLM round-trip cost on +// every step. Worse, every run is non-deterministic. With it, the +// happy path costs ~$0 and runs deterministically; only when the +// snapshot signature changes (real UI shape change) do we fall +// through to the LLM. +// +// The cache file lives at `tests/smoke/cache/.json` and +// is committed. The diff in code review is the human-readable +// signal that "the SignIn UI shape changed" — the property +// reviewers want to see. 
+ +import { createHash } from "crypto"; +import fs from "fs"; +import path from "path"; +import type { RoleLocator } from "./agent-device-snapshot-signature"; + +export type CachedAction = + | { tool: "fill"; locator: RoleLocator; text: string } + | { tool: "press"; locator: RoleLocator } + | { tool: "back" } + | { tool: "dismiss_keyboard" } + | { tool: "wait"; ms: number } + | { tool: "wait_for"; predicate: string; timeoutMs: number }; + +export type CachedStep = { + stepNumber: number; + stepTextHash: string; + preSignature: string; + postSignature: string; + actions: CachedAction[]; + expect: string | null; + recordedAt: string; + runId: string; +}; + +export type CacheV1 = { + version: 1; + model: string; + testCaseHash: string; + steps: CachedStep[]; +}; + +export function hashText(s: string): string { + return createHash("sha256").update(s).digest("hex").slice(0, 16); +} + +export function loadCache( + filePath: string, + model: string, + testCaseHash: string, +): CacheV1 { + if (!fs.existsSync(filePath)) { + return { version: 1, model, testCaseHash, steps: [] }; + } + const raw = JSON.parse(fs.readFileSync(filePath, "utf8")) as CacheV1; + if (raw.version !== 1) { + throw new Error( + `Cache version mismatch at ${filePath}: expected 1, got ${raw.version}`, + ); + } + return raw; +} + +/** + * Cache hit requires three things to line up: + * 1. test_case_hash — the test file itself hasn't been edited + * 2. step_number — we're at the right step in the sequence + * 3. pre_signature — we're staring at the same UI shape we recorded + * + * If any drift, we fall through to the LLM and (on success) the + * runner emits a cache-diff to artifacts. The PR check fails red, + * forcing the contributor to commit the updated cache. + */ +export function lookup( + cache: CacheV1, + stepNumber: number, + preSignature: string, +): CachedStep | null { + return ( + cache.steps.find( + (s) => s.stepNumber === stepNumber && s.preSignature === preSignature, + ) ?? 
null + ); +} + +export function diff(committed: CacheV1, recorded: CacheV1): string { + const lines: string[] = []; + for (const s of recorded.steps) { + const prior = committed.steps.find((c) => c.stepNumber === s.stepNumber); + if (!prior) { + lines.push( + `+ step ${s.stepNumber}: NEW (pre=${s.preSignature}, post=${s.postSignature})`, + ); + continue; + } + if (prior.preSignature !== s.preSignature) { + lines.push( + `~ step ${s.stepNumber}: pre_signature ${prior.preSignature} → ${s.preSignature}`, + ); + } + if (prior.postSignature !== s.postSignature) { + lines.push( + `~ step ${s.stepNumber}: post_signature ${prior.postSignature} → ${s.postSignature}`, + ); + } + if (JSON.stringify(prior.actions) !== JSON.stringify(s.actions)) { + lines.push( + `~ step ${s.stepNumber}: actions changed (${prior.actions.length} → ${s.actions.length})`, + ); + } + } + return lines.join("\n"); +} + +export function writeCache(filePath: string, cache: CacheV1): void { + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, `${JSON.stringify(cache, null, 2)}\n`); +} diff --git a/.github/scripts/agent-device-snapshot-signature.ts b/.github/scripts/agent-device-snapshot-signature.ts new file mode 100644 index 000000000000..d7cead27aa17 --- /dev/null +++ b/.github/scripts/agent-device-snapshot-signature.ts @@ -0,0 +1,77 @@ +// Structural signature of a UI snapshot. +// +// The signature is the cache key for the replay system: cache hits replay +// recorded actions, cache misses fall back to the LLM. For that to work, +// the signature must be: +// +// 1. STABLE across cosmetic UI changes — locale rotation, A/B copy +// tests, visible user data, dynamic timestamps. We exclude visible +// `text` content for this reason. A label changing from +// "Continue" to "Submit" must NOT bust the cache (the replay layer +// finds the button by role + position, then the LLM recovery layer +// handles a real shape change if any). +// +// 2. 
SENSITIVE to structural change — a new button appearing, an +// input becoming non-editable, a screen transitioning to a +// different layout. These are the events that invalidate a +// recorded action sequence. +// +// Net effect: localization or copy churn doesn't trigger an LLM call, +// but real UI shape change does. + +import { createHash } from "crypto"; +import type { Snapshot, SnapshotNode } from "./agent-device-cli"; + +function project(node: SnapshotNode): string { + return [ + node.kind, + node.text ? "T1" : "T0", + node.editable ? "E1" : "E0", + node.enabled ? "N1" : "N0", + node.scrollable ? "S1" : "S0", + ].join("|"); +} + +export function snapshotSignature(snap: Snapshot): string { + const projected = snap.nodes.map(project).join("\n"); + return createHash("sha256").update(projected).digest("hex").slice(0, 16); +} + +/** + * Locator that survives across runs even though `@eN` refs do not. + * The runner re-resolves to a concrete `@ref` against the live + * snapshot at replay time. + * + * Example: `{kind: "text-field", index: 0, editable: true}` → + * "the first editable text-field in the current snapshot". + */ +export type RoleLocator = { + kind: string; + index: number; + editable?: boolean; +}; + +export function refToLocator(snap: Snapshot, ref: string): RoleLocator | null { + const sameKind = snap.nodes.filter( + (n) => n.kind === snap.nodes.find((m) => m.ref === ref)?.kind, + ); + const idx = sameKind.findIndex((n) => n.ref === ref); + if (idx < 0) { + return null; + } + const node = sameKind[idx]; + return { kind: node.kind, index: idx, editable: node.editable || undefined }; +} + +export function locatorToRef(snap: Snapshot, loc: RoleLocator): string | null { + const matches = snap.nodes.filter((n) => { + if (n.kind !== loc.kind) { + return false; + } + if (loc.editable !== undefined && n.editable !== loc.editable) { + return false; + } + return true; + }); + return matches[loc.index]?.ref ?? 
null; +} diff --git a/.github/workflows/smokeAndroidLLM.yml b/.github/workflows/smokeAndroidLLM.yml new file mode 100644 index 000000000000..4c32238f5dba --- /dev/null +++ b/.github/workflows/smokeAndroidLLM.yml @@ -0,0 +1,156 @@ +name: Android Smoke (agent-device · Phase 1, LLM-driven) + +# Phase-1 build-health canary: same emulator + APK + boot dance as +# Phase 0, but the test steps are now plain English and an LLM driver +# (Claude Sonnet) figures out which agent-device calls to make. A +# committed replay cache at tests/smoke/cache/.json keeps the +# happy path deterministic and ~$0 in API spend; cache misses fall +# back to the LLM, and final-tier failures (API down, LLM gives up) +# fall back to a deterministic Phase-0-style bash recipe so an +# Anthropic outage doesn't fail the build. +# +# Initial rollout: continue-on-error: true so this is non-blocking +# while we compare reliability against Phase 0 over a 2-week window. +# Once flake rate <= Phase 0's, flip to required and retire Phase 0. + +on: + pull_request: + types: [opened, synchronize] + branches-ignore: [staging, production] + # Don't ignore tests/ or .github/ — Phase 1 fires on changes to + # the test cases, the runner scripts, and the workflow itself. + paths-ignore: + - docs/** + - help/** + - contributingGuides/** + - "**.md" + workflow_dispatch: + +concurrency: + group: smoke-android-llm-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +jobs: + smoke: + name: Android emulator smoke (LLM-driven) + if: ${{ github.actor != 'OSBotify' }} + # Non-blocking during the rollout window. The recommendation in the + # Phase 1 plan is to flip this to false (or remove the line) after + # 2 weeks if Phase 1's flake rate <= Phase 0's. + continue-on-error: true + runs-on: blacksmith-4vcpu-ubuntu-2404 + timeout-minutes: 35 + env: + AGENT_DEVICE_VERSION: "0.14.7" + # Hard kill-switch: total input+output tokens accumulated across + # the run. 
Bounds runaway spend if a prompt or tool design + # accidentally explodes context. ~$1 worst-case at sonnet 4.6 + # rates without prompt cache; in practice with prompt cache the + # happy path uses 5-10x less. + LLM_TOKEN_BUDGET: "200000" + ANTHROPIC_MODEL: "claude-sonnet-4-6" + + steps: + - name: Checkout + # v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + with: + submodules: true + token: ${{ secrets.OS_BOTIFY_TOKEN }} + + - name: Verify KVM / fix permissions if needed + run: | + if ! ls -la /dev/kvm 2>/dev/null; then + echo "::error::No /dev/kvm on this runner — emulator will fall back to TCG and the job will time out" + exit 1 + fi + if [ ! -w /dev/kvm ]; then + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \ + | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + fi + + - name: Setup Java + uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 + with: + distribution: temurin + java-version: "17" + + - name: Setup Node + uses: ./.github/actions/composite/setupNode + with: + IS_HYBRID_BUILD: "false" + + - name: Install agent-device CLI + run: npm install -g "agent-device@${AGENT_DEVICE_VERSION}" + + - name: Configure AWS credentials (Rock S3 cache) + uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Configure MapBox SDK + run: ./scripts/setup-mapbox-sdk.sh ${{ secrets.MAPBOX_SDK_DOWNLOAD_TOKEN }} + + - name: Install Android CMake 3.30.5 (Hermes pins this exact version) + run: | + yes | "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --licenses > /dev/null 2>&1 || true + "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --install "cmake;3.30.5" 2>&1 | tail -5 + + - name: Build / fetch developmentDebug APK via Rock + env: + 
STANDALONE_NEW_DOT: "true" + run: npx rock build:android --variant developmentDebug + + - name: AVD cache + uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb + id: avd-cache + with: + path: | + ~/.android/avd/* + ~/.android/adb* + ~/.android/adbkey + ~/.android/adbkey.pub + key: avd-pixel8-api35-x86_64-v1-${{ hashFiles('.github/workflows/smokeAndroidLLM.yml') }} + + - name: Prime AVD snapshot (cache miss only) + if: steps.avd-cache.outputs.cache-hit != 'true' + uses: reactivecircus/android-emulator-runner@v2 + with: + api-level: 35 + target: google_apis + arch: x86_64 + profile: pixel_8 + force-avd-creation: false + emulator-options: -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim + disable-animations: false + script: | + adb wait-for-device + until [ -n "$(adb shell getprop sys.boot_completed | tr -d '\r')" ]; do sleep 2; done + echo "AVD primed" + + - name: Run smoke (LLM-driven) + uses: reactivecircus/android-emulator-runner@v2 + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + with: + api-level: 35 + target: google_apis + arch: x86_64 + profile: pixel_8 + force-avd-creation: false + emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim + disable-animations: true + script: npm run smoke:android:llm + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f + with: + name: smoke-android-llm-${{ github.run_id }}-${{ github.run_attempt }} + path: artifacts/ + if-no-files-found: warn + retention-days: 14 diff --git a/package.json b/package.json index 3681ee193d4e..c526ddc80729 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,7 @@ "createDocsRoutes": "ts-node .github/scripts/createDocsRoutes.ts", "generateAllowedUrls": "ts-node .github/scripts/generateAllowedUrls.ts", "detectRedirectCycle": "ts-node .github/scripts/detectRedirectCycle.ts", + "smoke:android:llm": "ts-node .github/scripts/agent-device-llm-driver.ts", 
"ios-build": "bundle exec fastlane ios build_unsigned", "ios-hybrid-build": "bundle exec fastlane ios build_unsigned_hybrid", "android-build": "bundle exec fastlane android build_local", diff --git a/tests/smoke/android-signin.testcase.txt b/tests/smoke/android-signin.testcase.txt new file mode 100644 index 000000000000..b0b9add4c232 --- /dev/null +++ b/tests/smoke/android-signin.testcase.txt @@ -0,0 +1,24 @@ +# Phase-1 LLM-driven Android smoke — SignIn flow. +# +# Each step is plain English the LLM reads to decide what UI actions to +# take. The optional `expect:` line is a machine-checked postcondition +# evaluated by the runner (NOT the LLM) after the step's tool calls +# complete; it is what gives the canary a hard pass/fail signal +# independent of the LLM's self-assessment. +# +# Expect predicates supported (see .github/scripts/agent-device-expect.ts): +# snapshot.contains_text("...") +# snapshot.field_with_text("...").exists +# appstate.foreground == "..." + +1. Wait for the app to fully load and the SignIn screen to appear. + expect: snapshot.contains_text("Phone or email") + +2. Enter "rustam.zeinalov@callstack.com" into the email/phone field. + expect: snapshot.field_with_text("rustam.zeinalov@callstack.com").exists + +3. Press the Continue button. + expect: appstate.foreground == "com.expensify.chat.dev" + +4. Wait for the magic-code screen to appear. 
+ expect: snapshot.contains_text("Magic code") From 749bff22ebee20a67c4de31bcaf9e95d328bda27 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 12:43:58 +0200 Subject: [PATCH 02/14] smoke(llm): bump SignIn boot timeout to 600s + capture probe artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1's first fork-test run timed out at the 360s SignIn-wait budget without uploading any diagnostics — the runner exited via fail() before writing snapshots/screenshots, so post-mortem only had logcat. - 360s -> 600s. Phase 0 saw 294s on a warm AVD; the first run of a new workflow can't reuse that cache (key includes the workflow filename), so it pays the cold-prime cost and needs more headroom. - Every 30s during the wait, dump probe snapshot text to artifacts so we can see the timeline of UI states the app traversed. - On final timeout, capture snapshot + appstate + PNG screenshot before failing so the failure is debuggable from a single artifact upload. - Don't let a transient snapshot exception kill the whole wait — log and retry. The agent-device CLI occasionally times out under emulator load and the next poll usually succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-llm-driver.ts | 61 ++++++++++++++++++++-- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 84e2ce7ab1fe..5ce2bb592330 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -67,7 +67,12 @@ const CACHE_PATH = process.env.LLM_CACHE_PATH ?? deriveCachePath(TEST_CASE_PATH); const APK_GLOB = "android/app/build/outputs/apk/development/debug"; const METRO_READY_TIMEOUT_MS = 120_000; -const SIGNIN_LOAD_TIMEOUT_MS = 360_000; +// 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). 
The +// first run on a fresh AVD-cache key is closer to a cold boot since +// the prime+run happens in two separate emulator-runner invocations +// and the snapshot-load overhead lands inside this budget. +const SIGNIN_LOAD_TIMEOUT_MS = 600_000; +const BOOT_PROBE_INTERVAL_MS = 30_000; const STEP_WALL_CLOCK_BUDGET_MS = 60_000; const MAX_STATE_CHANGING_ACTIONS = 4; const SCREENSHOT_BUDGET_PER_RUN = 2; @@ -273,13 +278,30 @@ async function bootApp(): Promise { ); // Bounded wait for the SignIn UI to hydrate. The LLM can technically - // poll for it itself in step 1, but on slow runners (~290s observed) - // that would burn LLM budget on what's effectively boot-blocking - // emulator wait time. Better to gate the LLM on a known-ready UI. + // poll for it itself in step 1, but on slow runners that would burn + // LLM budget on what's effectively boot-blocking emulator wait time. + // We dump a probe snapshot every 30s during the wait so post-mortem + // can see *what* the app was showing if the wait times out — the + // first run of this workflow had no such artifacts and the failure + // was undebuggable from the upload. log("boot: waiting for SignIn UI"); const start = Date.now(); + let probeIdx = 0; + let lastProbeAt = 0; while (Date.now() - start < SIGNIN_LOAD_TIMEOUT_MS) { - const snap = adCli.snapshot(); + let snap; + try { + snap = adCli.snapshot(); + } catch (e) { + // Don't let a single transient snapshot timeout kill the wait — + // the emulator may be under heavy load and the next poll will + // probably succeed. 
+ log( + `boot: snapshot threw (${(e as Error).message.slice(0, 80)}); retrying`, + ); + await sleep(2_000); + continue; + } if ( snap.nodes.some((n) => n.text?.toLowerCase().includes("phone or email")) ) { @@ -288,8 +310,37 @@ async function bootApp(): Promise { ); return; } + if (Date.now() - lastProbeAt >= BOOT_PROBE_INTERVAL_MS) { + const elapsed = Math.round((Date.now() - start) / 1000); + fs.writeFileSync( + path.join( + ARTIFACTS_DIR, + `boot-probe-${String(probeIdx).padStart(2, "0")}-t${elapsed}s.txt`, + ), + snap.raw, + ); + probeIdx++; + lastProbeAt = Date.now(); + } await sleep(6_000); } + // Capture as much state as we can BEFORE failing so a re-run isn't + // required to debug. The cleanup trap will still write logcat after. + try { + const snap = adCli.snapshot(); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "boot-timeout-snapshot.txt"), + snap.raw, + ); + const app = adCli.appstate(); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "boot-timeout-appstate.txt"), + app.raw, + ); + adCli.screenshotBase64(path.join(ARTIFACTS_DIR, "boot-timeout.png")); + } catch (e) { + log(`boot: timeout-diagnostics capture failed: ${(e as Error).message}`); + } fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`); } From f8aee09fd1130bac48a593795ae862e83152a283 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 13:41:04 +0200 Subject: [PATCH 03/14] smoke(llm): recover from Pixel Launcher ANR dialog during boot wait Previous fork-test runs showed every probe stuck on a system "Pixel Launcher isn't responding" dialog with `Close app` / `Wait` buttons, sitting on top of our (correctly-foregrounded) Expensify activity. The 2-core ubuntu-latest runner can't keep up with Metro + APK launch + launcher init simultaneously, so the launcher ANRs and the accessibility tree gets captured by the dialog overlay. Two fixes: 1. 
Pre-emptively `settings put global hide_error_dialogs 1` so the OS suppresses ANR dialogs system-wide (the underlying ANR still happens but the foreground app stays uncovered). 2. In-loop recovery: if the snapshot looks like an ANR dialog (exactly two buttons labelled "Close app" + "Wait"), press Wait to dismiss, then `am start` our activity to force-foreground, and continue polling. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-llm-driver.ts | 75 ++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 5ce2bb592330..e74cb9f73c3a 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -244,6 +244,25 @@ async function bootApp(): Promise { stdio: "inherit", }); + // Pre-emptive ANR suppression. On the 2-core ubuntu-latest runner + // the Pixel Launcher routinely ANRs under the combined load of + // Metro + APK launch + agent-device. The system normally shows a + // blocking "isn't responding" dialog that hides our app behind it. + // Setting hide_error_dialogs=1 makes the OS suppress those dialogs + // (the underlying ANR still happens but the foreground app keeps + // running uncovered). Best-effort — if the property doesn't exist + // on this Android version, fall through and let the in-loop + // recovery handle it. + try { + execFileSync( + "adb", + ["shell", "settings", "put", "global", "hide_error_dialogs", "1"], + { timeout: 5_000, stdio: "ignore" }, + ); + } catch { + // best effort + } + log("boot: starting Metro"); const metroLog = fs.openSync(path.join(ARTIFACTS_DIR, "metro.log"), "a"); const metro = spawn("npm", ["start"], { @@ -310,6 +329,45 @@ async function bootApp(): Promise { ); return; } + // ANR-recovery: when the runner is memory-pressured the system + // shows a "Pixel Launcher isn't responding" dialog over our app. 
+ // The Expensify activity stays in the foreground (per appstate) + // but the accessibility tree is captured by the dialog overlay, + // so SignIn is hidden until the dialog is dismissed. Press + // "Wait", then force-relaunch our activity via `am start` to + // ensure we're on top regardless of what the launcher is doing. + if (isAnrDialog(snap)) { + log("boot: ANR dialog detected — dismissing and relaunching app"); + try { + const waitBtn = snap.nodes.find( + (n) => n.kind === "button" && n.text?.toLowerCase() === "wait", + ); + if (waitBtn) { + adCli.press(waitBtn.ref); + } + } catch (e) { + log(`boot: dismiss press failed: ${(e as Error).message.slice(0, 80)}`); + } + try { + execFileSync( + "adb", + [ + "shell", + "am", + "start", + "-n", + `${APP_PACKAGE}/com.expensify.chat.MainActivity`, + ], + { timeout: 10_000, stdio: "ignore" }, + ); + } catch (e) { + log(`boot: am start failed: ${(e as Error).message.slice(0, 80)}`); + } + // Skip this iteration's normal sleep; recheck immediately so + // we don't waste 6s waiting on a known-stale state. + await sleep(2_000); + continue; + } if (Date.now() - lastProbeAt >= BOOT_PROBE_INTERVAL_MS) { const elapsed = Math.round((Date.now() - start) / 1000); fs.writeFileSync( @@ -344,6 +402,23 @@ async function bootApp(): Promise { fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`); } +/** + * Detects the Android system "isn't responding" dialog. The exact + * label varies (Pixel Launcher / com.android.systemui / etc.) so we + * match on the structural fingerprint: exactly two button nodes + * labelled "Close app" and "Wait". + */ +function isAnrDialog(snap: { + nodes: Array<{ kind: string; text?: string }>; +}): boolean { + const buttons = snap.nodes.filter((n) => n.kind === "button"); + if (buttons.length !== 2) { + return false; + } + const labels = buttons.map((b) => b.text?.toLowerCase() ?? 
"").sort(); + return labels[0] === "close app" && labels[1] === "wait"; +} + async function waitForMetro(): Promise { const start = Date.now(); while (Date.now() - start < METRO_READY_TIMEOUT_MS) { From 2a959e3b33192d46b128f8f6623cba047e235baa Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 14:39:53 +0200 Subject: [PATCH 04/14] smoke(llm): seed replay cache from first successful fork-test run Captured from run 25553622590 (51m, 4/4 LLM steps green, magic-code reached). All 4 step entries plus structural pre/post signatures are committed so future PR runs can replay the happy path without a Claude API call. Known caveat: step 2's recorded actions only contain `press` though the field gets typed end-to-end. The runner's recording path drops the fill action somewhere; the committed cache will not perfectly replay step 2, so cache-hit will fail expect-verification on that step and fall through to LLM. Tracking a fix; the smoke remains correct because expect runs against the live UI, not the cache. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../smoke/cache/android-signin.testcase.json | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tests/smoke/cache/android-signin.testcase.json diff --git a/tests/smoke/cache/android-signin.testcase.json b/tests/smoke/cache/android-signin.testcase.json new file mode 100644 index 000000000000..17c9f760c17d --- /dev/null +++ b/tests/smoke/cache/android-signin.testcase.json @@ -0,0 +1,73 @@ +{ + "version": 1, + "model": "claude-sonnet-4-6", + "testCaseHash": "377c89ecd3182b95", + "steps": [ + { + "stepNumber": 1, + "stepTextHash": "eefe04289ed44849", + "preSignature": "7a4382df9ac727f2", + "postSignature": "7a4382df9ac727f2", + "actions": [], + "expect": "snapshot.contains_text(\"Phone or email\")", + "recordedAt": "2026-05-08T12:28:40.823Z", + "runId": "25553622590" + }, + { + "stepNumber": 2, + "stepTextHash": "988a4a6e077e6dc6", + "preSignature": "7a4382df9ac727f2", + "postSignature": "5303106f4b995b99", + "actions": [ + { + "tool": "press", + "locator": { + "kind": "text-field", + "index": 0, + "editable": true + } + } + ], + "expect": "snapshot.field_with_text(\"rustam.zeinalov@callstack.com\").exists", + "recordedAt": "2026-05-08T12:29:43.097Z", + "runId": "25553622590" + }, + { + "stepNumber": 3, + "stepTextHash": "a071b334a6b8c0f4", + "preSignature": "5303106f4b995b99", + "postSignature": "0235eea0848df345", + "actions": [ + { + "tool": "press", + "locator": { + "kind": "button", + "index": 0 + } + }, + { + "tool": "dismiss_keyboard" + } + ], + "expect": "appstate.foreground == \"com.expensify.chat.dev\"", + "recordedAt": "2026-05-08T12:30:04.546Z", + "runId": "25553622590" + }, + { + "stepNumber": 4, + "stepTextHash": "a1059919be2f42c9", + "preSignature": "0235eea0848df345", + "postSignature": "0235eea0848df345", + "actions": [ + { + "tool": "wait_for", + "predicate": "snapshot.contains_text(\"Magic code\")", + "timeoutMs": 60000 + } + ], + "expect": "snapshot.contains_text(\"Magic 
code\")", + "recordedAt": "2026-05-08T12:30:21.603Z", + "runId": "25553622590" + } + ] +} From 5584de430f0ec90878c75b9d9562f9d09344af3d Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 15:29:09 +0200 Subject: [PATCH 05/14] smoke(llm): treat expect-pass as cache-hit success even on signature drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache-hit run 25556053751 failed at step 3 because verifyPostState required *both* signature match AND expect-pass. The replay had pressed Continue successfully — the app advanced to magic-code, but the post- signature differed from what was recorded (cosmetic re-render, slightly different node count). Runner treated it as drift, fell through to LLM, LLM exhausted budget, bash fallback ran "press Continue" against the magic-code screen (no Continue button), step failed. The signature is a structural hash; the expect predicate is an intentional deterministic check over the live UI. When expect passes the step has succeeded by the test author's own definition, even if the structural hash drifted. Re-prioritize: expect first, signature becomes advisory (warning, not failure). Steps with no `expect:` clause still fail on signature drift — that's the only post-state check available there, and it stays useful as a "did anything visibly change?" tripwire. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-llm-driver.ts | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index e74cb9f73c3a..6d949eb03851 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -563,17 +563,32 @@ async function verifyPostState( ): Promise<{ ok: true; snap: Snapshot } | { ok: false; reason: string }> { const snap = adCli.snapshot(); const app = adCli.appstate(); - if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { - return { - ok: false, - reason: `post-state signature drift (recorded ${expectedSignature}, observed ${snapshotSignature(snap)})`, - }; - } + + // Expect (when declared) is the source of truth: it's a deterministic + // predicate over the live UI, while the post-signature is a structural + // hash that can drift on cosmetic re-renders, animation timing, or + // node-ordering changes that don't affect what the user actually sees. + // If expect passes, the step succeeded — drift becomes advisory. if (step.expect) { const ev = evaluateExpect(step.expect, snap, app); if (!ev.ok) { return { ok: false, reason: `expect failed: ${ev.reason}` }; } + if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { + log( + `::warning::post-signature drift but expect passed (recorded ${expectedSignature}, observed ${snapshotSignature(snap)}) — accepting`, + ); + } + return { ok: true, snap }; + } + + // No expect declared — fall back to signature equality so a cache-hit + // path still has *some* post-state check. 
+ if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { + return { + ok: false, + reason: `post-state signature drift (recorded ${expectedSignature}, observed ${snapshotSignature(snap)})`, + }; } return { ok: true, snap }; } From 40b7a09f06c2d413eabdcd75bc5653de4c0a014d Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 16:22:04 +0200 Subject: [PATCH 06/14] smoke(llm): DEBUG_LLM=1 emits full LLM-call + dispatch trace Diagnostic mode for tracking down the step-2 cache-recording bug (cache stores `press(text-field)` even though the email gets typed into the field). With DEBUG_LLM=1: - llm-client trace adds a `request` entry per call with the last user text + every prior tool_use in the thread. Each `response` entry now includes the LLM's tool_use blocks (id, name, full input args) and any text preview. - driver dispatchTool fill/press log entry args, refToLocator result, executed-array length after the push, and surface throws separately so a silent CLI failure becomes visible. Off by default (env-gated) so production runs stay slim. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-llm-client.ts | 50 +++++++++++++++++++++- .github/scripts/agent-device-llm-driver.ts | 45 ++++++++++++++++++- 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/.github/scripts/agent-device-llm-client.ts b/.github/scripts/agent-device-llm-client.ts index 72e2e2816cb1..9e60b5dd201a 100644 --- a/.github/scripts/agent-device-llm-client.ts +++ b/.github/scripts/agent-device-llm-client.ts @@ -138,17 +138,63 @@ export class AnthropicClient { messages: args.messages, }; + // Verbose diagnostic mode: capture the full message thread + tool_use + // calls in the trace. Trade-off is artifact size and a small risk + // of leaking content the user typed; disabled unless DEBUG_LLM=1. + const verbose = (process.env.DEBUG_LLM ?? 
"") === "1"; + if (verbose) { + const lastUser = args.messages + .slice() + .reverse() + .find((m) => m.role === "user"); + const lastText = lastUser?.content.find( + (c): c is { type: "text"; text: string } => c.type === "text", + ); + this.opts.traceWriter?.({ + type: "request", + message_count: args.messages.length, + last_user_text: lastText?.text.slice(0, 1500) ?? null, + tool_uses_in_thread: args.messages.flatMap((m) => + m.content + .filter( + ( + c, + ): c is { + type: "tool_use"; + id: string; + name: string; + input: Record; + } => c.type === "tool_use", + ) + .map((c) => ({ id: c.id, name: c.name, input: c.input })), + ), + }); + } + let lastError: Error | undefined; for (let attempt = 0; attempt <= RETRY_DELAYS_MS.length; attempt++) { try { const response = await this.callOnce(body); this.accountForUsage(response.usage); - this.opts.traceWriter?.({ + const baseEntry = { type: "response", attempt, stop_reason: response.stop_reason, usage: response.usage, - }); + } as Record; + if (verbose) { + baseEntry.tool_uses = response.content + .filter((c) => c.type === "tool_use") + .map((c) => ({ + id: (c as { id: string }).id, + name: (c as { name: string }).name, + input: (c as { input: unknown }).input, + })); + baseEntry.text_preview = response.content + .filter((c) => c.type === "text") + .map((c) => (c as { text: string }).text.slice(0, 800)); + } + this.opts.traceWriter?.(baseEntry); return response; } catch (e) { lastError = e as Error; diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 6d949eb03851..39723d98c030 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -77,6 +77,11 @@ const STEP_WALL_CLOCK_BUDGET_MS = 60_000; const MAX_STATE_CHANGING_ACTIONS = 4; const SCREENSHOT_BUDGET_PER_RUN = 2; const TEXT_LENGTH_CAP = 200; +// DEBUG_LLM=1 makes both the LLM client (request/response bodies) +// and the runner (per-tool-dispatch entries) emit 
verbose entries to +// llm-trace.jsonl + stdout. Off by default to keep normal-run +// artifacts and CI stdout slim. +const DEBUG_LLM = process.env.DEBUG_LLM === "1"; // ---- types ------------------------------------------------------------ @@ -1107,14 +1112,33 @@ async function dispatchTool( const node = ctx.snap.nodes.find((n) => n.ref === ref); if (!node) { ctx.onPhantom(); + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill phantom ref=${ref} text="${text.slice(0, 30)}…"`, + ); + } return { content: `phantom ref ${ref} not in current snapshot`, isError: true, }; } ctx.resetPhantom(); - adCli.fill(ref, text); + try { + adCli.fill(ref, text); + } catch (e) { + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill THREW ref=${ref} text="${text.slice(0, 30)}…" err=${(e as Error).message.slice(0, 100)}`, + ); + } + throw e; + } const loc = refToLocator(ctx.snap, ref); + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} text="${text.slice(0, 30)}…" executed_len_after=${ctx.executed.length + (loc ? 1 : 0)}`, + ); + } if (loc) { ctx.executed.push({ tool: "fill", locator: loc, text, ref }); } @@ -1125,14 +1149,31 @@ async function dispatchTool( const node = ctx.snap.nodes.find((n) => n.ref === ref); if (!node) { ctx.onPhantom(); + if (DEBUG_LLM) { + log(`::debug::dispatch.press phantom ref=${ref}`); + } return { content: `phantom ref ${ref} not in current snapshot`, isError: true, }; } ctx.resetPhantom(); - adCli.press(ref); + try { + adCli.press(ref); + } catch (e) { + if (DEBUG_LLM) { + log( + `::debug::dispatch.press THREW ref=${ref} err=${(e as Error).message.slice(0, 100)}`, + ); + } + throw e; + } const loc = refToLocator(ctx.snap, ref); + if (DEBUG_LLM) { + log( + `::debug::dispatch.press ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} executed_len_after=${ctx.executed.length + (loc ? 
1 : 0)}`, + ); + } if (loc) { ctx.executed.push({ tool: "press", locator: loc, ref }); } From 55a0a211df21dd57319166178adae2a7a4b3d2ee Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Fri, 8 May 2026 19:06:12 +0200 Subject: [PATCH 07/14] smoke(llm): disable Android autofill + force-stop on ANR recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for cache-hit reliability traced from run 25553622590's logcat: 1. Android autofill silently filled the email field after the LLM pressed it (FillRequestEventLogger entry at the exact moment of step 2's press, BeginSignIn API fired with the email a second later — the LLM never called fill). Cache then recorded only the press; replay on a different AVD snapshot where autofill state had rotated broke deterministically. Disabling autofill via `settings put secure autofill_service null` at boot forces the LLM to call fill explicitly so both record and replay are self-contained. 2. ANR recovery via `am start` brought a half-loaded MainActivity to the foreground (run 25560886459 stuck on splash for 600s after recovery). force-stop + agent-device open --relaunch guarantees a clean process spawn so the next launch re-runs JS init. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-llm-driver.ts | 68 ++++++++++++++++------ 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 39723d98c030..411a43545d9d 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -268,6 +268,24 @@ async function bootApp(): Promise { // best effort } + // Disable Android Autofill globally. Without this, the framework + // silently populates editable fields (email, password, etc.) when + // they gain focus and a credential is cached on the AVD. 
That + // makes the LLM appear to "succeed" with just a press call — + // recorded cache misses the actual fill action, and replay on a + // different AVD snapshot (where autofill state has rotated) + // breaks because press alone no longer suffices. Forcing the LLM + // to explicitly fill makes both record and replay deterministic. + try { + execFileSync( + "adb", + ["shell", "settings", "put", "secure", "autofill_service", "null"], + { timeout: 5_000, stdio: "ignore" }, + ); + } catch { + // best effort + } + log("boot: starting Metro"); const metroLog = fs.openSync(path.join(ARTIFACTS_DIR, "metro.log"), "a"); const metro = spawn("npm", ["start"], { @@ -336,13 +354,15 @@ async function bootApp(): Promise { } // ANR-recovery: when the runner is memory-pressured the system // shows a "Pixel Launcher isn't responding" dialog over our app. - // The Expensify activity stays in the foreground (per appstate) - // but the accessibility tree is captured by the dialog overlay, - // so SignIn is hidden until the dialog is dismissed. Press - // "Wait", then force-relaunch our activity via `am start` to - // ensure we're on top regardless of what the launcher is doing. + // Press "Wait" to dismiss, then force-stop + relaunch via + // agent-device. Plain `am start` was insufficient: if the ANR + // hit during JS bundle delivery, MainActivity was in a half- + // initialised state and `am start` just brought that broken + // activity to the foreground (run 25560886459 stuck on splash + // for 600s after recovering from an ANR via am start). Force- + // stop guarantees a clean process spawn for the next launch. 
if (isAnrDialog(snap)) { - log("boot: ANR dialog detected — dismissing and relaunching app"); + log("boot: ANR dialog detected — dismissing and force-relaunching app"); try { const waitBtn = snap.nodes.find( (n) => n.kind === "button" && n.text?.toLowerCase() === "wait", @@ -354,23 +374,37 @@ async function bootApp(): Promise { log(`boot: dismiss press failed: ${(e as Error).message.slice(0, 80)}`); } try { + execFileSync("adb", ["shell", "am", "force-stop", APP_PACKAGE], { + timeout: 5_000, + stdio: "ignore", + }); + } catch (e) { + log(`boot: force-stop failed: ${(e as Error).message.slice(0, 80)}`); + } + try { + const serial = execFileSync("adb", ["get-serialno"], { + encoding: "utf8", + }).trim(); execFileSync( - "adb", + "agent-device", [ - "shell", - "am", - "start", - "-n", - `${APP_PACKAGE}/com.expensify.chat.MainActivity`, + "open", + APP_PACKAGE, + "--platform", + "android", + "--serial", + serial, + "--session", + SESSION, + "--relaunch", ], - { timeout: 10_000, stdio: "ignore" }, + { timeout: 30_000, stdio: "ignore" }, ); } catch (e) { - log(`boot: am start failed: ${(e as Error).message.slice(0, 80)}`); + log(`boot: relaunch failed: ${(e as Error).message.slice(0, 80)}`); } - // Skip this iteration's normal sleep; recheck immediately so - // we don't waste 6s waiting on a known-stale state. - await sleep(2_000); + // Give the process a moment to come back up before re-snapshotting. 
+ await sleep(3_000); continue; } if (Date.now() - lastProbeAt >= BOOT_PROBE_INTERVAL_MS) { From b1e0d3ea2c6db65bc18ed4f6b9e755d95a78fc28 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Mon, 11 May 2026 10:47:35 +0200 Subject: [PATCH 08/14] smoke(llm): refresh snap after state-changing actions + bump fill timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 25568731827's LLM trace revealed the core failure pattern: every step-2 user message carried `snapshot.node_count=10` with the same pre-step text-field — `snap` is never refreshed after fill/press, so the LLM sees its own actions had "no effect", retries the same fill, gets caught by seen-hash dedup, then burns the wall-clock budget. Three fixes: 1. After every batch of tool calls in runLLMStep that contains fill/press/wait, refresh `snap` + `app` so the next round sees the live state. snapshot/wait_for/back/dismiss_keyboard already refreshed via dispatchTool's onSnap callback; fill/press didn't. 2. agent-device fill gets its own 90s CLI timeout (was 30s). The 30-char email took >30s to type on the 2-core ubuntu-latest; adCli.fill threw, the action wasn't pushed to executed[], and the device did get partially-typed text but the runner thought the call failed. Read-only commands keep the 30s tripwire. 3. 500ms settle gap after bash fallback before verifyPostState so the typed text propagates through React Native's onChange before the predicate snapshot reads back. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-cli.ts | 12 ++++++--- .github/scripts/agent-device-llm-driver.ts | 31 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts index d062b12f64f5..cd2287b4e0f5 100644 --- a/.github/scripts/agent-device-cli.ts +++ b/.github/scripts/agent-device-cli.ts @@ -46,14 +46,20 @@ export type AppState = { const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; // Bound every CLI invocation so a hung emulator can't wedge the smoke. -// 30s is generous for read-only commands (snapshot/screenshot) and -// effectively a "this should have completed already" tripwire. +// 30s is generous for read-only commands (snapshot/screenshot/appstate). +// `fill` is special: typing a 30-char string into an editable on a +// 2-core ubuntu-latest under load was observed to exceed 30s (the +// CLI partial-typed and exited non-zero on timeout — visible at the +// device level via screenshot but the runner threw before recording +// the action). 90s gives ~3x headroom. const CLI_TIMEOUT_MS = 30_000; +const CLI_FILL_TIMEOUT_MS = 90_000; function run(args: string[]): string { + const timeout = args[0] === "fill" ? 
CLI_FILL_TIMEOUT_MS : CLI_TIMEOUT_MS; return execFileSync("agent-device", args, { encoding: "utf8", - timeout: CLI_TIMEOUT_MS, + timeout, maxBuffer: 8 * 1024 * 1024, }); } diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 411a43545d9d..7c73ea89fcbd 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -565,6 +565,14 @@ async function executeStep( } ctx.stats.onBashRun(); actions = bashResult.actions; + // Settle gap: agent-device fill returns once it has dispatched + // the typing command, but the on-device EditText needs a beat for + // React Native's onChange to fire and the accessibility tree to + // re-publish the new text. Without this, verifyPostState below + // takes a snapshot before the typed text has propagated and the + // expect predicate fails on what's transient lag, not a real + // problem. + await sleep(500); } const post = await verifyPostState(step, null); @@ -999,6 +1007,29 @@ async function runLLMStep( } messages.push({ role: "user", content: toolResults }); + + // Refresh snap + appstate after every batch of tool calls that + // changed device state. Without this the LLM keeps seeing the + // pre-step snapshot even after its fill/press took effect, so + // identical fills get caught by the seen-hash dedup and the LLM + // burns its budget retrying actions it already performed. + // dispatchTool's snapshot/wait_for/back/dismiss callbacks already + // refresh; fill and press do not. + if ( + toolUses.some( + (tu) => tu.name === "fill" || tu.name === "press" || tu.name === "wait", + ) + ) { + try { + snap = adCli.snapshot(); + app = adCli.appstate(); + } catch (e) { + // Transient — next loop iteration will retry implicitly. 
+ log( + `runLLMStep: post-action snap refresh threw (${(e as Error).message.slice(0, 80)}); continuing with stale snap`, + ); + } + } } return { From 24d36da0cb5edc13caa4d2d360bbf0edce0acf1d Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Mon, 11 May 2026 11:38:29 +0200 Subject: [PATCH 09/14] smoke(llm): refresh seed cache from autofill-disabled run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captured from run 25659967543 (green end-to-end, all 4 steps reached magic-code with proper LLM tool sequence). This cache supersedes the prior seed from 25553622590, which was recorded with Android Autofill active — its step 2 stored a stale `press(text-field)` action that worked only because the framework was silently filling the field on focus, breaking cache-hit replay on AVD snapshots where autofill state had rotated. This cache contains the correct `fill(text-field, "rustam.zeinalov@…")` recorded against an autofill-disabled emulator. Signatures rotate relative to the old cache (autofill-related accessibility nodes are gone), but the role-based locators stay portable. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../smoke/cache/android-signin.testcase.json | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/smoke/cache/android-signin.testcase.json b/tests/smoke/cache/android-signin.testcase.json index 17c9f760c17d..52da1ed36046 100644 --- a/tests/smoke/cache/android-signin.testcase.json +++ b/tests/smoke/cache/android-signin.testcase.json @@ -6,37 +6,38 @@ { "stepNumber": 1, "stepTextHash": "eefe04289ed44849", - "preSignature": "7a4382df9ac727f2", - "postSignature": "7a4382df9ac727f2", + "preSignature": "15794371cece6ece", + "postSignature": "15794371cece6ece", "actions": [], "expect": "snapshot.contains_text(\"Phone or email\")", - "recordedAt": "2026-05-08T12:28:40.823Z", - "runId": "25553622590" + "recordedAt": "2026-05-11T09:33:36.152Z", + "runId": "25659967543" }, { "stepNumber": 2, "stepTextHash": "988a4a6e077e6dc6", - "preSignature": "7a4382df9ac727f2", - "postSignature": "5303106f4b995b99", + "preSignature": "15794371cece6ece", + "postSignature": "1416ff7437729bd4", "actions": [ { - "tool": "press", + "tool": "fill", "locator": { "kind": "text-field", "index": 0, "editable": true - } + }, + "text": "rustam.zeinalov@callstack.com" } ], "expect": "snapshot.field_with_text(\"rustam.zeinalov@callstack.com\").exists", - "recordedAt": "2026-05-08T12:29:43.097Z", - "runId": "25553622590" + "recordedAt": "2026-05-11T09:35:09.430Z", + "runId": "25659967543" }, { "stepNumber": 3, "stepTextHash": "a071b334a6b8c0f4", - "preSignature": "5303106f4b995b99", - "postSignature": "0235eea0848df345", + "preSignature": "1416ff7437729bd4", + "postSignature": "2a958e751a6dab98", "actions": [ { "tool": "press", @@ -50,14 +51,14 @@ } ], "expect": "appstate.foreground == \"com.expensify.chat.dev\"", - "recordedAt": "2026-05-08T12:30:04.546Z", - "runId": "25553622590" + "recordedAt": "2026-05-11T09:35:29.243Z", + "runId": "25659967543" }, { "stepNumber": 4, "stepTextHash": "a1059919be2f42c9", - 
"preSignature": "0235eea0848df345", - "postSignature": "0235eea0848df345", + "preSignature": "2a958e751a6dab98", + "postSignature": "2a958e751a6dab98", "actions": [ { "tool": "wait_for", @@ -66,8 +67,8 @@ } ], "expect": "snapshot.contains_text(\"Magic code\")", - "recordedAt": "2026-05-08T12:30:21.603Z", - "runId": "25553622590" + "recordedAt": "2026-05-11T09:35:42.112Z", + "runId": "25659967543" } ] } From 97f049c5fc9d8e3ae2d9f6ec7c778d7e26fb43fe Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Mon, 11 May 2026 12:27:58 +0200 Subject: [PATCH 10/14] smoke(llm): filter RN dev-warning nodes from snapshot signature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compared step-1-pre snapshots of runs 25659967543 and 25662443061 on the same SignIn screen: one had 3 extra dev-warning nodes ("!, The result of getSnapshot should be cached...") the other didn't. Structural signature included those nodes, so the cache key rotated between runs and replay never matched even though the user-visible UI was identical. Drop those transient dev-mode bubbles from the signature: any group whose text starts with "!, ", any "!" indicator, and the specific warning text strings that pair with them. Dev-only by construction — they never reach release builds. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agent-device-snapshot-signature.ts | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/.github/scripts/agent-device-snapshot-signature.ts b/.github/scripts/agent-device-snapshot-signature.ts index d7cead27aa17..c61affd8d10f 100644 --- a/.github/scripts/agent-device-snapshot-signature.ts +++ b/.github/scripts/agent-device-snapshot-signature.ts @@ -32,8 +32,51 @@ function project(node: SnapshotNode): string { ].join("|"); } +/** + * Transient nodes the signature must ignore. 
+ * + * React Native dev-mode renders an inline "!, " bubble for + * runtime warnings (StrictMode, dev-only assertions, etc.). These + * appear and disappear between runs depending on bundler timing and + * warning suppression state — same screen, different node count. + * Runs 25659967543 and 25662443061 produced different signatures on + * an identical SignIn screen because one had 3 extra dev-warning + * nodes the other didn't, and cache replay never landed. + * + * These warnings are dev-only, never reach release builds, and never + * mean anything to a user — exactly the kind of cosmetic node the + * structural signature should disregard. + */ +function isTransientDevWarning(node: SnapshotNode): boolean { + if (!node.text) { + return false; + } + if (node.kind === "group" && node.text.startsWith("!, ")) { + return true; + } + if (node.kind === "text" && node.text === "!") { + return true; + } + if ( + node.kind === "text" && + node.text.startsWith("Open debugger to view warnings") + ) { + return true; + } + if ( + node.kind === "text" && + node.text.startsWith("The result of getSnapshot") + ) { + return true; + } + return false; +} + export function snapshotSignature(snap: Snapshot): string { - const projected = snap.nodes.map(project).join("\n"); + const projected = snap.nodes + .filter((n) => !isTransientDevWarning(n)) + .map(project) + .join("\n"); return createHash("sha256").update(projected).digest("hex").slice(0, 16); } From 308cc289a0c313a3fef5738505479a41f1a591c6 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Mon, 11 May 2026 12:30:04 +0200 Subject: [PATCH 11/14] smoke(llm): re-sign cache after dev-warning filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-computed pre/post signatures locally from run 25662443061's step-N-{pre,post}.txt artifacts with the new filter (transient RN dev-warning nodes excluded). 
Verified the same signatures compute from run 25659967543's artifacts on the same UI despite that run having different dev-warning node counts — filter is doing its job. Action sequences unchanged (filter affects only signature, not locator resolution). Next dispatch should land cache_hits>=3. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../smoke/cache/android-signin.testcase.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/smoke/cache/android-signin.testcase.json b/tests/smoke/cache/android-signin.testcase.json index 52da1ed36046..4dffe68f81f7 100644 --- a/tests/smoke/cache/android-signin.testcase.json +++ b/tests/smoke/cache/android-signin.testcase.json @@ -6,18 +6,18 @@ { "stepNumber": 1, "stepTextHash": "eefe04289ed44849", - "preSignature": "15794371cece6ece", - "postSignature": "15794371cece6ece", + "preSignature": "bada22fec79afdc7", + "postSignature": "bada22fec79afdc7", "actions": [], "expect": "snapshot.contains_text(\"Phone or email\")", - "recordedAt": "2026-05-11T09:33:36.152Z", + "recordedAt": "2026-05-11T10:29:23.672Z", "runId": "25659967543" }, { "stepNumber": 2, "stepTextHash": "988a4a6e077e6dc6", - "preSignature": "15794371cece6ece", - "postSignature": "1416ff7437729bd4", + "preSignature": "bada22fec79afdc7", + "postSignature": "04ba1966c1ae1c5f", "actions": [ { "tool": "fill", @@ -30,14 +30,14 @@ } ], "expect": "snapshot.field_with_text(\"rustam.zeinalov@callstack.com\").exists", - "recordedAt": "2026-05-11T09:35:09.430Z", + "recordedAt": "2026-05-11T10:29:23.673Z", "runId": "25659967543" }, { "stepNumber": 3, "stepTextHash": "a071b334a6b8c0f4", - "preSignature": "1416ff7437729bd4", - "postSignature": "2a958e751a6dab98", + "preSignature": "04ba1966c1ae1c5f", + "postSignature": "33d1e5d0787b275a", "actions": [ { "tool": "press", @@ -51,14 +51,14 @@ } ], "expect": "appstate.foreground == \"com.expensify.chat.dev\"", - "recordedAt": "2026-05-11T09:35:29.243Z", + "recordedAt": 
"2026-05-11T10:29:23.673Z", "runId": "25659967543" }, { "stepNumber": 4, "stepTextHash": "a1059919be2f42c9", - "preSignature": "2a958e751a6dab98", - "postSignature": "2a958e751a6dab98", + "preSignature": "33d1e5d0787b275a", + "postSignature": "33d1e5d0787b275a", "actions": [ { "tool": "wait_for", @@ -67,7 +67,7 @@ } ], "expect": "snapshot.contains_text(\"Magic code\")", - "recordedAt": "2026-05-11T09:35:42.112Z", + "recordedAt": "2026-05-11T10:29:23.673Z", "runId": "25659967543" } ] From ccda58f16e743ea6444444eab3bbd8c5e78843fa Mon Sep 17 00:00:00 2001 From: Rustam Zeinalov Date: Mon, 11 May 2026 16:50:42 +0200 Subject: [PATCH 12/14] smoke(llm): convert TS line comments to block comments Consolidates consecutive `//` lines into single `/* */` blocks across all six engine files. Single-line and end-of-line comments become inline `/* */`. ESLint/TS/Prettier directives are preserved as `//` because they only work in that form. Strict tsc still clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-cli.ts | 62 ++-- .github/scripts/agent-device-expect.ts | 36 ++- .github/scripts/agent-device-llm-client.ts | 62 ++-- .github/scripts/agent-device-llm-driver.ts | 302 ++++++++++-------- .github/scripts/agent-device-replay-cache.ts | 26 +- .../agent-device-snapshot-signature.ts | 42 +-- 6 files changed, 292 insertions(+), 238 deletions(-) diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts index cd2287b4e0f5..d276227afbd5 100644 --- a/.github/scripts/agent-device-cli.ts +++ b/.github/scripts/agent-device-cli.ts @@ -1,16 +1,18 @@ -// Thin TypeScript wrapper around the `agent-device` CLI. -// -// Why this exists: the CLI emits accessibility-tree snapshots as -// human-readable text (`@e4 [text-field] "Phone or email," [editable]`). -// That format is fine for humans grepping artifacts but bad for an LLM -// because: -// 1. The LLM has to re-tokenize the structure on every turn — wasteful. -// 2. 
Subtle whitespace/quoting differences across platforms (Android's -// trailing comma vs iOS's no comma) leak into the LLM's reasoning. -// 3. Phantom hallucinated refs are harder to detect against free text. -// -// We parse once here, hand the LLM a typed JSON array, and keep the raw -// text in the artifact for post-mortem. +/* + * Thin TypeScript wrapper around the `agent-device` CLI. + * + * Why this exists: the CLI emits accessibility-tree snapshots as + * human-readable text (`@e4 [text-field] "Phone or email," [editable]`). + * That format is fine for humans grepping artifacts but bad for an LLM + * because: + * 1. The LLM has to re-tokenize the structure on every turn — wasteful. + * 2. Subtle whitespace/quoting differences across platforms (Android's + * trailing comma vs iOS's no comma) leak into the LLM's reasoning. + * 3. Phantom hallucinated refs are harder to detect against free text. + * + * We parse once here, hand the LLM a typed JSON array, and keep the raw + * text in the artifact for post-mortem. + */ import { execFileSync } from "child_process"; @@ -45,13 +47,15 @@ export type AppState = { const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; -// Bound every CLI invocation so a hung emulator can't wedge the smoke. -// 30s is generous for read-only commands (snapshot/screenshot/appstate). -// `fill` is special: typing a 30-char string into an editable on a -// 2-core ubuntu-latest under load was observed to exceed 30s (the -// CLI partial-typed and exited non-zero on timeout — visible at the -// device level via screenshot but the runner threw before recording -// the action). 90s gives ~3x headroom. +/* + * Bound every CLI invocation so a hung emulator can't wedge the smoke. + * 30s is generous for read-only commands (snapshot/screenshot/appstate). 
+ * `fill` is special: typing a 30-char string into an editable on a + * 2-core ubuntu-latest under load was observed to exceed 30s (the + * CLI partial-typed and exited non-zero on timeout — visible at the + * device level via screenshot but the runner threw before recording + * the action). 90s gives ~3x headroom. + */ const CLI_TIMEOUT_MS = 30_000; const CLI_FILL_TIMEOUT_MS = 90_000; @@ -147,7 +151,7 @@ export function parseAppState(raw: string): AppState { return { foregroundApp: fg?.[1], activity: act?.[1], raw }; } -// ---- public surface used by the runner ------------------------------- +/* ---- public surface used by the runner ------------------------------- */ export function snapshot(): Snapshot { return parseSnapshot(run(["snapshot", "-i", "--session", SESSION])); @@ -155,8 +159,10 @@ export function snapshot(): Snapshot { export function screenshotBase64(path: string): string { run(["screenshot", path, "--session", SESSION]); - // The CLI writes to disk; the runner reads + base64-encodes itself - // (we keep this wrapper free of fs to keep the signatures simple). + /* + * The CLI writes to disk; the runner reads + base64-encodes itself + * (we keep this wrapper free of fs to keep the signatures simple). + */ return path; } @@ -173,14 +179,16 @@ export function press(ref: string): void { } export function closeSession(): void { - // Idempotent — if there's no session, this is a no-op. + /* Idempotent — if there's no session, this is a no-op. */ tryRun(["close", "--session", SESSION]); } export function adbKey(keyEvent: number): void { - // Used by the LLM's `back()` and `dismiss_keyboard()` tools. We - // shell out to adb directly rather than agent-device because the - // CLI doesn't expose a keyevent primitive. + /* + * Used by the LLM's `back()` and `dismiss_keyboard()` tools. We + * shell out to adb directly rather than agent-device because the + * CLI doesn't expose a keyevent primitive. 
+ */ execFileSync("adb", ["shell", "input", "keyevent", String(keyEvent)], { timeout: CLI_TIMEOUT_MS, encoding: "utf8", diff --git a/.github/scripts/agent-device-expect.ts b/.github/scripts/agent-device-expect.ts index 95754e89fe49..b50b8d97de86 100644 --- a/.github/scripts/agent-device-expect.ts +++ b/.github/scripts/agent-device-expect.ts @@ -1,20 +1,22 @@ -// `expect:` DSL — machine-checked postcondition for each test step. -// -// Why a tiny DSL instead of letting the LLM self-report success: -// `step_complete(rationale)` is an LLM claim, not evidence. A canary -// that trusts an LLM's claim is a canary the LLM can lie to. The -// `expect:` clause is evaluated by deterministic TypeScript code -// against the post-state snapshot/appstate. The step fails red if -// `expect:` fails, regardless of what the LLM said. -// -// Grammar (intentionally small — extend only when a real test step -// can't be expressed): -// snapshot.contains_text("...") -// snapshot.field_with_text("...").exists -// appstate.foreground == "..." -// -// String literal: double-quoted, backslash-escapable. No interpolation, -// no regex, no boolean ops. If a step needs more, write a second step. +/* + * `expect:` DSL — machine-checked postcondition for each test step. + * + * Why a tiny DSL instead of letting the LLM self-report success: + * `step_complete(rationale)` is an LLM claim, not evidence. A canary + * that trusts an LLM's claim is a canary the LLM can lie to. The + * `expect:` clause is evaluated by deterministic TypeScript code + * against the post-state snapshot/appstate. The step fails red if + * `expect:` fails, regardless of what the LLM said. + * + * Grammar (intentionally small — extend only when a real test step + * can't be expressed): + * snapshot.contains_text("...") + * snapshot.field_with_text("...").exists + * appstate.foreground == "..." + * + * String literal: double-quoted, backslash-escapable. No interpolation, + * no regex, no boolean ops. 
If a step needs more, write a second step. + */ import type { AppState, Snapshot } from "./agent-device-cli"; diff --git a/.github/scripts/agent-device-llm-client.ts b/.github/scripts/agent-device-llm-client.ts index 9e60b5dd201a..96d449d9b63f 100644 --- a/.github/scripts/agent-device-llm-client.ts +++ b/.github/scripts/agent-device-llm-client.ts @@ -1,20 +1,22 @@ -// Thin client for the Anthropic /v1/messages endpoint. -// -// Decisions baked in: -// - Direct `fetch` instead of `@anthropic-ai/sdk` to avoid a new -// dependency on a CI-only path. Node 20 has fetch built in. -// - Prompt caching (`cache_control: {type: "ephemeral"}`) on the -// system message and the last tool definition. The system + tool -// surface is static across the run, so cache hit rate after step 1 -// is ~100%, cutting per-call cost by 5-10x. The 5-minute TTL fits -// a single CI run with margin. -// - Bounded exponential backoff with jitter for 429/500/502/503/529. -// The runner's caller decides what to do on final failure (typically -// fall back to a deterministic bash-style assertion); this client -// never silently degrades. -// - Token budget kill-switch: total input+output tokens accumulated -// across the run; throw if exceeded. Bounds runaway spend if a -// prompt or tool design accidentally explodes context. +/* + * Thin client for the Anthropic /v1/messages endpoint. + * + * Decisions baked in: + * - Direct `fetch` instead of `@anthropic-ai/sdk` to avoid a new + * dependency on a CI-only path. Node 20 has fetch built in. + * - Prompt caching (`cache_control: {type: "ephemeral"}`) on the + * system message and the last tool definition. The system + tool + * surface is static across the run, so cache hit rate after step 1 + * is ~100%, cutting per-call cost by 5-10x. The 5-minute TTL fits + * a single CI run with margin. + * - Bounded exponential backoff with jitter for 429/500/502/503/529. 
+ * The runner's caller decides what to do on final failure (typically + * fall back to a deterministic bash-style assertion); this client + * never silently degrades. + * - Token budget kill-switch: total input+output tokens accumulated + * across the run; throw if exceeded. Bounds runaway spend if a + * prompt or tool design accidentally explodes context. + */ export type AnthropicTool = { name: string; @@ -113,10 +115,12 @@ export class AnthropicClient { messages: AnthropicMessage[]; maxTokens?: number; }): Promise { - // Mark system + last tool as cacheable. Anthropic caches the - // contiguous prefix UP TO each `cache_control` marker, so two - // markers means "cache through end of system" and "cache - // through end of tools" as separate cached prefixes. + /* + * Mark system + last tool as cacheable. Anthropic caches the + * contiguous prefix UP TO each `cache_control` marker, so two + * markers means "cache through end of system" and "cache + * through end of tools" as separate cached prefixes. + */ const cachedTools = args.tools.map((t, i) => i === args.tools.length - 1 ? { ...t, cache_control: { type: "ephemeral" as const } } @@ -138,9 +142,11 @@ export class AnthropicClient { messages: args.messages, }; - // Verbose diagnostic mode: capture the full message thread + tool_use - // calls in the trace. Trade-off is artifact size and a small risk - // of leaking content the user typed; disabled unless DEBUG_LLM=1. + /* + * Verbose diagnostic mode: capture the full message thread + tool_use + * calls in the trace. Trade-off is artifact size and a small risk + * of leaking content the user typed; disabled unless DEBUG_LLM=1. + */ const verbose = (process.env.DEBUG_LLM ?? 
"") === "1"; if (verbose) { const lastUser = args.messages @@ -239,9 +245,11 @@ export class AnthropicClient { } private accountForUsage(usage: AnthropicResponse["usage"]): void { - // Cache reads cost roughly 10% of normal input tokens, but for - // budget-protection purposes we count them at face value — - // budgets are about runaway prompt design, not pricing. + /* + * Cache reads cost roughly 10% of normal input tokens, but for + * budget-protection purposes we count them at face value — + * budgets are about runaway prompt design, not pricing. + */ this.tokensUsed += usage.input_tokens + usage.output_tokens; if (this.tokensUsed > this.opts.tokenBudget) { throw new TokenBudgetExceededError( diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 7c73ea89fcbd..54a445a939d5 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -1,35 +1,37 @@ -// Phase-1 LLM-driven Android smoke runner. -// -// Lifecycle inside the workflow's emulator-runner `script:` block: -// -// 1. Boot dance (deterministic, NOT LLM-driven): -// - close any stale agent-device session -// - locate dev APK from android/app/build/outputs/... -// - adb install -// - adb reverse tcp:8081 tcp:8081 (Metro reachable from emulator) -// - npm start & (Metro background) -// - poll /status until packager-status:running -// - agent-device open --relaunch (cold start) -// -// 2. Test-case execution: -// - parse test case (numbered steps + optional `expect:` lines) -// - per step: cache-first / LLM-fallback / bash-fallback ladder -// - assert post-state via `expect:` evaluator -// - write artifacts (screenshots, snapshots, llm-trace, cache-diff) -// -// 3. 
Cleanup (always — even on signal/error): -// - dump logcat once -// - close agent-device session (so re-runs aren't tripped by the -// "session already bound" guard) -// - kill background jobs (Metro) -// -// Why a TS runner instead of Python or Bash: -// - The repo already runs ts-node in CI (precedent: createDocsRoutes.ts). -// - Reusing the snapshot parser + signature + expect DSL across -// replay / LLM / bash paths means one source of truth for what -// "the SignIn screen is on screen" means — a divergence between -// "what bash sees" and "what the LLM sees" would be a class of -// bugs we don't want. +/* + * Phase-1 LLM-driven Android smoke runner. + * + * Lifecycle inside the workflow's emulator-runner `script:` block: + * + * 1. Boot dance (deterministic, NOT LLM-driven): + * - close any stale agent-device session + * - locate dev APK from android/app/build/outputs/... + * - adb install + * - adb reverse tcp:8081 tcp:8081 (Metro reachable from emulator) + * - npm start & (Metro background) + * - poll /status until packager-status:running + * - agent-device open --relaunch (cold start) + * + * 2. Test-case execution: + * - parse test case (numbered steps + optional `expect:` lines) + * - per step: cache-first / LLM-fallback / bash-fallback ladder + * - assert post-state via `expect:` evaluator + * - write artifacts (screenshots, snapshots, llm-trace, cache-diff) + * + * 3. Cleanup (always — even on signal/error): + * - dump logcat once + * - close agent-device session (so re-runs aren't tripped by the + * "session already bound" guard) + * - kill background jobs (Metro) + * + * Why a TS runner instead of Python or Bash: + * - The repo already runs ts-node in CI (precedent: createDocsRoutes.ts). 
+ * - Reusing the snapshot parser + signature + expect DSL across + * replay / LLM / bash paths means one source of truth for what + * "the SignIn screen is on screen" means — a divergence between + * "what bash sees" and "what the LLM sees" would be a class of + * bugs we don't want. + */ import { execFileSync, spawn } from "child_process"; import fs from "fs"; @@ -54,7 +56,7 @@ import type { AnthropicMessage, } from "./agent-device-llm-client"; -// ---- config ----------------------------------------------------------- +/* ---- config ----------------------------------------------------------- */ const MODEL = process.env.ANTHROPIC_MODEL ?? "claude-sonnet-4-6"; const TOKEN_BUDGET = Number(process.env.LLM_TOKEN_BUDGET ?? 200_000); @@ -67,23 +69,27 @@ const CACHE_PATH = process.env.LLM_CACHE_PATH ?? deriveCachePath(TEST_CASE_PATH); const APK_GLOB = "android/app/build/outputs/apk/development/debug"; const METRO_READY_TIMEOUT_MS = 120_000; -// 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). The -// first run on a fresh AVD-cache key is closer to a cold boot since -// the prime+run happens in two separate emulator-runner invocations -// and the snapshot-load overhead lands inside this budget. +/* + * 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). The + * first run on a fresh AVD-cache key is closer to a cold boot since + * the prime+run happens in two separate emulator-runner invocations + * and the snapshot-load overhead lands inside this budget. + */ const SIGNIN_LOAD_TIMEOUT_MS = 600_000; const BOOT_PROBE_INTERVAL_MS = 30_000; const STEP_WALL_CLOCK_BUDGET_MS = 60_000; const MAX_STATE_CHANGING_ACTIONS = 4; const SCREENSHOT_BUDGET_PER_RUN = 2; const TEXT_LENGTH_CAP = 200; -// DEBUG_LLM=1 makes both the LLM client (request/response bodies) -// and the runner (per-tool-dispatch entries) emit verbose entries to -// llm-trace.jsonl + stdout. Off by default to keep normal-run -// artifacts and CI stdout slim. 
+/* + * DEBUG_LLM=1 makes both the LLM client (request/response bodies) + * and the runner (per-tool-dispatch entries) emit verbose entries to + * llm-trace.jsonl + stdout. Off by default to keep normal-run + * artifacts and CI stdout slim. + */ const DEBUG_LLM = process.env.DEBUG_LLM === "1"; -// ---- types ------------------------------------------------------------ +/* ---- types ------------------------------------------------------------ */ type Step = { number: number; @@ -108,7 +114,7 @@ type ContentBlock = type ExecutedAction = CachedAction & { ref?: string }; -// ---- entry point ------------------------------------------------------ +/* ---- entry point ------------------------------------------------------ */ async function main(): Promise { fs.mkdirSync(ARTIFACTS_DIR, { recursive: true }); @@ -175,9 +181,11 @@ async function main(): Promise { } } - // Always write the recorded cache diff, even if it's identical. - // Reviewers want to see a clean (no-op) diff to know the canary - // ran end-to-end without UI drift. + /* + * Always write the recorded cache diff, even if it's identical. + * Reviewers want to see a clean (no-op) diff to know the canary + * ran end-to-end without UI drift. 
+ */ const diffText = cache.diff(committed, recorded); fs.writeFileSync( path.join(ARTIFACTS_DIR, "cache-diff.txt"), @@ -196,7 +204,7 @@ async function main(): Promise { } } -// ---- test case parser ------------------------------------------------- +/* ---- test case parser ------------------------------------------------- */ function parseTestCase(raw: string): Step[] { const steps: Step[] = []; @@ -226,7 +234,7 @@ function parseTestCase(raw: string): Step[] { return steps; } -// ---- boot dance (matches Phase 0's bash) ------------------------------ +/* ---- boot dance (matches Phase 0's bash) ------------------------------ */ async function bootApp(): Promise { log("boot: closing stale session"); @@ -249,15 +257,17 @@ async function bootApp(): Promise { stdio: "inherit", }); - // Pre-emptive ANR suppression. On the 2-core ubuntu-latest runner - // the Pixel Launcher routinely ANRs under the combined load of - // Metro + APK launch + agent-device. The system normally shows a - // blocking "isn't responding" dialog that hides our app behind it. - // Setting hide_error_dialogs=1 makes the OS suppress those dialogs - // (the underlying ANR still happens but the foreground app keeps - // running uncovered). Best-effort — if the property doesn't exist - // on this Android version, fall through and let the in-loop - // recovery handle it. + /* + * Pre-emptive ANR suppression. On the 2-core ubuntu-latest runner + * the Pixel Launcher routinely ANRs under the combined load of + * Metro + APK launch + agent-device. The system normally shows a + * blocking "isn't responding" dialog that hides our app behind it. + * Setting hide_error_dialogs=1 makes the OS suppress those dialogs + * (the underlying ANR still happens but the foreground app keeps + * running uncovered). Best-effort — if the property doesn't exist + * on this Android version, fall through and let the in-loop + * recovery handle it. 
+ */ try { execFileSync( "adb", @@ -265,17 +275,19 @@ async function bootApp(): Promise { { timeout: 5_000, stdio: "ignore" }, ); } catch { - // best effort + /* best effort */ } - // Disable Android Autofill globally. Without this, the framework - // silently populates editable fields (email, password, etc.) when - // they gain focus and a credential is cached on the AVD. That - // makes the LLM appear to "succeed" with just a press call — - // recorded cache misses the actual fill action, and replay on a - // different AVD snapshot (where autofill state has rotated) - // breaks because press alone no longer suffices. Forcing the LLM - // to explicitly fill makes both record and replay deterministic. + /* + * Disable Android Autofill globally. Without this, the framework + * silently populates editable fields (email, password, etc.) when + * they gain focus and a credential is cached on the AVD. That + * makes the LLM appear to "succeed" with just a press call — + * recorded cache misses the actual fill action, and replay on a + * different AVD snapshot (where autofill state has rotated) + * breaks because press alone no longer suffices. Forcing the LLM + * to explicitly fill makes both record and replay deterministic. + */ try { execFileSync( "adb", @@ -283,7 +295,7 @@ async function bootApp(): Promise { { timeout: 5_000, stdio: "ignore" }, ); } catch { - // best effort + /* best effort */ } log("boot: starting Metro"); @@ -319,13 +331,15 @@ async function bootApp(): Promise { }, ); - // Bounded wait for the SignIn UI to hydrate. The LLM can technically - // poll for it itself in step 1, but on slow runners that would burn - // LLM budget on what's effectively boot-blocking emulator wait time. - // We dump a probe snapshot every 30s during the wait so post-mortem - // can see *what* the app was showing if the wait times out — the - // first run of this workflow had no such artifacts and the failure - // was undebuggable from the upload. 
+ /* + * Bounded wait for the SignIn UI to hydrate. The LLM can technically + * poll for it itself in step 1, but on slow runners that would burn + * LLM budget on what's effectively boot-blocking emulator wait time. + * We dump a probe snapshot every 30s during the wait so post-mortem + * can see *what* the app was showing if the wait times out — the + * first run of this workflow had no such artifacts and the failure + * was undebuggable from the upload. + */ log("boot: waiting for SignIn UI"); const start = Date.now(); let probeIdx = 0; @@ -335,9 +349,11 @@ async function bootApp(): Promise { try { snap = adCli.snapshot(); } catch (e) { - // Don't let a single transient snapshot timeout kill the wait — - // the emulator may be under heavy load and the next poll will - // probably succeed. + /* + * Don't let a single transient snapshot timeout kill the wait — + * the emulator may be under heavy load and the next poll will + * probably succeed. + */ log( `boot: snapshot threw (${(e as Error).message.slice(0, 80)}); retrying`, ); @@ -352,15 +368,17 @@ async function bootApp(): Promise { ); return; } - // ANR-recovery: when the runner is memory-pressured the system - // shows a "Pixel Launcher isn't responding" dialog over our app. - // Press "Wait" to dismiss, then force-stop + relaunch via - // agent-device. Plain `am start` was insufficient: if the ANR - // hit during JS bundle delivery, MainActivity was in a half- - // initialised state and `am start` just brought that broken - // activity to the foreground (run 25560886459 stuck on splash - // for 600s after recovering from an ANR via am start). Force- - // stop guarantees a clean process spawn for the next launch. + /* + * ANR-recovery: when the runner is memory-pressured the system + * shows a "Pixel Launcher isn't responding" dialog over our app. + * Press "Wait" to dismiss, then force-stop + relaunch via + * agent-device. 
Plain `am start` was insufficient: if the ANR + * hit during JS bundle delivery, MainActivity was in a half- + * initialised state and `am start` just brought that broken + * activity to the foreground (run 25560886459 stuck on splash + * for 600s after recovering from an ANR via am start). Force- + * stop guarantees a clean process spawn for the next launch. + */ if (isAnrDialog(snap)) { log("boot: ANR dialog detected — dismissing and force-relaunching app"); try { @@ -403,7 +421,7 @@ async function bootApp(): Promise { } catch (e) { log(`boot: relaunch failed: ${(e as Error).message.slice(0, 80)}`); } - // Give the process a moment to come back up before re-snapshotting. + /* Give the process a moment to come back up before re-snapshotting. */ await sleep(3_000); continue; } @@ -421,8 +439,10 @@ async function bootApp(): Promise { } await sleep(6_000); } - // Capture as much state as we can BEFORE failing so a re-run isn't - // required to debug. The cleanup trap will still write logcat after. + /* + * Capture as much state as we can BEFORE failing so a re-run isn't + * required to debug. The cleanup trap will still write logcat after. + */ try { const snap = adCli.snapshot(); fs.writeFileSync( @@ -474,7 +494,7 @@ async function waitForMetro(): Promise { return; } } catch { - // Metro not up yet + /* Metro not up yet */ } await sleep(2_000); } @@ -483,7 +503,7 @@ async function waitForMetro(): Promise { ); } -// ---- per-step orchestration ------------------------------------------- +/* ---- per-step orchestration ------------------------------------------- */ type StepCtx = { committed: CacheV1; @@ -565,13 +585,15 @@ async function executeStep( } ctx.stats.onBashRun(); actions = bashResult.actions; - // Settle gap: agent-device fill returns once it has dispatched - // the typing command, but the on-device EditText needs a beat for - // React Native's onChange to fire and the accessibility tree to - // re-publish the new text. 
Without this, verifyPostState below - // takes a snapshot before the typed text has propagated and the - // expect predicate fails on what's transient lag, not a real - // problem. + /* + * Settle gap: agent-device fill returns once it has dispatched + * the typing command, but the on-device EditText needs a beat for + * React Native's onChange to fire and the accessibility tree to + * re-publish the new text. Without this, verifyPostState below + * takes a snapshot before the typed text has propagated and the + * expect predicate fails on what's transient lag, not a real + * problem. + */ await sleep(500); } @@ -611,11 +633,13 @@ async function verifyPostState( const snap = adCli.snapshot(); const app = adCli.appstate(); - // Expect (when declared) is the source of truth: it's a deterministic - // predicate over the live UI, while the post-signature is a structural - // hash that can drift on cosmetic re-renders, animation timing, or - // node-ordering changes that don't affect what the user actually sees. - // If expect passes, the step succeeded — drift becomes advisory. + /* + * Expect (when declared) is the source of truth: it's a deterministic + * predicate over the live UI, while the post-signature is a structural + * hash that can drift on cosmetic re-renders, animation timing, or + * node-ordering changes that don't affect what the user actually sees. + * If expect passes, the step succeeded — drift becomes advisory. + */ if (step.expect) { const ev = evaluateExpect(step.expect, snap, app); if (!ev.ok) { @@ -629,8 +653,10 @@ async function verifyPostState( return { ok: true, snap }; } - // No expect declared — fall back to signature equality so a cache-hit - // path still has *some* post-state check. + /* + * No expect declared — fall back to signature equality so a cache-hit + * path still has *some* post-state check. 
+ */ if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { return { ok: false, @@ -640,7 +666,7 @@ async function verifyPostState( return { ok: true, snap }; } -// ---- cache replay ----------------------------------------------------- +/* ---- cache replay ----------------------------------------------------- */ async function replayCachedActions( actions: CachedAction[], @@ -650,9 +676,11 @@ async function replayCachedActions( if (!ok.ok) { return ok; } - // Tiny settle gap — even on warm runners, fill→press in - // immediate succession occasionally lands the press before - // React has propagated the fill. + /* + * Tiny settle gap — even on warm runners, fill→press in + * immediate succession occasionally lands the press before + * React has propagated the fill. + */ await sleep(150); } return { ok: true }; @@ -718,7 +746,7 @@ async function runWaitFor( }; } -// ---- LLM step --------------------------------------------------------- +/* ---- LLM step --------------------------------------------------------- */ const SYSTEM_PROMPT = [ "You are an autonomous mobile UI test runner driving the Expensify Android app via the agent-device CLI.", @@ -1008,13 +1036,15 @@ async function runLLMStep( messages.push({ role: "user", content: toolResults }); - // Refresh snap + appstate after every batch of tool calls that - // changed device state. Without this the LLM keeps seeing the - // pre-step snapshot even after its fill/press took effect, so - // identical fills get caught by the seen-hash dedup and the LLM - // burns its budget retrying actions it already performed. - // dispatchTool's snapshot/wait_for/back/dismiss callbacks already - // refresh; fill and press do not. + /* + * Refresh snap + appstate after every batch of tool calls that + * changed device state. 
Without this the LLM keeps seeing the + * pre-step snapshot even after its fill/press took effect, so + * identical fills get caught by the seen-hash dedup and the LLM + * burns its budget retrying actions it already performed. + * dispatchTool's snapshot/wait_for/back/dismiss callbacks already + * refresh; fill and press do not. + */ if ( toolUses.some( (tu) => tu.name === "fill" || tu.name === "press" || tu.name === "wait", @@ -1024,7 +1054,7 @@ async function runLLMStep( snap = adCli.snapshot(); app = adCli.appstate(); } catch (e) { - // Transient — next loop iteration will retry implicitly. + /* Transient — next loop iteration will retry implicitly. */ log( `runLLMStep: post-action snap refresh threw (${(e as Error).message.slice(0, 80)}); continuing with stale snap`, ); @@ -1118,7 +1148,7 @@ function describeExecutedAction(a: ExecutedAction): string { return a.tool; } -// ---- LLM tool dispatch ------------------------------------------------ +/* ---- LLM tool dispatch ------------------------------------------------ */ type DispatchCtx = { snap: Snapshot; @@ -1307,17 +1337,19 @@ function takeScreenshot(filename: string): string { return fs.readFileSync(p).toString("base64"); } -// ---- bash fallback ---------------------------------------------------- - -// Mirrors Phase 0's bash logic for the SignIn flow. Used when: -// - ANTHROPIC_API_KEY is missing -// - The Anthropic API exhausts retries with HTTP errors -// - The LLM gives up via step_failed (rare; mostly defensive) -// -// Only the SignIn-flow steps are covered. Adding a new test case -// without LLM access requires extending this map. That's intentional: -// the bash fallback is a safety net for known flows, not a generic -// drop-in for the LLM. +/* ---- bash fallback ---------------------------------------------------- */ + +/* + * Mirrors Phase 0's bash logic for the SignIn flow. 
Used when: + * - ANTHROPIC_API_KEY is missing + * - The Anthropic API exhausts retries with HTTP errors + * - The LLM gives up via step_failed (rare; mostly defensive) + * + * Only the SignIn-flow steps are covered. Adding a new test case + * without LLM access requires extending this map. That's intentional: + * the bash fallback is a safety net for known flows, not a generic + * drop-in for the LLM. + */ async function runBashFallback( step: Step, @@ -1327,7 +1359,7 @@ async function runBashFallback( const text = step.text.toLowerCase(); if (text.includes("wait") && text.includes("signin")) { - // Boot dance already gated on this; an instant pass is fine. + /* Boot dance already gated on this; an instant pass is fine. */ return { ok: true, actions: [] }; } @@ -1405,7 +1437,7 @@ async function runBashFallback( }; } -// ---- cleanup ---------------------------------------------------------- +/* ---- cleanup ---------------------------------------------------------- */ const backgroundPids: number[] = []; let cleanedUp = false; @@ -1437,14 +1469,14 @@ function registerCleanup(): void { }, ); } catch { - // best effort + /* best effort */ } adCli.closeSession(); for (const pid of backgroundPids) { try { process.kill(-pid, "SIGTERM"); } catch { - // already gone + /* already gone */ } } }; @@ -1459,7 +1491,7 @@ function registerCleanup(): void { }); } -// ---- helpers ---------------------------------------------------------- +/* ---- helpers ---------------------------------------------------------- */ function deriveCachePath(testCasePath: string): string { const base = path.basename(testCasePath, path.extname(testCasePath)); diff --git a/.github/scripts/agent-device-replay-cache.ts b/.github/scripts/agent-device-replay-cache.ts index bf33b24cf5aa..1f60599b3d53 100644 --- a/.github/scripts/agent-device-replay-cache.ts +++ b/.github/scripts/agent-device-replay-cache.ts @@ -1,15 +1,17 @@ -// Replay cache for the LLM-driven smoke. 
-// -// Without this cache, every PR run pays the LLM round-trip cost on -// every step. Worse, every run is non-deterministic. With it, the -// happy path costs ~$0 and runs deterministically; only when the -// snapshot signature changes (real UI shape change) do we fall -// through to the LLM. -// -// The cache file lives at `tests/smoke/cache/.json` and -// is committed. The diff in code review is the human-readable -// signal that "the SignIn UI shape changed" — the property -// reviewers want to see. +/* + * Replay cache for the LLM-driven smoke. + * + * Without this cache, every PR run pays the LLM round-trip cost on + * every step. Worse, every run is non-deterministic. With it, the + * happy path costs ~$0 and runs deterministically; only when the + * snapshot signature changes (real UI shape change) do we fall + * through to the LLM. + * + * The cache file lives at `tests/smoke/cache/.json` and + * is committed. The diff in code review is the human-readable + * signal that "the SignIn UI shape changed" — the property + * reviewers want to see. + */ import { createHash } from "crypto"; import fs from "fs"; diff --git a/.github/scripts/agent-device-snapshot-signature.ts b/.github/scripts/agent-device-snapshot-signature.ts index c61affd8d10f..d6681863e6e3 100644 --- a/.github/scripts/agent-device-snapshot-signature.ts +++ b/.github/scripts/agent-device-snapshot-signature.ts @@ -1,23 +1,25 @@ -// Structural signature of a UI snapshot. -// -// The signature is the cache key for the replay system: cache hits replay -// recorded actions, cache misses fall back to the LLM. For that to work, -// the signature must be: -// -// 1. STABLE across cosmetic UI changes — locale rotation, A/B copy -// tests, visible user data, dynamic timestamps. We exclude visible -// `text` content for this reason. 
A label changing from -// "Continue" to "Submit" must NOT bust the cache (the replay layer -// finds the button by role + position, then the LLM recovery layer -// handles a real shape change if any). -// -// 2. SENSITIVE to structural change — a new button appearing, an -// input becoming non-editable, a screen transitioning to a -// different layout. These are the events that invalidate a -// recorded action sequence. -// -// Net effect: localization or copy churn doesn't trigger an LLM call, -// but real UI shape change does. +/* + * Structural signature of a UI snapshot. + * + * The signature is the cache key for the replay system: cache hits replay + * recorded actions, cache misses fall back to the LLM. For that to work, + * the signature must be: + * + * 1. STABLE across cosmetic UI changes — locale rotation, A/B copy + * tests, visible user data, dynamic timestamps. We exclude visible + * `text` content for this reason. A label changing from + * "Continue" to "Submit" must NOT bust the cache (the replay layer + * finds the button by role + position, then the LLM recovery layer + * handles a real shape change if any). + * + * 2. SENSITIVE to structural change — a new button appearing, an + * input becoming non-editable, a screen transitioning to a + * different layout. These are the events that invalidate a + * recorded action sequence. + * + * Net effect: localization or copy churn doesn't trigger an LLM call, + * but real UI shape change does. 
+ */ import { createHash } from "crypto"; import type { Snapshot, SnapshotNode } from "./agent-device-cli"; From e1d22289f5f976651d4d7310b7464523e151c225 Mon Sep 17 00:00:00 2001 From: Rustam Zeinalov Date: Thu, 14 May 2026 11:09:19 +0200 Subject: [PATCH 13/14] smoke(llm): extract Platform interface (Phase 2 prep, Android no-op) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pulls Android-specific boot dance, blocking-dialog recovery, keyevent dispatch, and log dumping out of the driver into a new agent-device-platform.ts module behind a small Platform interface. AndroidPlatform is a verbatim move of today's inlined logic. The fork-test smoke must produce byte-identical artifacts before and after this change — that's the success criterion for landing this PR. The cache and recorded actions reference the same locator shapes; only the call chain changed. Why now: Phase 2 (iOS Simulator smoke) needs an IOSPlatform sibling to AndroidPlatform. Doing the refactor in isolation keeps the iOS PR diff focused on the iOS-specific code. Files: - .github/scripts/agent-device-platform.ts (new, ~290 LOC) - .github/scripts/agent-device-llm-driver.ts (-185 LOC, delegates to platform.foo()) - .github/scripts/agent-device-cli.ts (adbKey marked @deprecated; retained for skill-bundled replay-only.ts compat) The driver auto-detects platform from PLATFORM env (defaults to 'android'). PLATFORM=ios is intentionally not implemented in this PR — throws with a clear "lands in PR B" message. Strict tsc clean across all modules. No new dependencies. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/scripts/agent-device-cli.ts | 12 +- .github/scripts/agent-device-llm-driver.ts | 216 +++---------- .github/scripts/agent-device-platform.ts | 351 +++++++++++++++++++++ 3 files changed, 394 insertions(+), 185 deletions(-) create mode 100644 .github/scripts/agent-device-platform.ts diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts index d276227afbd5..c70c76b3938f 100644 --- a/.github/scripts/agent-device-cli.ts +++ b/.github/scripts/agent-device-cli.ts @@ -183,12 +183,14 @@ export function closeSession(): void { tryRun(["close", "--session", SESSION]); } +/** + * @deprecated Prefer `platform.back()` / `platform.dismissKeyboard()` + * from `./agent-device-platform`. Kept exported because the skill- + * bundled `replay-only.ts` helper (on a separate branch) still + * imports it; the upstream-bound driver no longer calls this + * directly — keyevent dispatch is now platform-specific. + */ export function adbKey(keyEvent: number): void { - /* - * Used by the LLM's `back()` and `dismiss_keyboard()` tools. We - * shell out to adb directly rather than agent-device because the - * CLI doesn't expose a keyevent primitive. - */ execFileSync("adb", ["shell", "input", "keyevent", String(keyEvent)], { timeout: CLI_TIMEOUT_MS, encoding: "utf8", diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts index 54a445a939d5..be8598218f90 100644 --- a/.github/scripts/agent-device-llm-driver.ts +++ b/.github/scripts/agent-device-llm-driver.ts @@ -33,7 +33,7 @@ * bugs we don't want. 
*/ -import { execFileSync, spawn } from "child_process"; +import { execFileSync } from "child_process"; import fs from "fs"; import path from "path"; import * as adCli from "./agent-device-cli"; @@ -55,19 +55,25 @@ import type { AnthropicTool, AnthropicMessage, } from "./agent-device-llm-client"; +import { + detectPlatform, + startMetro, + locateBundle, + backgroundPids, +} from "./agent-device-platform"; +import type { Platform } from "./agent-device-platform"; /* ---- config ----------------------------------------------------------- */ const MODEL = process.env.ANTHROPIC_MODEL ?? "claude-sonnet-4-6"; const TOKEN_BUDGET = Number(process.env.LLM_TOKEN_BUDGET ?? 200_000); -const APP_PACKAGE = process.env.APP_PACKAGE ?? "com.expensify.chat.dev"; const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; const ARTIFACTS_DIR = process.env.ARTIFACTS_DIR ?? "artifacts"; const TEST_CASE_PATH = process.argv[2] ?? "tests/smoke/android-signin.testcase.txt"; const CACHE_PATH = process.env.LLM_CACHE_PATH ?? deriveCachePath(TEST_CASE_PATH); -const APK_GLOB = "android/app/build/outputs/apk/development/debug"; +const platform: Platform = detectPlatform(); const METRO_READY_TIMEOUT_MS = 120_000; /* * 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). The @@ -237,99 +243,32 @@ function parseTestCase(raw: string): Step[] { /* ---- boot dance (matches Phase 0's bash) ------------------------------ */ async function bootApp(): Promise { + log(`boot: platform=${platform.name}`); log("boot: closing stale session"); adCli.closeSession(); - log("boot: locating APK"); - const apkDir = APK_GLOB; - const files = fs.existsSync(apkDir) - ? 
fs.readdirSync(apkDir).filter((f) => f.endsWith(".apk")) - : []; - if (!files.length) { - fail(`no APK found under ${apkDir} — Rock build step likely failed`); - } - const apk = path.join(apkDir, files[0]); - log(`boot: installing ${apk}`); - execFileSync("adb", ["install", "-r", "-d", "-t", apk], { stdio: "inherit" }); - - log("boot: adb reverse 8081"); - execFileSync("adb", ["reverse", "tcp:8081", "tcp:8081"], { - stdio: "inherit", - }); - - /* - * Pre-emptive ANR suppression. On the 2-core ubuntu-latest runner - * the Pixel Launcher routinely ANRs under the combined load of - * Metro + APK launch + agent-device. The system normally shows a - * blocking "isn't responding" dialog that hides our app behind it. - * Setting hide_error_dialogs=1 makes the OS suppress those dialogs - * (the underlying ANR still happens but the foreground app keeps - * running uncovered). Best-effort — if the property doesn't exist - * on this Android version, fall through and let the in-loop - * recovery handle it. - */ - try { - execFileSync( - "adb", - ["shell", "settings", "put", "global", "hide_error_dialogs", "1"], - { timeout: 5_000, stdio: "ignore" }, + log("boot: locating app bundle"); + const bundle = locateBundle(platform); + if (!bundle) { + fail( + `no app bundle (*${platform.appBundleSuffix}) found under ${platform.appBundleDir} — build step likely failed`, ); - } catch { - /* best effort */ } + log(`boot: installing ${bundle}`); + platform.install(bundle); - /* - * Disable Android Autofill globally. Without this, the framework - * silently populates editable fields (email, password, etc.) when - * they gain focus and a credential is cached on the AVD. That - * makes the LLM appear to "succeed" with just a press call — - * recorded cache misses the actual fill action, and replay on a - * different AVD snapshot (where autofill state has rotated) - * breaks because press alone no longer suffices. 
Forcing the LLM - * to explicitly fill makes both record and replay deterministic. - */ - try { - execFileSync( - "adb", - ["shell", "settings", "put", "secure", "autofill_service", "null"], - { timeout: 5_000, stdio: "ignore" }, - ); - } catch { - /* best effort */ - } + log("boot: setupNetworking"); + platform.setupNetworking(); + + platform.preBootHardening(); log("boot: starting Metro"); - const metroLog = fs.openSync(path.join(ARTIFACTS_DIR, "metro.log"), "a"); - const metro = spawn("npm", ["start"], { - stdio: ["ignore", metroLog, metroLog], - detached: true, - }); - metro.unref(); - backgroundPids.push(metro.pid!); + startMetro(path.join(ARTIFACTS_DIR, "metro.log")); await waitForMetro(); log("boot: agent-device open --relaunch"); - const serial = execFileSync("adb", ["get-serialno"], { - encoding: "utf8", - }).trim(); - execFileSync( - "agent-device", - [ - "open", - APP_PACKAGE, - "--platform", - "android", - "--serial", - serial, - "--session", - SESSION, - "--relaunch", - ], - { - stdio: "inherit", - }, - ); + platform.launch(); /* * Bounded wait for the SignIn UI to hydrate. The LLM can technically @@ -369,58 +308,15 @@ async function bootApp(): Promise { return; } /* - * ANR-recovery: when the runner is memory-pressured the system - * shows a "Pixel Launcher isn't responding" dialog over our app. - * Press "Wait" to dismiss, then force-stop + relaunch via - * agent-device. Plain `am start` was insufficient: if the ANR - * hit during JS bundle delivery, MainActivity was in a half- - * initialised state and `am start` just brought that broken - * activity to the foreground (run 25560886459 stuck on splash - * for 600s after recovering from an ANR via am start). Force- - * stop guarantees a clean process spawn for the next launch. + * Blocking-dialog recovery. Platform-specific detection + + * dismissal hides behind `tryDismissBlockingDialog`. Android: + * Pixel Launcher ANR dialog (Close app / Wait). iOS (PR B): + * system permission alerts. 
Either way, dismissed → force- + * relaunch the app so we don't poll against a half-initialised + * activity stuck behind the dismissed dialog. */ - if (isAnrDialog(snap)) { - log("boot: ANR dialog detected — dismissing and force-relaunching app"); - try { - const waitBtn = snap.nodes.find( - (n) => n.kind === "button" && n.text?.toLowerCase() === "wait", - ); - if (waitBtn) { - adCli.press(waitBtn.ref); - } - } catch (e) { - log(`boot: dismiss press failed: ${(e as Error).message.slice(0, 80)}`); - } - try { - execFileSync("adb", ["shell", "am", "force-stop", APP_PACKAGE], { - timeout: 5_000, - stdio: "ignore", - }); - } catch (e) { - log(`boot: force-stop failed: ${(e as Error).message.slice(0, 80)}`); - } - try { - const serial = execFileSync("adb", ["get-serialno"], { - encoding: "utf8", - }).trim(); - execFileSync( - "agent-device", - [ - "open", - APP_PACKAGE, - "--platform", - "android", - "--serial", - serial, - "--session", - SESSION, - "--relaunch", - ], - { timeout: 30_000, stdio: "ignore" }, - ); - } catch (e) { - log(`boot: relaunch failed: ${(e as Error).message.slice(0, 80)}`); - } + if (platform.tryDismissBlockingDialog(snap)) { + log("boot: blocking dialog dismissed + app force-relaunched"); /* Give the process a moment to come back up before re-snapshotting. */ await sleep(3_000); continue; @@ -461,23 +357,6 @@ async function bootApp(): Promise { fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`); } -/** - * Detects the Android system "isn't responding" dialog. The exact - * label varies (Pixel Launcher / com.android.systemui / etc.) so we - * match on the structural fingerprint: exactly two button nodes - * labelled "Close app" and "Wait". - */ -function isAnrDialog(snap: { - nodes: Array<{ kind: string; text?: string }>; -}): boolean { - const buttons = snap.nodes.filter((n) => n.kind === "button"); - if (buttons.length !== 2) { - return false; - } - const labels = buttons.map((b) => b.text?.toLowerCase() ?? 
"").sort(); - return labels[0] === "close app" && labels[1] === "wait"; -} - async function waitForMetro(): Promise { const start = Date.now(); while (Date.now() - start < METRO_READY_TIMEOUT_MS) { @@ -697,11 +576,11 @@ async function dispatchCachedAction( return await runWaitFor(action.predicate, action.timeoutMs); } if (action.tool === "back") { - adCli.adbKey(4); + platform.back(); return { ok: true }; } if (action.tool === "dismiss_keyboard") { - adCli.adbKey(111); + platform.dismissKeyboard(); return { ok: true }; } const snap = adCli.snapshot(); @@ -1299,12 +1178,12 @@ async function dispatchTool( }; } case "back": - adCli.adbKey(4); + platform.back(); ctx.executed.push({ tool: "back" }); ctx.onSnap(adCli.snapshot()); return { content: "back pressed" }; case "dismiss_keyboard": - adCli.adbKey(111); + platform.dismissKeyboard(); ctx.executed.push({ tool: "dismiss_keyboard" }); ctx.onSnap(adCli.snapshot()); return { content: "keyboard dismissed" }; @@ -1439,7 +1318,6 @@ async function runBashFallback( /* ---- cleanup ---------------------------------------------------------- */ -const backgroundPids: number[] = []; let cleanedUp = false; function registerCleanup(): void { @@ -1448,29 +1326,7 @@ function registerCleanup(): void { return; } cleanedUp = true; - try { - execFileSync( - "adb", - [ - "logcat", - "-d", - "-v", - "time", - "*:W", - "ReactNativeJS:V", - "ReactNative:V", - ], - { - stdio: [ - "ignore", - fs.openSync(path.join(ARTIFACTS_DIR, "logcat.txt"), "w"), - "ignore", - ], - }, - ); - } catch { - /* best effort */ - } + platform.dumpLogsToFile(path.join(ARTIFACTS_DIR, "logcat.txt")); adCli.closeSession(); for (const pid of backgroundPids) { try { diff --git a/.github/scripts/agent-device-platform.ts b/.github/scripts/agent-device-platform.ts new file mode 100644 index 000000000000..2bf816683c87 --- /dev/null +++ b/.github/scripts/agent-device-platform.ts @@ -0,0 +1,351 @@ +/* + * Platform abstraction for the LLM-driven smoke driver. 
+ * + * The per-step LLM loop, cache replay, expect predicate evaluator, + * signature hashing, and Anthropic client are platform-agnostic. + * Boot dance, blocking-dialog recovery, and a small set of keyevent + * tools are NOT. This module lifts the latter behind a tiny + * `Platform` interface so a new platform (iOS) can be added without + * touching the driver core. + * + * The current file ships ONE implementation — `AndroidPlatform` — + * which is a verbatim move of today's inlined logic in + * agent-device-llm-driver.ts. PR A is a refactor with zero behavior + * change; the matching Android fork-test run must produce the same + * artifacts as before. PR B (a follow-up) adds `IOSPlatform`. + */ + +import { execFileSync, spawn } from "child_process"; +import fs from "fs"; +import path from "path"; + +import * as adCli from "./agent-device-cli"; +import type { Snapshot } from "./agent-device-cli"; + +/* ---- shared types ---------------------------------------------------- */ + +export type PlatformName = "android" | "ios"; + +/** + * Operations the driver delegates to a Platform impl. Everything not + * listed here is shared across platforms and stays in the driver. + */ +export interface Platform { + readonly name: PlatformName; + + /** App bundle / package identifier passed to `agent-device open`. */ + readonly appPackage: string; + + /** + * Directory the runner searches for the installable bundle + * (APK on Android, .app on iOS). The first matching entry wins. + */ + readonly appBundleDir: string; + readonly appBundleSuffix: string; + + /** + * One-shot install of the located bundle. Throws on hard failure; + * the driver surfaces the error to the workflow log. + */ + install(bundlePath: string): void; + + /** + * Best-effort networking prep so Metro on the host is reachable + * from the device/sim. Android needs `adb reverse`; iOS Sim + * shares host loopback and this is a no-op. 
+ */ + setupNetworking(): void; + + /** + * Best-effort pre-launch hardening — disable autofill, suppress + * system error dialogs, etc. Implementations should swallow + * failures (a missing setting on a fresh AVD is fine). + */ + preBootHardening(): void; + + /** + * Launch the app via `agent-device open --relaunch`. Handles the + * platform-specific `--platform`/`--serial`/`--device` flag set. + */ + launch(): void; + + /** + * Force a clean relaunch — used by blocking-dialog recovery. + * Android: `am force-stop` + relaunch. iOS: `xcrun simctl + * terminate` + relaunch. + */ + forceRelaunch(): void; + + /** + * Detect a system-modal "blocking" dialog over the app. Android's + * ANR dialog and iOS's permission alerts share the shape: a + * small handful of system buttons whose conservative choice + * lets the app continue. Returns true if dismissed. + */ + tryDismissBlockingDialog(snap: Snapshot): boolean; + + /** + * Map LLM-facing `back()` / `dismiss_keyboard()` tool calls to + * platform-specific keyevents. + */ + back(): void; + dismissKeyboard(): void; + + /** + * Dump device logs to the given file. Called by the driver's + * cleanup trap on exit. Best-effort — missing logs must not + * fail the run. + */ + dumpLogsToFile(outPath: string): void; +} + +/* ---- session constant shared by all platforms ------------------------ */ + +const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; + +/* ---- Android implementation ----------------------------------------- */ + +class AndroidPlatform implements Platform { + readonly name = "android" as const; + readonly appPackage = process.env.APP_PACKAGE ?? 
"com.expensify.chat.dev"; + readonly appBundleDir = "android/app/build/outputs/apk/development/debug"; + readonly appBundleSuffix = ".apk"; + + install(apkPath: string): void { + execFileSync("adb", ["install", "-r", "-d", "-t", apkPath], { + stdio: "inherit", + }); + } + + setupNetworking(): void { + execFileSync("adb", ["reverse", "tcp:8081", "tcp:8081"], { + stdio: "inherit", + }); + } + + preBootHardening(): void { + /* + * Suppress system ANR dialogs. Without this, the Pixel + * Launcher's "isn't responding" dialog covers our app on + * the 2-core ubuntu-latest runner during heavy boot load. + * The underlying ANR still happens but the foreground app + * keeps running uncovered. + */ + try { + execFileSync( + "adb", + ["shell", "settings", "put", "global", "hide_error_dialogs", "1"], + { timeout: 5_000, stdio: "ignore" }, + ); + } catch { + /* best effort */ + } + + /* + * Disable Android Autofill globally. Without this, the + * framework silently populates editable fields when they + * gain focus and a credential is cached on the AVD — + * cache recording then misses the fill action and replay + * breaks on a different AVD snapshot. + */ + try { + execFileSync( + "adb", + ["shell", "settings", "put", "secure", "autofill_service", "null"], + { timeout: 5_000, stdio: "ignore" }, + ); + } catch { + /* best effort */ + } + } + + launch(): void { + execFileSync( + "agent-device", + [ + "open", + this.appPackage, + "--platform", + "android", + "--serial", + this.getSerial(), + "--session", + SESSION, + "--relaunch", + ], + { stdio: "inherit" }, + ); + } + + forceRelaunch(): void { + try { + execFileSync("adb", ["shell", "am", "force-stop", this.appPackage], { + timeout: 5_000, + stdio: "ignore", + }); + } catch (e) { + // Surface to caller via log line; not fatal. 
+ process.stdout.write( + `platform.android: force-stop failed: ${(e as Error).message.slice(0, 80)}\n`, + ); + } + try { + execFileSync( + "agent-device", + [ + "open", + this.appPackage, + "--platform", + "android", + "--serial", + this.getSerial(), + "--session", + SESSION, + "--relaunch", + ], + { timeout: 30_000, stdio: "ignore" }, + ); + } catch (e) { + process.stdout.write( + `platform.android: relaunch failed: ${(e as Error).message.slice(0, 80)}\n`, + ); + } + } + + tryDismissBlockingDialog(snap: Snapshot): boolean { + /* + * Android ANR dialog signature: exactly two buttons labelled + * "Close app" and "Wait". The label varies slightly + * (Pixel Launcher / com.android.systemui / etc.) but the + * structural fingerprint stays. + */ + const buttons = snap.nodes.filter((n) => n.kind === "button"); + if (buttons.length !== 2) { + return false; + } + const labels = buttons.map((b) => b.text?.toLowerCase() ?? "").sort(); + if (labels[0] !== "close app" || labels[1] !== "wait") { + return false; + } + try { + const waitBtn = snap.nodes.find( + (n) => n.kind === "button" && n.text?.toLowerCase() === "wait", + ); + if (waitBtn) { + adCli.press(waitBtn.ref); + } + } catch (e) { + process.stdout.write( + `platform.android: dismiss press failed: ${(e as Error).message.slice(0, 80)}\n`, + ); + } + this.forceRelaunch(); + return true; + } + + back(): void { + execFileSync("adb", ["shell", "input", "keyevent", "4"], { + timeout: 30_000, + encoding: "utf8", + }); + } + + dismissKeyboard(): void { + execFileSync("adb", ["shell", "input", "keyevent", "111"], { + timeout: 30_000, + encoding: "utf8", + }); + } + + dumpLogsToFile(outPath: string): void { + try { + execFileSync( + "adb", + [ + "logcat", + "-d", + "-v", + "time", + "*:W", + "ReactNativeJS:V", + "ReactNative:V", + ], + { + stdio: ["ignore", fs.openSync(outPath, "w"), "ignore"], + }, + ); + } catch { + /* best effort */ + } + } + + private getSerial(): string { + return execFileSync("adb", ["get-serialno"], { 
encoding: "utf8" }).trim(); + } +} + +/* ---- factory --------------------------------------------------------- */ + +/** + * Selects a Platform implementation. `PLATFORM` env var wins; defaults + * to 'android' for backwards compatibility with Phase 1. + */ +export function detectPlatform(): Platform { + const envName = (process.env.PLATFORM ?? "").toLowerCase().trim(); + if (envName === "ios") { + throw new Error( + "PLATFORM=ios requested but IOSPlatform is not implemented in this PR (Phase 2 PR A). It lands in PR B.", + ); + } + if (envName === "android" || envName === "") { + return new AndroidPlatform(); + } + throw new Error( + `unsupported PLATFORM='${envName}'; expected 'android' or 'ios'`, + ); +} + +/* ---- background process tracking ------------------------------------ */ + +/** + * Tracks PIDs the driver spawns (e.g. Metro) so the cleanup trap can + * terminate them on exit. Exported so the driver and the cleanup + * handler share state without circular imports. + */ +export const backgroundPids: number[] = []; + +/** + * Starts Metro and tracks its PID. Identical across platforms — both + * Android and iOS dev builds fetch the JS bundle from + * `http://localhost:8081/...`. + */ +export function startMetro(metroLogPath: string): void { + const metroLog = fs.openSync(metroLogPath, "a"); + const metro = spawn("npm", ["start"], { + stdio: ["ignore", metroLog, metroLog], + detached: true, + }); + metro.unref(); + if (metro.pid) { + backgroundPids.push(metro.pid); + } +} + +/** + * Resolve the installable bundle path under `platform.appBundleDir`. + * Returns the first match by name (sorted alphabetically), or null. + * The driver decides how to report a missing bundle. 
+ */ +export function locateBundle(platform: Platform): string | null { + if (!fs.existsSync(platform.appBundleDir)) { + return null; + } + const files = fs + .readdirSync(platform.appBundleDir) + .filter((f) => f.endsWith(platform.appBundleSuffix)) + .sort(); + if (!files.length) { + return null; + } + return path.join(platform.appBundleDir, files[0]); +} From 4dbe15163de4fb28c95f55eb3a3411f07dc8dd27 Mon Sep 17 00:00:00 2001 From: rustam-callstack <192424666+rustam-callstack@users.noreply.github.com> Date: Wed, 27 May 2026 11:05:13 +0200 Subject: [PATCH 14/14] =?UTF-8?q?smoke(ci):=20bump=20agent-device=20pin=20?= =?UTF-8?q?0.14.7=20=E2=86=92=200.15.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0.15.x fixes the SnapshotInstrumentation race that caused per-iteration time to balloon from 90s → 80+min over a 10-run batch in 0.14.x. Local stability: 0.14.9 = 5/10 PASS with degradation; 0.15.2 = 9/10 PASS, flat 90s/iter. Co-Authored-By: Claude Opus 4.7 --- .github/workflows/smokeAndroidLLM.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/smokeAndroidLLM.yml b/.github/workflows/smokeAndroidLLM.yml index 4c32238f5dba..53ad6e947865 100644 --- a/.github/workflows/smokeAndroidLLM.yml +++ b/.github/workflows/smokeAndroidLLM.yml @@ -41,7 +41,7 @@ jobs: runs-on: blacksmith-4vcpu-ubuntu-2404 timeout-minutes: 35 env: - AGENT_DEVICE_VERSION: "0.14.7" + AGENT_DEVICE_VERSION: "0.15.2" # Hard kill-switch: total input+output tokens accumulated across # the run. Bounds runaway spend if a prompt or tool design # accidentally explodes context. ~$1 worst-case at sonnet 4.6