diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts new file mode 100644 index 000000000000..c70c76b3938f --- /dev/null +++ b/.github/scripts/agent-device-cli.ts @@ -0,0 +1,207 @@ +/* + * Thin TypeScript wrapper around the `agent-device` CLI. + * + * Why this exists: the CLI emits accessibility-tree snapshots as + * human-readable text (`@e4 [text-field] "Phone or email," [editable]`). + * That format is fine for humans grepping artifacts but bad for an LLM + * because: + * 1. The LLM has to re-tokenize the structure on every turn — wasteful. + * 2. Subtle whitespace/quoting differences across platforms (Android's + * trailing comma vs iOS's no comma) leak into the LLM's reasoning. + * 3. Phantom hallucinated refs are harder to detect against free text. + * + * We parse once here, hand the LLM a typed JSON array, and keep the raw + * text in the artifact for post-mortem. + */ + +import { execFileSync } from "child_process"; + +/** + * One element in the parsed accessibility tree. The optional fields are + * absent when the underlying line lacked them; do NOT default to empty + * strings — the LLM uses presence/absence as a signal (e.g. a button with + * no text label is suspicious). + */ +export type SnapshotNode = { + ref: string; + kind: string; + text?: string; + editable: boolean; + enabled: boolean; + scrollable: boolean; +}; + +export type Snapshot = { + page?: string; + app?: string; + nodes: SnapshotNode[]; + nodeCount: number; + raw: string; +}; + +export type AppState = { + foregroundApp?: string; + activity?: string; + raw: string; +}; + +const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; + +/* + * Bound every CLI invocation so a hung emulator can't wedge the smoke. + * 30s is generous for read-only commands (snapshot/screenshot/appstate). 
+ * `fill` is special: typing a 30-char string into an editable on a
+ * 2-core ubuntu-latest under load was observed to exceed 30s (the
+ * CLI partial-typed and exited non-zero on timeout — visible at the
+ * device level via screenshot but the runner threw before recording
+ * the action). 90s gives ~3x headroom.
+ */
+const CLI_TIMEOUT_MS = 30_000;
+const CLI_FILL_TIMEOUT_MS = 90_000;
+
+/**
+ * Run the `agent-device` binary synchronously and hand back its stdout
+ * as UTF-8 text. The `fill` subcommand gets the longer time budget;
+ * every other subcommand gets the default one.
+ */
+function run(args: string[]): string {
+  const [subcommand] = args;
+  const budgetMs =
+    subcommand === "fill" ? CLI_FILL_TIMEOUT_MS : CLI_TIMEOUT_MS;
+  const stdout = execFileSync("agent-device", args, {
+    encoding: "utf8",
+    timeout: budgetMs,
+    maxBuffer: 8 * 1024 * 1024,
+  });
+  return stdout;
+}
+
+/**
+ * Non-throwing variant of `run`: a failed invocation is reported as
+ * `{ ok: false, stdout: "", error }` instead of propagating.
+ */
+function tryRun(args: string[]): {
+  stdout: string;
+  ok: boolean;
+  error?: Error;
+} {
+  let stdout: string;
+  try {
+    stdout = run(args);
+  } catch (failure) {
+    return { stdout: "", ok: false, error: failure as Error };
+  }
+  return { stdout, ok: true };
+}
+
+/**
+ * Parse a single snapshot line of the form:
+ *   `@e4 [text-field] "Phone or email," [editable]`
+ *   `@e5 [button] "Continue"`
+ *   `@e2 [scroll-area] [scrollable]`
+ *
+ * The `agent-device` CLI's text format isn't a stable contract, so this
+ * parser is deliberately permissive: anything that doesn't fit the shape
+ * is dropped (and counted in nodeCount via the header line, not by
+ * counting parsed children — so we don't quietly hide drift).
+ */
+function parseNodeLine(line: string): SnapshotNode | null {
+  const refMatch = line.match(/^@(e\d+)\s+\[([a-z-]+)\]/);
+  if (!refMatch) {
+    return null;
+  }
+  const [, refIndex, kind] = refMatch;
+  const after = line.slice(refMatch[0].length).trim();
+
+  let text: string | undefined;
+  const textMatch = after.match(/^"((?:[^"\\]|\\.)*)"/);
+  if (textMatch) {
+    /* Drop the single trailing comma some lines carry inside the quotes. */
+    text = textMatch[1].replace(/,$/, "");
+  }
+
+  /*
+   * Look for flag markers only in what follows the quoted label. Scanning
+   * the label too would let on-screen text such as "Tap [disabled] to
+   * retry" flip `enabled` (or `editable` / `scrollable`) for a node that
+   * carries no such flag.
+   */
+  const flagSource = textMatch ? after.slice(textMatch[0].length) : after;
+  const flags = flagSource.toLowerCase();
+  return {
+    ref: `@${refIndex}`,
+    kind,
+    text,
+    editable: flags.includes("[editable]"),
+    enabled: !flags.includes("[disabled]"),
+    scrollable: flags.includes("[scrollable]"),
+  };
+}
+
+/**
+ * Parse the CLI's full snapshot text. `Page:` and `App:` lines fill the
+ * optional header fields, a `Snapshot: <n>` line supplies `nodeCount`,
+ * and every other line is offered to `parseNodeLine`; lines it rejects
+ * are skipped. The unmodified input is returned as `raw`.
+ */
+export function parseSnapshot(raw: string): Snapshot {
+  const lines = raw.split("\n");
+  const nodes: SnapshotNode[] = [];
+  let page: string | undefined;
+  let app: string | undefined;
+  let nodeCount = 0;
+
+  for (const line of lines) {
+    if (line.startsWith("Page:")) {
+      page = line.slice("Page:".length).trim();
+      continue;
+    }
+    if (line.startsWith("App:")) {
+      app = line.slice("App:".length).trim();
+      continue;
+    }
+    const countMatch = line.match(/^Snapshot:\s*(\d+)/);
+    if (countMatch) {
+      nodeCount = Number(countMatch[1]);
+      continue;
+    }
+    const node = parseNodeLine(line.trim());
+    if (node) {
+      nodes.push(node);
+    }
+  }
+  return { page, app, nodes, nodeCount, raw };
+}
+
+/**
+ * Extract the first whitespace-delimited token after `Foreground app:`
+ * and after `Activity:`; either field is undefined when its label is
+ * not present in `raw`.
+ */
+export function parseAppState(raw: string): AppState {
+  const fg = raw.match(/Foreground app:\s*(\S+)/);
+  const act = raw.match(/Activity:\s*(\S+)/);
+  return { foregroundApp: fg?.[1], activity: act?.[1], raw };
+}
+
+/* ---- public surface used by the runner ------------------------------- */
+
+export function snapshot(): Snapshot {
+  return parseSnapshot(run(["snapshot", "-i", "--session", SESSION]));
+}
+
+/**
+ * Ask the CLI to write a screenshot to `path`. Despite the name, the
+ * return value is `path` itself, not base64 data.
+ */
+export function screenshotBase64(path: string): string {
+  run(["screenshot", path, "--session", SESSION]);
+  /*
+   * The CLI writes to disk; the runner reads + base64-encodes itself
+   * (we keep this wrapper free of fs to keep the signatures simple).
+   */
+  return path;
+}
+
+export function appstate(): AppState {
+  return parseAppState(run(["appstate", "--session", SESSION]));
+}
+
+export function fill(ref: string, text: string): void {
+  run(["fill", ref, text, "--session", SESSION]);
+}
+
+export function press(ref: string): void {
+  run(["press", ref, "--session", SESSION]);
+}
+
+export function closeSession(): void {
+  /* Idempotent — if there's no session, this is a no-op. */
+  tryRun(["close", "--session", SESSION]);
+}
+
+/**
+ * @deprecated Prefer `platform.back()` / `platform.dismissKeyboard()`
+ * from `./agent-device-platform`. Kept exported because the skill-
+ * bundled `replay-only.ts` helper (on a separate branch) still
+ * imports it; the upstream-bound driver no longer calls this
+ * directly — keyevent dispatch is now platform-specific.
+ */
+export function adbKey(keyEvent: number): void {
+  execFileSync("adb", ["shell", "input", "keyevent", String(keyEvent)], {
+    timeout: CLI_TIMEOUT_MS,
+    encoding: "utf8",
+  });
+}
+
+/**
+ * Find nodes whose text contains the given substring (case-insensitive).
+ * Side-effect-free; operates on a snapshot already in memory.
+ */
+export function findInSnapshot(snap: Snapshot, needle: string): SnapshotNode[] {
+  const n = needle.toLowerCase();
+  return snap.nodes.filter((node) => node.text?.toLowerCase().includes(n));
+}
diff --git a/.github/scripts/agent-device-expect.ts b/.github/scripts/agent-device-expect.ts
new file mode 100644
index 000000000000..b50b8d97de86
--- /dev/null
+++ b/.github/scripts/agent-device-expect.ts
@@ -0,0 +1,87 @@
+/*
+ * `expect:` DSL — machine-checked postcondition for each test step.
+ *
+ * Why a tiny DSL instead of letting the LLM self-report success:
+ * `step_complete(rationale)` is an LLM claim, not evidence. A canary
+ * that trusts an LLM's claim is a canary the LLM can lie to. The
+ * `expect:` clause is evaluated by deterministic TypeScript code
+ * against the post-state snapshot/appstate.
The step fails red if
+ * `expect:` fails, regardless of what the LLM said.
+ *
+ * Grammar (intentionally small — extend only when a real test step
+ * can't be expressed):
+ *   snapshot.contains_text("...")
+ *   snapshot.field_with_text("...").exists
+ *   appstate.foreground == "..."
+ *
+ * String literal: double-quoted, backslash-escapable. No interpolation,
+ * no regex, no boolean ops. If a step needs more, write a second step.
+ */
+
+import type { AppState, Snapshot } from "./agent-device-cli";
+
+export type ExpectResult = { ok: true } | { ok: false; reason: string };
+
+const STR = String.raw`"((?:[^"\\]|\\.)*)"`;
+
+/**
+ * Turn the raw capture of a `STR` literal into the text it denotes by
+ * replacing each backslash escape with the escaped character
+ * (`\"` becomes `"`, `\\` becomes `\`). Without this step a clause such
+ * as `snapshot.contains_text("say \"hi\"")` would search for the
+ * backslashes themselves.
+ */
+function unescapeLiteral(captured: string): string {
+  return captured.replace(/\\(.)/g, "$1");
+}
+
+/*
+ * One entry per grammar production. `re` must match the whole trimmed
+ * clause; `eval` receives the match (group 1 is the raw string literal)
+ * plus the post-state snapshot and appstate.
+ */
+const PATTERNS: Array<{
+  re: RegExp;
+  eval: (m: RegExpMatchArray, snap: Snapshot, app: AppState) => ExpectResult;
+}> = [
+  {
+    re: new RegExp(`^snapshot\\.contains_text\\(${STR}\\)$`),
+    eval: (m, snap) => {
+      const literal = unescapeLiteral(m[1]);
+      const needle = literal.toLowerCase();
+      const hit = snap.nodes.some((n) =>
+        n.text?.toLowerCase().includes(needle),
+      );
+      return hit
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `no node contains text ${JSON.stringify(literal)} (snapshot has ${snap.nodes.length} nodes)`,
+          };
+    },
+  },
+  {
+    re: new RegExp(`^snapshot\\.field_with_text\\(${STR}\\)\\.exists$`),
+    eval: (m, snap) => {
+      const literal = unescapeLiteral(m[1]);
+      const needle = literal.toLowerCase();
+      const hit = snap.nodes.some(
+        (n) => n.editable && n.text?.toLowerCase().includes(needle),
+      );
+      return hit
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `no editable field contains text ${JSON.stringify(literal)}`,
+          };
+    },
+  },
+  {
+    re: new RegExp(`^appstate\\.foreground\\s*==\\s*${STR}$`),
+    eval: (m, _snap, app) => {
+      /* Exact, case-sensitive comparison — unlike the snapshot text checks. */
+      const expected = unescapeLiteral(m[1]);
+      return app.foregroundApp === expected
+        ? { ok: true }
+        : {
+            ok: false,
+            reason: `foreground app is ${app.foregroundApp ?? "(unknown)"}, expected ${expected}`,
+          };
+    },
+  },
+];
+
+/**
+ * Evaluate one `expect:` clause against the given post-state.
+ *
+ * @param clause Clause text; surrounding whitespace is ignored.
+ * @returns `{ ok: true }` when the predicate holds, otherwise
+ *   `{ ok: false, reason }`. A clause matching no grammar production is
+ *   reported as a failure rather than thrown.
+ */
+export function evaluateExpect(
+  clause: string,
+  snap: Snapshot,
+  app: AppState,
+): ExpectResult {
+  const trimmed = clause.trim();
+  for (const p of PATTERNS) {
+    const m = trimmed.match(p.re);
+    if (m) {
+      return p.eval(m, snap, app);
+    }
+  }
+  return { ok: false, reason: `unrecognized expect clause: ${clause}` };
+}
diff --git a/.github/scripts/agent-device-llm-client.ts b/.github/scripts/agent-device-llm-client.ts
new file mode 100644
index 000000000000..96d449d9b63f
--- /dev/null
+++ b/.github/scripts/agent-device-llm-client.ts
@@ -0,0 +1,261 @@
+/*
+ * Thin client for the Anthropic /v1/messages endpoint.
+ *
+ * Decisions baked in:
+ *   - Direct `fetch` instead of `@anthropic-ai/sdk` to avoid a new
+ *     dependency on a CI-only path. Node 20 has fetch built in.
+ *   - Prompt caching (`cache_control: {type: "ephemeral"}`) on the
+ *     system message and the last tool definition. The system + tool
+ *     surface is static across the run, so cache hit rate after step 1
+ *     is ~100%, cutting per-call cost by 5-10x. The 5-minute TTL fits
+ *     a single CI run with margin.
+ *   - Bounded exponential backoff with jitter for 429/500/502/503/529.
+ *     The runner's caller decides what to do on final failure (typically
+ *     fall back to a deterministic bash-style assertion); this client
+ *     never silently degrades.
+ *   - Token budget kill-switch: total input+output tokens accumulated
+ *     across the run; throw if exceeded. Bounds runaway spend if a
+ *     prompt or tool design accidentally explodes context.
+ */
+
+export type AnthropicTool = {
+  name: string;
+  description: string;
+  input_schema: Record<string, unknown>;
+  cache_control?: { type: "ephemeral" };
+};
+
+export type AnthropicMessage = {
+  role: "user" | "assistant";
+  content: Array<
+    | { type: "text"; text: string }
+    | {
+        type: "image";
+        source: { type: "base64"; media_type: "image/png"; data: string };
+      }
+    | {
+        type: "tool_use";
+        id: string;
+        name: string;
+        input: Record<string, unknown>;
+      }
+    | {
+        type: "tool_result";
+        tool_use_id: string;
+        content: string;
+        is_error?: boolean;
+      }
+  >;
+};
+
+export type AnthropicResponse = {
+  id: string;
+  stop_reason:
+    | "end_turn"
+    | "tool_use"
+    | "max_tokens"
+    | "stop_sequence"
+    | string;
+  content: Array<
+    | { type: "text"; text: string }
+    | {
+        type: "tool_use";
+        id: string;
+        name: string;
+        input: Record<string, unknown>;
+      }
+  >;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_read_input_tokens?: number;
+    cache_creation_input_tokens?: number;
+  };
+};
+
+export type ClientOptions = {
+  apiKey: string;
+  model: string;
+  tokenBudget: number;
+  /** Optional sink called with one entry per request/response/retry event. */
+  traceWriter?: (entry: Record<string, unknown>) => void;
+};
+
+const ANTHROPIC_VERSION = "2023-06-01";
+const RETRY_DELAYS_MS = [1_000, 3_000, 9_000];
+const RETRYABLE_STATUS = new Set([429, 500, 502, 503, 529]);
+
+/** Thrown by the client once accumulated token usage exceeds the budget. */
+export class TokenBudgetExceededError extends Error {
+  constructor(used: number, budget: number) {
+    super(`token budget exceeded: ${used} > ${budget}`);
+    this.name = "TokenBudgetExceededError";
+  }
+}
+
+/** Thrown for any non-2xx HTTP response; carries the status and raw body. */
+export class AnthropicCallFailedError extends Error {
+  constructor(
+    public readonly status: number,
+    public readonly body: string,
+  ) {
+    super(`Anthropic API failed with status ${status}: ${body.slice(0, 200)}`);
+    this.name = "AnthropicCallFailedError";
+  }
+}
+
+export class AnthropicClient {
+  private tokensUsed = 0;
+
+  constructor(private readonly opts: ClientOptions) {}
+
+  /** Tokens charged against the budget so far. */
+  getTokensUsed(): number {
+    return this.tokensUsed;
+  }
+
+  /**
+   * Send one /v1/messages request, retrying retryable HTTP statuses with
+   * jittered backoff.
+   *
+   * @throws TokenBudgetExceededError when the response pushes usage past
+   *   the budget (never retried).
+   * @throws AnthropicCallFailedError on a non-retryable status, or on a
+   *   retryable one after the last retry. Any other error from `fetch`
+   *   is rethrown without retry.
+   */
+  async call(args: {
+    system: string;
+    tools: AnthropicTool[];
+    messages: AnthropicMessage[];
+    maxTokens?: number;
+  }): Promise<AnthropicResponse> {
+    /*
+     * Mark system + last tool as cacheable. Anthropic caches the
+     * contiguous prefix UP TO each `cache_control` marker, so two
+     * markers means "cache through end of system" and "cache
+     * through end of tools" as separate cached prefixes.
+     */
+    const cachedTools = args.tools.map((t, i) =>
+      i === args.tools.length - 1
+        ? { ...t, cache_control: { type: "ephemeral" as const } }
+        : t,
+    );
+
+    const body = {
+      model: this.opts.model,
+      max_tokens: args.maxTokens ?? 1024,
+      temperature: 0,
+      system: [
+        {
+          type: "text",
+          text: args.system,
+          cache_control: { type: "ephemeral" },
+        },
+      ],
+      tools: cachedTools,
+      messages: args.messages,
+    };
+
+    /*
+     * Verbose diagnostic mode: capture the full message thread + tool_use
+     * calls in the trace. Trade-off is artifact size and a small risk
+     * of leaking content the user typed; disabled unless DEBUG_LLM=1.
+     */
+    const verbose = (process.env.DEBUG_LLM ?? "") === "1";
+    if (verbose) {
+      const lastUser = args.messages
+        .slice()
+        .reverse()
+        .find((m) => m.role === "user");
+      const lastText = lastUser?.content.find(
+        (c): c is { type: "text"; text: string } => c.type === "text",
+      );
+      this.opts.traceWriter?.({
+        type: "request",
+        message_count: args.messages.length,
+        last_user_text: lastText?.text.slice(0, 1500) ?? null,
+        tool_uses_in_thread: args.messages.flatMap((m) =>
+          m.content
+            .filter(
+              (
+                c,
+              ): c is {
+                type: "tool_use";
+                id: string;
+                name: string;
+                input: Record<string, unknown>;
+              } => c.type === "tool_use",
+            )
+            .map((c) => ({ id: c.id, name: c.name, input: c.input })),
+        ),
+      });
+    }
+
+    let lastError: Error | undefined;
+    for (let attempt = 0; attempt <= RETRY_DELAYS_MS.length; attempt++) {
+      try {
+        const response = await this.callOnce(body);
+        this.accountForUsage(response.usage);
+        const baseEntry: Record<string, unknown> = {
+          type: "response",
+          attempt,
+          stop_reason: response.stop_reason,
+          usage: response.usage,
+        };
+        if (verbose) {
+          baseEntry.tool_uses = response.content
+            .filter((c) => c.type === "tool_use")
+            .map((c) => ({
+              id: (c as { id: string }).id,
+              name: (c as { name: string }).name,
+              input: (c as { input: unknown }).input,
+            }));
+          baseEntry.text_preview = response.content
+            .filter((c) => c.type === "text")
+            .map((c) => (c as { text: string }).text.slice(0, 800));
+        }
+        this.opts.traceWriter?.(baseEntry);
+        return response;
+      } catch (e) {
+        lastError = e as Error;
+        if (e instanceof TokenBudgetExceededError) {
+          throw e;
+        }
+        const retryable =
+          e instanceof AnthropicCallFailedError &&
+          RETRYABLE_STATUS.has(e.status);
+        if (!retryable || attempt >= RETRY_DELAYS_MS.length) {
+          throw e;
+        }
+        /* Spread each delay by ±30% so parallel jobs don't retry in lockstep. */
+        const base = RETRY_DELAYS_MS[attempt];
+        const jitter = base * 0.3 * (Math.random() * 2 - 1);
+        const wait = Math.max(0, Math.round(base + jitter));
+        this.opts.traceWriter?.({
+          type: "retry",
+          attempt,
+          status: (e as AnthropicCallFailedError).status,
+          waitMs: wait,
+        });
+        await new Promise((r) => setTimeout(r, wait));
+      }
+    }
+    throw lastError ?? new Error("unreachable");
+  }
+
+  /** One HTTP round-trip; any non-2xx status becomes AnthropicCallFailedError. */
+  private async callOnce(body: object): Promise<AnthropicResponse> {
+    const res = await fetch("https://api.anthropic.com/v1/messages", {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+        "x-api-key": this.opts.apiKey,
+        "anthropic-version": ANTHROPIC_VERSION,
+      },
+      body: JSON.stringify(body),
+    });
+    if (!res.ok) {
+      throw new AnthropicCallFailedError(res.status, await res.text());
+    }
+    return (await res.json()) as AnthropicResponse;
+  }
+
+  /**
+   * Add one response's usage to the running total and throw
+   * TokenBudgetExceededError once the total exceeds the budget.
+   */
+  private accountForUsage(usage: AnthropicResponse["usage"]): void {
+    /*
+     * Cache reads cost roughly 10% of normal input tokens, but for
+     * budget-protection purposes we count them at face value —
+     * budgets are about runaway prompt design, not pricing. The API
+     * reports cached tokens in their own fields rather than inside
+     * `input_tokens`, so they have to be added explicitly; both fields
+     * are optional and treated as 0 when absent.
+     */
+    const cachedTokens =
+      (usage.cache_read_input_tokens ?? 0) +
+      (usage.cache_creation_input_tokens ?? 0);
+    this.tokensUsed += usage.input_tokens + usage.output_tokens + cachedTokens;
+    if (this.tokensUsed > this.opts.tokenBudget) {
+      throw new TokenBudgetExceededError(
+        this.tokensUsed,
+        this.opts.tokenBudget,
+      );
+    }
+  }
+}
diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts
new file mode 100644
index 000000000000..be8598218f90
--- /dev/null
+++ b/.github/scripts/agent-device-llm-driver.ts
@@ -0,0 +1,1375 @@
+/*
+ * Phase-1 LLM-driven Android smoke runner.
+ *
+ * Lifecycle inside the workflow's emulator-runner `script:` block:
+ *
+ *   1. Boot dance (deterministic, NOT LLM-driven):
+ *      - close any stale agent-device session
+ *      - locate dev APK from android/app/build/outputs/...
+ *      - adb install
+ *      - adb reverse tcp:8081 tcp:8081 (Metro reachable from emulator)
+ *      - npm start & (Metro background)
+ *      - poll /status until packager-status:running
+ *      - agent-device open --relaunch (cold start)
+ *
+ *   2.
Test-case execution: + * - parse test case (numbered steps + optional `expect:` lines) + * - per step: cache-first / LLM-fallback / bash-fallback ladder + * - assert post-state via `expect:` evaluator + * - write artifacts (screenshots, snapshots, llm-trace, cache-diff) + * + * 3. Cleanup (always — even on signal/error): + * - dump logcat once + * - close agent-device session (so re-runs aren't tripped by the + * "session already bound" guard) + * - kill background jobs (Metro) + * + * Why a TS runner instead of Python or Bash: + * - The repo already runs ts-node in CI (precedent: createDocsRoutes.ts). + * - Reusing the snapshot parser + signature + expect DSL across + * replay / LLM / bash paths means one source of truth for what + * "the SignIn screen is on screen" means — a divergence between + * "what bash sees" and "what the LLM sees" would be a class of + * bugs we don't want. + */ + +import { execFileSync } from "child_process"; +import fs from "fs"; +import path from "path"; +import * as adCli from "./agent-device-cli"; +import type { Snapshot, AppState } from "./agent-device-cli"; +import { + snapshotSignature, + refToLocator, + locatorToRef, +} from "./agent-device-snapshot-signature"; +import { evaluateExpect } from "./agent-device-expect"; +import * as cache from "./agent-device-replay-cache"; +import type { CachedAction, CacheV1 } from "./agent-device-replay-cache"; +import { + AnthropicClient, + TokenBudgetExceededError, + AnthropicCallFailedError, +} from "./agent-device-llm-client"; +import type { + AnthropicTool, + AnthropicMessage, +} from "./agent-device-llm-client"; +import { + detectPlatform, + startMetro, + locateBundle, + backgroundPids, +} from "./agent-device-platform"; +import type { Platform } from "./agent-device-platform"; + +/* ---- config ----------------------------------------------------------- */ + +const MODEL = process.env.ANTHROPIC_MODEL ?? "claude-sonnet-4-6"; +const TOKEN_BUDGET = Number(process.env.LLM_TOKEN_BUDGET ?? 
200_000); +const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci"; +const ARTIFACTS_DIR = process.env.ARTIFACTS_DIR ?? "artifacts"; +const TEST_CASE_PATH = + process.argv[2] ?? "tests/smoke/android-signin.testcase.txt"; +const CACHE_PATH = + process.env.LLM_CACHE_PATH ?? deriveCachePath(TEST_CASE_PATH); +const platform: Platform = detectPlatform(); +const METRO_READY_TIMEOUT_MS = 120_000; +/* + * 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). The + * first run on a fresh AVD-cache key is closer to a cold boot since + * the prime+run happens in two separate emulator-runner invocations + * and the snapshot-load overhead lands inside this budget. + */ +const SIGNIN_LOAD_TIMEOUT_MS = 600_000; +const BOOT_PROBE_INTERVAL_MS = 30_000; +const STEP_WALL_CLOCK_BUDGET_MS = 60_000; +const MAX_STATE_CHANGING_ACTIONS = 4; +const SCREENSHOT_BUDGET_PER_RUN = 2; +const TEXT_LENGTH_CAP = 200; +/* + * DEBUG_LLM=1 makes both the LLM client (request/response bodies) + * and the runner (per-tool-dispatch entries) emit verbose entries to + * llm-trace.jsonl + stdout. Off by default to keep normal-run + * artifacts and CI stdout slim. 
+ */ +const DEBUG_LLM = process.env.DEBUG_LLM === "1"; + +/* ---- types ------------------------------------------------------------ */ + +type Step = { + number: number; + text: string; + expect: string | null; + raw: string; +}; + +type ToolResultBlock = { + type: "tool_result"; + tool_use_id: string; + content: string; + is_error?: boolean; +}; +type ContentBlock = + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: "image/png"; data: string }; + } + | ToolResultBlock; + +type ExecutedAction = CachedAction & { ref?: string }; + +/* ---- entry point ------------------------------------------------------ */ + +async function main(): Promise { + fs.mkdirSync(ARTIFACTS_DIR, { recursive: true }); + registerCleanup(); + + log( + `runner=${MODEL} test_case=${TEST_CASE_PATH} cache=${CACHE_PATH} budget=${TOKEN_BUDGET}`, + ); + + const testCaseRaw = fs.readFileSync(TEST_CASE_PATH, "utf8"); + const testCaseHash = cache.hashText(testCaseRaw); + const steps = parseTestCase(testCaseRaw); + if (!steps.length) { + fail("test case has no steps"); + } + + const committed = cache.loadCache(CACHE_PATH, MODEL, testCaseHash); + const recorded: CacheV1 = { + version: 1, + model: MODEL, + testCaseHash, + steps: [], + }; + + await bootApp(); + + const apiKey = process.env.ANTHROPIC_API_KEY; + const llm = apiKey + ? 
new AnthropicClient({ + apiKey, + model: MODEL, + tokenBudget: TOKEN_BUDGET, + traceWriter: (e) => + fs.appendFileSync( + path.join(ARTIFACTS_DIR, "llm-trace.jsonl"), + `${JSON.stringify(e)}\n`, + ), + }) + : null; + if (!llm) { + log( + "::warning::ANTHROPIC_API_KEY missing — every step will use bash fallback", + ); + } + + let cacheHits = 0; + let llmRuns = 0; + let bashRuns = 0; + + for (const step of steps) { + const result = await executeStep(step, { + committed, + testCaseHash, + llm, + recorded, + stats: { + onCacheHit: () => cacheHits++, + onLLMRun: () => llmRuns++, + onBashRun: () => bashRuns++, + }, + }); + if (!result.ok) { + fail(`step ${step.number} failed: ${result.reason}`); + } + } + + /* + * Always write the recorded cache diff, even if it's identical. + * Reviewers want to see a clean (no-op) diff to know the canary + * ran end-to-end without UI drift. + */ + const diffText = cache.diff(committed, recorded); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "cache-diff.txt"), + `${diffText || "(no drift — cache up to date)"}\n`, + ); + cache.writeCache(path.join(ARTIFACTS_DIR, "cache-recorded.json"), recorded); + + log( + `::notice::smoke OK — cache_hits=${cacheHits} llm_runs=${llmRuns} bash_runs=${bashRuns} tokens=${llm?.getTokensUsed() ?? 
0}`, + ); + + if (diffText) { + log( + "::warning::cache drift detected — copy artifacts/cache-recorded.json to tests/smoke/cache/.json and commit", + ); + } +} + +/* ---- test case parser ------------------------------------------------- */ + +function parseTestCase(raw: string): Step[] { + const steps: Step[] = []; + let cur: Step | null = null; + for (const lineRaw of raw.split("\n")) { + const line = lineRaw.trimEnd(); + if (!line.trim() || line.trim().startsWith("#")) { + continue; + } + const m = line.match(/^(\d+)\.\s+(.*)$/); + if (m) { + if (cur) { + steps.push(cur); + } + cur = { number: Number(m[1]), text: m[2], expect: null, raw: line }; + continue; + } + const ex = line.match(/^\s*expect:\s*(.+)$/); + if (ex && cur) { + cur.expect = ex[1]; + cur.raw += `\n${line}`; + } + } + if (cur) { + steps.push(cur); + } + return steps; +} + +/* ---- boot dance (matches Phase 0's bash) ------------------------------ */ + +async function bootApp(): Promise { + log(`boot: platform=${platform.name}`); + log("boot: closing stale session"); + adCli.closeSession(); + + log("boot: locating app bundle"); + const bundle = locateBundle(platform); + if (!bundle) { + fail( + `no app bundle (*${platform.appBundleSuffix}) found under ${platform.appBundleDir} — build step likely failed`, + ); + } + log(`boot: installing ${bundle}`); + platform.install(bundle); + + log("boot: setupNetworking"); + platform.setupNetworking(); + + platform.preBootHardening(); + + log("boot: starting Metro"); + startMetro(path.join(ARTIFACTS_DIR, "metro.log")); + + await waitForMetro(); + + log("boot: agent-device open --relaunch"); + platform.launch(); + + /* + * Bounded wait for the SignIn UI to hydrate. The LLM can technically + * poll for it itself in step 1, but on slow runners that would burn + * LLM budget on what's effectively boot-blocking emulator wait time. 
+ * We dump a probe snapshot every 30s during the wait so post-mortem + * can see *what* the app was showing if the wait times out — the + * first run of this workflow had no such artifacts and the failure + * was undebuggable from the upload. + */ + log("boot: waiting for SignIn UI"); + const start = Date.now(); + let probeIdx = 0; + let lastProbeAt = 0; + while (Date.now() - start < SIGNIN_LOAD_TIMEOUT_MS) { + let snap; + try { + snap = adCli.snapshot(); + } catch (e) { + /* + * Don't let a single transient snapshot timeout kill the wait — + * the emulator may be under heavy load and the next poll will + * probably succeed. + */ + log( + `boot: snapshot threw (${(e as Error).message.slice(0, 80)}); retrying`, + ); + await sleep(2_000); + continue; + } + if ( + snap.nodes.some((n) => n.text?.toLowerCase().includes("phone or email")) + ) { + log( + `boot: SignIn ready after ${Math.round((Date.now() - start) / 1000)}s`, + ); + return; + } + /* + * Blocking-dialog recovery. Platform-specific detection + + * dismissal hides behind `tryDismissBlockingDialog`. Android: + * Pixel Launcher ANR dialog (Close app / Wait). iOS (PR B): + * system permission alerts. Either way, dismissed → force- + * relaunch the app so we don't poll against a half-initialised + * activity stuck behind the dismissed dialog. + */ + if (platform.tryDismissBlockingDialog(snap)) { + log("boot: blocking dialog dismissed + app force-relaunched"); + /* Give the process a moment to come back up before re-snapshotting. */ + await sleep(3_000); + continue; + } + if (Date.now() - lastProbeAt >= BOOT_PROBE_INTERVAL_MS) { + const elapsed = Math.round((Date.now() - start) / 1000); + fs.writeFileSync( + path.join( + ARTIFACTS_DIR, + `boot-probe-${String(probeIdx).padStart(2, "0")}-t${elapsed}s.txt`, + ), + snap.raw, + ); + probeIdx++; + lastProbeAt = Date.now(); + } + await sleep(6_000); + } + /* + * Capture as much state as we can BEFORE failing so a re-run isn't + * required to debug. 
The cleanup trap will still write logcat after. + */ + try { + const snap = adCli.snapshot(); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "boot-timeout-snapshot.txt"), + snap.raw, + ); + const app = adCli.appstate(); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, "boot-timeout-appstate.txt"), + app.raw, + ); + adCli.screenshotBase64(path.join(ARTIFACTS_DIR, "boot-timeout.png")); + } catch (e) { + log(`boot: timeout-diagnostics capture failed: ${(e as Error).message}`); + } + fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`); +} + +async function waitForMetro(): Promise { + const start = Date.now(); + while (Date.now() - start < METRO_READY_TIMEOUT_MS) { + try { + const out = execFileSync( + "curl", + ["-sf", "http://localhost:8081/status"], + { encoding: "utf8" }, + ); + if (out.includes("packager-status:running")) { + log( + `boot: Metro ready after ${Math.round((Date.now() - start) / 1000)}s`, + ); + return; + } + } catch { + /* Metro not up yet */ + } + await sleep(2_000); + } + fail( + `Metro did not reach packager-status:running within ${METRO_READY_TIMEOUT_MS / 1000}s`, + ); +} + +/* ---- per-step orchestration ------------------------------------------- */ + +type StepCtx = { + committed: CacheV1; + testCaseHash: string; + llm: AnthropicClient | null; + recorded: CacheV1; + stats: { + onCacheHit: () => void; + onLLMRun: () => void; + onBashRun: () => void; + }; +}; + +async function executeStep( + step: Step, + ctx: StepCtx, +): Promise<{ ok: true } | { ok: false; reason: string }> { + const preSnap = adCli.snapshot(); + const preSig = snapshotSignature(preSnap); + const stepKey = `step ${step.number}`; + log(`::group::${stepKey} — ${step.text}`); + + fs.writeFileSync( + path.join(ARTIFACTS_DIR, `step-${step.number}-pre.txt`), + preSnap.raw, + ); + + const cached = cache.lookup(ctx.committed, step.number, preSig); + if (cached) { + log(`${stepKey}: cache hit (pre_sig=${preSig})`); + const replay = await 
replayCachedActions(cached.actions); + if (replay.ok) { + const post = await verifyPostState(step, cached.postSignature); + if (post.ok) { + ctx.stats.onCacheHit(); + ctx.recorded.steps.push(cached); + fs.writeFileSync( + path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`), + post.snap.raw, + ); + log(`::endgroup::`); + return { ok: true }; + } + log(`${stepKey}: cache drift — ${post.reason}; falling through to LLM`); + } else { + log( + `${stepKey}: replay failed — ${replay.reason}; falling through to LLM`, + ); + } + } + + let actions: ExecutedAction[] = []; + if (ctx.llm) { + try { + const llmResult = await runLLMStep(step, ctx.llm); + if (!llmResult.ok) { + log( + `${stepKey}: LLM gave up — ${llmResult.reason}; trying bash fallback`, + ); + } else { + ctx.stats.onLLMRun(); + actions = llmResult.actions; + } + } catch (e) { + if (e instanceof TokenBudgetExceededError) { + return { ok: false, reason: e.message }; + } + log( + `${stepKey}: LLM call failed (${(e as Error).message}); trying bash fallback`, + ); + } + } + + if (!actions.length) { + const bashResult = await runBashFallback(step); + if (!bashResult.ok) { + log(`::endgroup::`); + return { ok: false, reason: bashResult.reason }; + } + ctx.stats.onBashRun(); + actions = bashResult.actions; + /* + * Settle gap: agent-device fill returns once it has dispatched + * the typing command, but the on-device EditText needs a beat for + * React Native's onChange to fire and the accessibility tree to + * re-publish the new text. Without this, verifyPostState below + * takes a snapshot before the typed text has propagated and the + * expect predicate fails on what's transient lag, not a real + * problem. 
+ */ + await sleep(500); + } + + const post = await verifyPostState(step, null); + if (!post.ok) { + log(`::endgroup::`); + return { ok: false, reason: post.reason }; + } + fs.writeFileSync( + path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`), + post.snap.raw, + ); + + ctx.recorded.steps.push({ + stepNumber: step.number, + stepTextHash: cache.hashText(step.text), + preSignature: preSig, + postSignature: snapshotSignature(post.snap), + actions: actions.map(stripExecutedRef), + expect: step.expect, + recordedAt: new Date().toISOString(), + runId: process.env.GITHUB_RUN_ID ?? "local", + }); + log(`::endgroup::`); + return { ok: true }; +} + +function stripExecutedRef(a: ExecutedAction): CachedAction { + const { ref, ...rest } = a as ExecutedAction & { ref?: string }; + return rest; +} + +async function verifyPostState( + step: Step, + expectedSignature: string | null, +): Promise<{ ok: true; snap: Snapshot } | { ok: false; reason: string }> { + const snap = adCli.snapshot(); + const app = adCli.appstate(); + + /* + * Expect (when declared) is the source of truth: it's a deterministic + * predicate over the live UI, while the post-signature is a structural + * hash that can drift on cosmetic re-renders, animation timing, or + * node-ordering changes that don't affect what the user actually sees. + * If expect passes, the step succeeded — drift becomes advisory. + */ + if (step.expect) { + const ev = evaluateExpect(step.expect, snap, app); + if (!ev.ok) { + return { ok: false, reason: `expect failed: ${ev.reason}` }; + } + if (expectedSignature && snapshotSignature(snap) !== expectedSignature) { + log( + `::warning::post-signature drift but expect passed (recorded ${expectedSignature}, observed ${snapshotSignature(snap)}) — accepting`, + ); + } + return { ok: true, snap }; + } + + /* + * No expect declared — fall back to signature equality so a cache-hit + * path still has *some* post-state check. 
/**
 * Replays a single cached action against the live device.
 *
 * Tools that need no target (`wait`, `wait_for`, `back`,
 * `dismiss_keyboard`) are handled first. Every other action takes a fresh
 * snapshot and resolves its recorded locator to a ref in that snapshot
 * before the tool is dispatched.
 *
 * @param action - The recorded action to replay.
 * @returns `{ ok: true }` once the action was dispatched, or a failure with
 *   a reason when the locator does not resolve, a `wait_for` predicate
 *   times out, or the tool name is not recognised.
 */
async function dispatchCachedAction(
  action: CachedAction,
): Promise<{ ok: true } | { ok: false; reason: string }> {
  switch (action.tool) {
    case "wait":
      await sleep(action.ms);
      return { ok: true };
    case "wait_for":
      return await runWaitFor(action.predicate, action.timeoutMs);
    case "back":
      platform.back();
      return { ok: true };
    case "dismiss_keyboard":
      platform.dismissKeyboard();
      return { ok: true };
  }

  // Everything left targets a node: resolve the locator against a fresh
  // snapshot before looking at which tool it is.
  const liveSnapshot = adCli.snapshot();
  const target = locatorToRef(liveSnapshot, action.locator);
  if (!target) {
    const locatorJson = JSON.stringify(action.locator);
    return {
      ok: false,
      reason: `cached locator did not resolve: ${locatorJson}`,
    };
  }

  switch (action.tool) {
    case "fill":
      adCli.fill(target, action.text);
      return { ok: true };
    case "press":
      adCli.press(target);
      return { ok: true };
    default:
      return {
        ok: false,
        reason: `unknown cached tool: ${(action as { tool: string }).tool}`,
      };
  }
}
adCli.snapshot(); + const app = adCli.appstate(); + const ev = evaluateExpect(predicate, snap, app); + if (ev.ok) { + return { ok: true }; + } + await sleep(250); + } + return { + ok: false, + reason: `wait_for timed out after ${timeoutMs}ms (predicate: ${predicate})`, + }; +} + +/* ---- LLM step --------------------------------------------------------- */ + +const SYSTEM_PROMPT = [ + "You are an autonomous mobile UI test runner driving the Expensify Android app via the agent-device CLI.", + "You receive: the current step description in plain English, an accessibility snapshot of the live UI, and a history of your tool calls within this step.", + "", + "Snapshot format: a JSON array of `{ref, kind, text, editable, enabled, scrollable}` nodes. Each ref is a stable handle for that node within this snapshot only — re-snapshot before reusing refs from a prior turn.", + "", + "Rules:", + "- Never invent a ref. Always pick refs from the most recent snapshot's `nodes` array.", + "- After any state-changing action (fill, press, back, dismiss_keyboard, wait), call snapshot to refresh before asserting.", + "- Use `assert` to prove a step succeeded — `step_complete` without an `assert` first is suspicious.", + "- Prefer `wait_for(predicate)` over `wait(ms)`. The bare wait is a last resort; the runner logs a warning each time it is used.", + "- Treat label text as advisory; it may be localized. Match by intent and element kind.", + "- If after 2-3 unique attempts you cannot make progress, call `step_failed` with a precise reason.", +].join("\n"); + +const TOOLS: AnthropicTool[] = [ + { + name: "snapshot", + description: + "Capture a fresh accessibility tree. Returns {nodes: [...], node_count: number}. Call this after any state-changing action and before using a ref from a previous turn.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "screenshot", + description: + "Capture a PNG screenshot. 
Rate-limited to 2 calls per run; the runner may auto-attach a screenshot when a snapshot returns 0 nodes. Use this only when the snapshot is genuinely empty or when you've addressed phantom refs twice.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "find", + description: + "Search the most recent snapshot for nodes whose `text` contains the given substring (case-insensitive). Side-effect-free.", + input_schema: { + type: "object", + properties: { needle: { type: "string" } }, + required: ["needle"], + additionalProperties: false, + }, + }, + { + name: "fill", + description: "Type text into the editable text-field at the given ref.", + input_schema: { + type: "object", + properties: { ref: { type: "string" }, text: { type: "string" } }, + required: ["ref", "text"], + additionalProperties: false, + }, + }, + { + name: "press", + description: "Tap the pressable element at the given ref.", + input_schema: { + type: "object", + properties: { ref: { type: "string" } }, + required: ["ref"], + additionalProperties: false, + }, + }, + { + name: "wait_for", + description: + 'Poll snapshots until `predicate` is satisfied or `timeout_ms` elapses. Predicates: snapshot.contains_text("..."), snapshot.field_with_text("...").exists, appstate.foreground == "...".', + input_schema: { + type: "object", + properties: { + predicate: { type: "string" }, + timeout_ms: { type: "integer", maximum: 10_000 }, + }, + required: ["predicate"], + additionalProperties: false, + }, + }, + { + name: "wait", + description: + "Sleep for the given number of milliseconds (max 2000). Last resort — prefer wait_for. The runner logs a warning each call.", + input_schema: { + type: "object", + properties: { ms: { type: "integer", minimum: 1, maximum: 2_000 } }, + required: ["ms"], + additionalProperties: false, + }, + }, + { + name: "back", + description: + "Press Android back. 
Use to recover from an unintended screen.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "dismiss_keyboard", + description: "Dismiss the soft keyboard.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "assert", + description: + "Verify a postcondition. Returns {ok: bool, reason?: string}. Predicates as in wait_for.", + input_schema: { + type: "object", + properties: { predicate: { type: "string" } }, + required: ["predicate"], + additionalProperties: false, + }, + }, + { + name: "appstate", + description: "Return {foreground_app, activity}.", + input_schema: { + type: "object", + properties: {}, + additionalProperties: false, + }, + }, + { + name: "step_complete", + description: + "Mark the current step as passed. Must include a brief rationale describing what was observed (mention the assert you ran).", + input_schema: { + type: "object", + properties: { rationale: { type: "string" } }, + required: ["rationale"], + additionalProperties: false, + }, + }, + { + name: "step_failed", + description: + "Mark the current step as failed. 
Use when 2-3 unique attempts have not produced progress, or when the screen does not match what the step expects.", + input_schema: { + type: "object", + properties: { reason: { type: "string" } }, + required: ["reason"], + additionalProperties: false, + }, + }, +]; + +async function runLLMStep( + step: Step, + llm: AnthropicClient, +): Promise< + { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string } +> { + const startedAt = Date.now(); + let snap = adCli.snapshot(); + let app = adCli.appstate(); + let stateChanging = 0; + let phantomStreak = 0; + let attachScreenshotNext = false; + let screenshotsUsed = 0; + const seen = new Set(); + const messages: AnthropicMessage[] = []; + const executed: ExecutedAction[] = []; + + while ( + Date.now() - startedAt < STEP_WALL_CLOCK_BUDGET_MS && + stateChanging <= MAX_STATE_CHANGING_ACTIONS + ) { + if (snap.nodeCount === 0 && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) { + attachScreenshotNext = true; + } + + const userBlocks: ContentBlock[] = []; + if (attachScreenshotNext && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) { + const png = takeScreenshot( + `step-${step.number}-shot-${screenshotsUsed}.png`, + ); + screenshotsUsed++; + attachScreenshotNext = false; + userBlocks.push({ + type: "image", + source: { type: "base64", media_type: "image/png", data: png }, + }); + } + userBlocks.push({ + type: "text", + text: buildUserText(step, snap, app, executed), + }); + messages.push({ role: "user", content: userBlocks }); + + const response = await llm.call({ + system: SYSTEM_PROMPT, + tools: TOOLS, + messages, + }); + const assistantContent = response.content as AnthropicMessage["content"]; + messages.push({ role: "assistant", content: assistantContent }); + + const toolUses = assistantContent.filter( + ( + b, + ): b is Extract< + (typeof assistantContent)[number], + { type: "tool_use" } + > => b.type === "tool_use", + ); + if (!toolUses.length) { + return { ok: false, reason: "LLM returned no tool calls" }; + } + 
+ const toolResults: ToolResultBlock[] = []; + for (const tu of toolUses) { + const sigKey = `${tu.name}:${JSON.stringify(tu.input)}:${snapshotSignature(snap)}`; + if (seen.has(sigKey)) { + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: + "You already performed this exact action against this exact UI state and it produced no observable change. Try a different approach or call step_failed.", + is_error: true, + }); + continue; + } + seen.add(sigKey); + + try { + const out = await dispatchTool(tu.name, tu.input, { + snap, + app, + onSnap: (s) => { + snap = s; + }, + onApp: (a) => { + app = a; + }, + executed, + stepNumber: step.number, + onPhantom: () => { + phantomStreak++; + if ( + phantomStreak >= 2 && + screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN + ) { + attachScreenshotNext = true; + } + }, + resetPhantom: () => { + phantomStreak = 0; + }, + }); + if (isStateChangingTool(tu.name)) { + stateChanging++; + } + if (out.terminal === "complete") { + return { ok: true, actions: executed }; + } + if (out.terminal === "failed") { + return { + ok: false, + reason: out.reason ?? "step_failed without reason", + }; + } + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: out.content, + is_error: out.isError, + }); + } catch (e) { + toolResults.push({ + type: "tool_result", + tool_use_id: tu.id, + content: `tool error: ${(e as Error).message}`, + is_error: true, + }); + } + } + + messages.push({ role: "user", content: toolResults }); + + /* + * Refresh snap + appstate after every batch of tool calls that + * changed device state. Without this the LLM keeps seeing the + * pre-step snapshot even after its fill/press took effect, so + * identical fills get caught by the seen-hash dedup and the LLM + * burns its budget retrying actions it already performed. + * dispatchTool's snapshot/wait_for/back/dismiss callbacks already + * refresh; fill and press do not. 
/**
 * Tells whether a tool name belongs to the set of tools treated as
 * state-changing: `fill`, `press`, `back`, `dismiss_keyboard`, `wait` and
 * `wait_for`. The comparison is exact and case-sensitive.
 *
 * @param name - Tool name as it appears in the tool call.
 * @returns `true` for the six tools listed above, `false` otherwise.
 */
function isStateChangingTool(name: string): boolean {
  switch (name) {
    case "fill":
    case "press":
    case "back":
    case "dismiss_keyboard":
    case "wait":
    case "wait_for":
      return true;
    default:
      return false;
  }
}
/**
 * Removes ASCII control characters from `s`, keeping tab (0x09) and line
 * feed (0x0a). Anything at or above 0x20 is preserved, including DEL and
 * all non-ASCII text; carriage returns are dropped.
 *
 * The string is walked by code point, so surrogate pairs stay intact.
 *
 * @param s - Raw node text.
 * @returns `s` without the stripped control characters.
 */
function sanitizeText(s: string): string {
  const TAB = 0x09;
  const LINE_FEED = 0x0a;
  const FIRST_PRINTABLE = 0x20;

  return Array.from(s)
    .filter((glyph) => {
      const code = glyph.charCodeAt(0);
      return code >= FIRST_PRINTABLE || code === TAB || code === LINE_FEED;
    })
    .join("");
}
""); + const matches = adCli.findInSnapshot(ctx.snap, needle).map((n) => ({ + ref: n.ref, + kind: n.kind, + text: n.text, + editable: n.editable, + })); + return { content: JSON.stringify({ matches, count: matches.length }) }; + } + case "fill": { + const ref = String(input.ref ?? ""); + const text = String(input.text ?? ""); + const node = ctx.snap.nodes.find((n) => n.ref === ref); + if (!node) { + ctx.onPhantom(); + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill phantom ref=${ref} text="${text.slice(0, 30)}…"`, + ); + } + return { + content: `phantom ref ${ref} not in current snapshot`, + isError: true, + }; + } + ctx.resetPhantom(); + try { + adCli.fill(ref, text); + } catch (e) { + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill THREW ref=${ref} text="${text.slice(0, 30)}…" err=${(e as Error).message.slice(0, 100)}`, + ); + } + throw e; + } + const loc = refToLocator(ctx.snap, ref); + if (DEBUG_LLM) { + log( + `::debug::dispatch.fill ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} text="${text.slice(0, 30)}…" executed_len_after=${ctx.executed.length + (loc ? 1 : 0)}`, + ); + } + if (loc) { + ctx.executed.push({ tool: "fill", locator: loc, text, ref }); + } + return { content: `filled ${ref}` }; + } + case "press": { + const ref = String(input.ref ?? ""); + const node = ctx.snap.nodes.find((n) => n.ref === ref); + if (!node) { + ctx.onPhantom(); + if (DEBUG_LLM) { + log(`::debug::dispatch.press phantom ref=${ref}`); + } + return { + content: `phantom ref ${ref} not in current snapshot`, + isError: true, + }; + } + ctx.resetPhantom(); + try { + adCli.press(ref); + } catch (e) { + if (DEBUG_LLM) { + log( + `::debug::dispatch.press THREW ref=${ref} err=${(e as Error).message.slice(0, 100)}`, + ); + } + throw e; + } + const loc = refToLocator(ctx.snap, ref); + if (DEBUG_LLM) { + log( + `::debug::dispatch.press ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} executed_len_after=${ctx.executed.length + (loc ? 
1 : 0)}`, + ); + } + if (loc) { + ctx.executed.push({ tool: "press", locator: loc, ref }); + } + return { content: `pressed ${ref}` }; + } + case "wait": { + const ms = Math.min(2_000, Math.max(1, Number(input.ms ?? 0))); + log(`::warning::LLM used wait(${ms}) — prefer wait_for`); + await sleep(ms); + ctx.executed.push({ tool: "wait", ms }); + return { content: `slept ${ms}ms` }; + } + case "wait_for": { + const predicate = String(input.predicate ?? ""); + const timeoutMs = Math.min( + 10_000, + Math.max(250, Number(input.timeout_ms ?? 5_000)), + ); + const r = await runWaitFor(predicate, timeoutMs); + ctx.executed.push({ tool: "wait_for", predicate, timeoutMs }); + ctx.onSnap(adCli.snapshot()); + ctx.onApp(adCli.appstate()); + return { + content: r.ok + ? "predicate satisfied" + : `wait_for timed out: ${r.reason}`, + isError: !r.ok, + }; + } + case "back": + platform.back(); + ctx.executed.push({ tool: "back" }); + ctx.onSnap(adCli.snapshot()); + return { content: "back pressed" }; + case "dismiss_keyboard": + platform.dismissKeyboard(); + ctx.executed.push({ tool: "dismiss_keyboard" }); + ctx.onSnap(adCli.snapshot()); + return { content: "keyboard dismissed" }; + case "assert": { + const predicate = String(input.predicate ?? ""); + const ev = evaluateExpect(predicate, ctx.snap, ctx.app); + return { content: JSON.stringify(ev), isError: !ev.ok }; + } + case "appstate": { + const a = adCli.appstate(); + ctx.onApp(a); + return { content: JSON.stringify(a) }; + } + case "step_complete": + return { content: "step accepted by runner", terminal: "complete" }; + case "step_failed": + return { + content: "step rejected by LLM", + terminal: "failed", + reason: String(input.reason ?? 
"no reason given"), + }; + default: + return { content: `unknown tool: ${name}`, isError: true }; + } +} + +function takeScreenshot(filename: string): string { + const p = path.join(ARTIFACTS_DIR, filename); + adCli.screenshotBase64(p); + return fs.readFileSync(p).toString("base64"); +} + +/* ---- bash fallback ---------------------------------------------------- */ + +/* + * Mirrors Phase 0's bash logic for the SignIn flow. Used when: + * - ANTHROPIC_API_KEY is missing + * - The Anthropic API exhausts retries with HTTP errors + * - The LLM gives up via step_failed (rare; mostly defensive) + * + * Only the SignIn-flow steps are covered. Adding a new test case + * without LLM access requires extending this map. That's intentional: + * the bash fallback is a safety net for known flows, not a generic + * drop-in for the LLM. + */ + +async function runBashFallback( + step: Step, +): Promise< + { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string } +> { + const text = step.text.toLowerCase(); + + if (text.includes("wait") && text.includes("signin")) { + /* Boot dance already gated on this; an instant pass is fine. */ + return { ok: true, actions: [] }; + } + + if (text.includes("enter") && text.includes("email")) { + const m = step.text.match(/"([^"]+)"/); + if (!m) { + return { + ok: false, + reason: "bash fallback could not extract email from step text", + }; + } + const snap = adCli.snapshot(); + const field = snap.nodes.find( + (n) => + n.editable && + (n.kind === "text-field" || + (n.text?.toLowerCase().includes("phone") ?? false)), + ); + if (!field) { + return { + ok: false, + reason: "bash fallback: no editable text-field for email entry", + }; + } + adCli.fill(field.ref, m[1]); + const loc = refToLocator(snap, field.ref); + return { + ok: true, + actions: loc ? 
/**
 * Installs the process-level cleanup hooks: on `exit`, `SIGINT` and
 * `SIGTERM` the runner dumps device logs, closes the agent-device session
 * and sends SIGTERM to every tracked background process group.
 *
 * The cleanup body runs at most once (guarded by `cleanedUp`). Each step is
 * isolated so that a failure in one (for example the log dump or the
 * session close throwing) cannot prevent the remaining steps from running —
 * in particular the background process groups are always signalled.
 *
 * The signal handlers exit with the conventional 128 + signal number codes
 * (130 for SIGINT, 143 for SIGTERM).
 */
function registerCleanup(): void {
  /* Runs one cleanup step and reports, rather than propagates, a failure. */
  const attempt = (label: string, step: () => void): void => {
    try {
      step();
    } catch (e: unknown) {
      const detail = e instanceof Error ? e.message : String(e);
      log(`cleanup: ${label} failed: ${detail}`);
    }
  };

  const handler = (): void => {
    if (cleanedUp) {
      return;
    }
    cleanedUp = true;
    attempt("log dump", () => {
      platform.dumpLogsToFile(path.join(ARTIFACTS_DIR, "logcat.txt"));
    });
    attempt("session close", () => {
      adCli.closeSession();
    });
    for (const pid of backgroundPids) {
      try {
        /* Negative pid: signal the whole process group, not just the leader. */
        process.kill(-pid, "SIGTERM");
      } catch {
        /* already gone */
      }
    }
  };

  process.on("exit", handler);
  process.on("SIGINT", () => {
    handler();
    process.exit(130);
  });
  process.on("SIGTERM", () => {
    handler();
    process.exit(143);
  });
}
/**
 * Returns a promise that resolves (with no value) once a `setTimeout` of
 * `ms` milliseconds fires. It never rejects.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
+ */ +export interface Platform { + readonly name: PlatformName; + + /** App bundle / package identifier passed to `agent-device open`. */ + readonly appPackage: string; + + /** + * Directory the runner searches for the installable bundle + * (APK on Android, .app on iOS). The first matching entry wins. + */ + readonly appBundleDir: string; + readonly appBundleSuffix: string; + + /** + * One-shot install of the located bundle. Throws on hard failure; + * the driver surfaces the error to the workflow log. + */ + install(bundlePath: string): void; + + /** + * Best-effort networking prep so Metro on the host is reachable + * from the device/sim. Android needs `adb reverse`; iOS Sim + * shares host loopback and this is a no-op. + */ + setupNetworking(): void; + + /** + * Best-effort pre-launch hardening — disable autofill, suppress + * system error dialogs, etc. Implementations should swallow + * failures (a missing setting on a fresh AVD is fine). + */ + preBootHardening(): void; + + /** + * Launch the app via `agent-device open --relaunch`. Handles the + * platform-specific `--platform`/`--serial`/`--device` flag set. + */ + launch(): void; + + /** + * Force a clean relaunch — used by blocking-dialog recovery. + * Android: `am force-stop` + relaunch. iOS: `xcrun simctl + * terminate` + relaunch. + */ + forceRelaunch(): void; + + /** + * Detect a system-modal "blocking" dialog over the app. Android's + * ANR dialog and iOS's permission alerts share the shape: a + * small handful of system buttons whose conservative choice + * lets the app continue. Returns true if dismissed. + */ + tryDismissBlockingDialog(snap: Snapshot): boolean; + + /** + * Map LLM-facing `back()` / `dismiss_keyboard()` tool calls to + * platform-specific keyevents. + */ + back(): void; + dismissKeyboard(): void; + + /** + * Dump device logs to the given file. Called by the driver's + * cleanup trap on exit. Best-effort — missing logs must not + * fail the run. 
/**
 * Android implementation of {@link Platform}, driven through `adb` and the
 * `agent-device` CLI.
 *
 * Methods documented as best-effort swallow failures; the others let the
 * `execFileSync` error propagate to the caller.
 */
class AndroidPlatform implements Platform {
  readonly name = "android" as const;
  readonly appPackage = process.env.APP_PACKAGE ?? "com.expensify.chat.dev";
  readonly appBundleDir = "android/app/build/outputs/apk/development/debug";
  readonly appBundleSuffix = ".apk";

  /** Installs the APK with `adb install -r -d -t`; throws if adb fails. */
  install(apkPath: string): void {
    execFileSync("adb", ["install", "-r", "-d", "-t", apkPath], {
      stdio: "inherit",
    });
  }

  /** Reverse-forwards device port 8081 to the host; throws if adb fails. */
  setupNetworking(): void {
    execFileSync("adb", ["reverse", "tcp:8081", "tcp:8081"], {
      stdio: "inherit",
    });
  }

  /** Applies the pre-launch device settings. Best effort: never throws. */
  preBootHardening(): void {
    /*
     * Suppress system ANR dialogs. Without this, the Pixel
     * Launcher's "isn't responding" dialog covers our app on
     * the 2-core ubuntu-latest runner during heavy boot load.
     * The underlying ANR still happens but the foreground app
     * keeps running uncovered.
     */
    this.putSetting("global", "hide_error_dialogs", "1");

    /*
     * Disable Android Autofill globally. Without this, the
     * framework silently populates editable fields when they
     * gain focus and a credential is cached on the AVD —
     * cache recording then misses the fill action and replay
     * breaks on a different AVD snapshot.
     */
    this.putSetting("secure", "autofill_service", "null");
  }

  /** Opens the app via `agent-device open --relaunch`; throws on failure. */
  launch(): void {
    execFileSync("agent-device", this.openArgs(), { stdio: "inherit" });
  }

  /**
   * Force-stops the app and relaunches it. Neither step is fatal: a failure
   * is written to stdout (truncated to 80 characters) and execution
   * continues.
   */
  forceRelaunch(): void {
    try {
      execFileSync("adb", ["shell", "am", "force-stop", this.appPackage], {
        timeout: 5_000,
        stdio: "ignore",
      });
    } catch (e: unknown) {
      // Surface to caller via log line; not fatal.
      this.warn("force-stop failed", e);
    }
    try {
      /* openArgs() is evaluated inside the try so a serial lookup failure is logged too. */
      execFileSync("agent-device", this.openArgs(), {
        timeout: 30_000,
        stdio: "ignore",
      });
    } catch (e: unknown) {
      this.warn("relaunch failed", e);
    }
  }

  /**
   * Detects the Android ANR dialog and recovers from it.
   *
   * @returns `false` when the snapshot does not look like an ANR dialog.
   *   `true` when it does — in that case "Wait" was pressed (a press
   *   failure is only logged) and the app was force-relaunched.
   */
  tryDismissBlockingDialog(snap: Snapshot): boolean {
    /*
     * Android ANR dialog signature: exactly two buttons labelled
     * "Close app" and "Wait". The label varies slightly
     * (Pixel Launcher / com.android.systemui / etc.) but the
     * structural fingerprint stays.
     */
    const buttons = snap.nodes.filter((n) => n.kind === "button");
    if (buttons.length !== 2) {
      return false;
    }
    const labels = buttons.map((b) => b.text?.toLowerCase() ?? "").sort();
    if (labels[0] !== "close app" || labels[1] !== "wait") {
      return false;
    }
    try {
      const waitBtn = buttons.find((b) => b.text?.toLowerCase() === "wait");
      if (waitBtn) {
        adCli.press(waitBtn.ref);
      }
    } catch (e: unknown) {
      this.warn("dismiss press failed", e);
    }
    this.forceRelaunch();
    return true;
  }

  /** Sends keyevent 4 (Android BACK) through `adb shell input`. */
  back(): void {
    this.sendKeyEvent("4");
  }

  /** Sends keyevent 111 (Android ESCAPE) through `adb shell input`. */
  dismissKeyboard(): void {
    this.sendKeyEvent("111");
  }

  /**
   * Writes `adb logcat -d` output (warnings and above, plus verbose React
   * Native tags) to `outPath`. Best effort: failing to open the file or
   * to run adb is swallowed. The output descriptor is always closed.
   */
  dumpLogsToFile(outPath: string): void {
    let fd: number | undefined;
    try {
      fd = fs.openSync(outPath, "w");
      execFileSync(
        "adb",
        [
          "logcat",
          "-d",
          "-v",
          "time",
          "*:W",
          "ReactNativeJS:V",
          "ReactNative:V",
        ],
        {
          stdio: ["ignore", fd, "ignore"],
        },
      );
    } catch {
      /* best effort */
    } finally {
      if (fd !== undefined) {
        try {
          fs.closeSync(fd);
        } catch {
          /* best effort */
        }
      }
    }
  }

  /** Argument list shared by `launch` and `forceRelaunch`. */
  private openArgs(): string[] {
    return [
      "open",
      this.appPackage,
      "--platform",
      "android",
      "--serial",
      this.getSerial(),
      "--session",
      SESSION,
      "--relaunch",
    ];
  }

  /** Runs `adb shell settings put …`, ignoring any failure. */
  private putSetting(namespace: string, key: string, value: string): void {
    try {
      execFileSync("adb", ["shell", "settings", "put", namespace, key, value], {
        timeout: 5_000,
        stdio: "ignore",
      });
    } catch {
      /* best effort */
    }
  }

  private sendKeyEvent(code: string): void {
    execFileSync("adb", ["shell", "input", "keyevent", code], {
      timeout: 30_000,
      encoding: "utf8",
    });
  }

  /** Writes a non-fatal failure line to stdout, message capped at 80 chars. */
  private warn(what: string, e: unknown): void {
    const detail = e instanceof Error ? e.message : String(e);
    process.stdout.write(`platform.android: ${what}: ${detail.slice(0, 80)}\n`);
  }

  private getSerial(): string {
    return execFileSync("adb", ["get-serialno"], { encoding: "utf8" }).trim();
  }
}
/**
 * Resolves the installable bundle path under `platform.appBundleDir`.
 *
 * Directory entries whose name ends with `platform.appBundleSuffix` are
 * sorted with the default string ordering and the first one is returned,
 * joined onto the directory.
 *
 * @param platform - Supplies the directory to search and the name suffix.
 * @returns The path of the first matching entry, or `null` when the
 *   directory is missing, is not a directory, or holds no matching entry.
 *   The driver decides how to report a missing bundle.
 * @throws Any other filesystem error from reading the directory (for
 *   example a permissions failure) is rethrown unchanged.
 */
export function locateBundle(platform: Platform): string | null {
  let entries: string[];
  try {
    /*
     * Read directly instead of existsSync + readdirSync: that pair throws
     * when the path exists but is not a directory, and races with deletion.
     */
    entries = fs.readdirSync(platform.appBundleDir);
  } catch (e: unknown) {
    const code =
      e instanceof Error && "code" in e
        ? (e as { code?: unknown }).code
        : undefined;
    if (code === "ENOENT" || code === "ENOTDIR") {
      return null;
    }
    throw e;
  }

  const [first] = entries
    .filter((entry) => entry.endsWith(platform.appBundleSuffix))
    .sort();
  return first === undefined ? null : path.join(platform.appBundleDir, first);
}
/*
 * Replay-cache schema (actions, steps, file envelope) plus the hashing,
 * loading and lookup helpers that operate on it.
 */

import { createHash } from "crypto";
import fs from "fs";
import path from "path";
import type { RoleLocator } from "./agent-device-snapshot-signature";

/**
 * One recorded tool call. `fill` and `press` address their target through
 * a `RoleLocator` rather than a concrete ref; the remaining tools carry
 * only their own scalar arguments.
 */
export type CachedAction =
  | { tool: "fill"; locator: RoleLocator; text: string }
  | { tool: "press"; locator: RoleLocator }
  | { tool: "back" }
  | { tool: "dismiss_keyboard" }
  | { tool: "wait"; ms: number }
  | { tool: "wait_for"; predicate: string; timeoutMs: number };

/** One recorded test-case step: its signatures, actions and expectation. */
export type CachedStep = {
  stepNumber: number;
  stepTextHash: string;
  preSignature: string;
  postSignature: string;
  actions: CachedAction[];
  expect: string | null;
  recordedAt: string;
  runId: string;
};

/** On-disk envelope of the cache file (schema version 1). */
export type CacheV1 = {
  version: 1;
  model: string;
  testCaseHash: string;
  steps: CachedStep[];
};

/** Returns the first 16 hex characters of the SHA-256 digest of `s`. */
export function hashText(s: string): string {
  return createHash("sha256").update(s).digest("hex").slice(0, 16);
}

/**
 * Loads the cache stored at `filePath`.
 *
 * When the file does not exist, an empty version-1 cache seeded with
 * `model` and `testCaseHash` is returned. Otherwise the file is parsed and
 * its envelope is validated before it is handed back, so a truncated or
 * hand-edited file fails here with the path in the message instead of
 * later inside `lookup`.
 *
 * NOTE(review): `model` and `testCaseHash` are only used for the
 * file-missing case; a loaded file carrying a different `testCaseHash` is
 * returned as-is. Presumably the caller compares them — confirm.
 *
 * @throws Error when the file is not a JSON object, its `version` is not
 *   1, or it has no `steps` array. A `SyntaxError` from `JSON.parse`
 *   propagates unchanged.
 */
export function loadCache(
  filePath: string,
  model: string,
  testCaseHash: string,
): CacheV1 {
  if (!fs.existsSync(filePath)) {
    return { version: 1, model, testCaseHash, steps: [] };
  }
  const parsed: unknown = JSON.parse(fs.readFileSync(filePath, "utf8"));
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Cache at ${filePath} is not a JSON object`);
  }
  const { version, steps } = parsed as { version?: unknown; steps?: unknown };
  if (version !== 1) {
    throw new Error(
      `Cache version mismatch at ${filePath}: expected 1, got ${String(version)}`,
    );
  }
  if (!Array.isArray(steps)) {
    throw new Error(`Cache at ${filePath} has no "steps" array`);
  }
  return parsed as CacheV1;
}

/**
 * Returns the recorded step whose `stepNumber` and `preSignature` both
 * match, or null when there is none.
 *
 * Only those two fields are compared here:
 *   1. step_number   — we're at the right step in the sequence
 *   2. pre_signature — we're staring at the same UI shape we recorded
 *
 * NOTE(review): the cache's `testCaseHash` (has the test file been
 * edited?) is not consulted by this function; that check has to happen
 * in the caller — confirm it does.
 *
 * If any drift, we fall through to the LLM and (on success) the
 * runner emits a cache-diff to artifacts. The PR check fails red,
 * forcing the contributor to commit the updated cache.
 */
export function lookup(
  cache: CacheV1,
  stepNumber: number,
  preSignature: string,
): CachedStep | null {
  const hit = cache.steps.find(
    (s) => s.stepNumber === stepNumber && s.preSignature === preSignature,
  );
  return hit ?? null;
}
null + ); +} + +export function diff(committed: CacheV1, recorded: CacheV1): string { + const lines: string[] = []; + for (const s of recorded.steps) { + const prior = committed.steps.find((c) => c.stepNumber === s.stepNumber); + if (!prior) { + lines.push( + `+ step ${s.stepNumber}: NEW (pre=${s.preSignature}, post=${s.postSignature})`, + ); + continue; + } + if (prior.preSignature !== s.preSignature) { + lines.push( + `~ step ${s.stepNumber}: pre_signature ${prior.preSignature} → ${s.preSignature}`, + ); + } + if (prior.postSignature !== s.postSignature) { + lines.push( + `~ step ${s.stepNumber}: post_signature ${prior.postSignature} → ${s.postSignature}`, + ); + } + if (JSON.stringify(prior.actions) !== JSON.stringify(s.actions)) { + lines.push( + `~ step ${s.stepNumber}: actions changed (${prior.actions.length} → ${s.actions.length})`, + ); + } + } + return lines.join("\n"); +} + +export function writeCache(filePath: string, cache: CacheV1): void { + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, `${JSON.stringify(cache, null, 2)}\n`); +} diff --git a/.github/scripts/agent-device-snapshot-signature.ts b/.github/scripts/agent-device-snapshot-signature.ts new file mode 100644 index 000000000000..d6681863e6e3 --- /dev/null +++ b/.github/scripts/agent-device-snapshot-signature.ts @@ -0,0 +1,122 @@ +/* + * Structural signature of a UI snapshot. + * + * The signature is the cache key for the replay system: cache hits replay + * recorded actions, cache misses fall back to the LLM. For that to work, + * the signature must be: + * + * 1. STABLE across cosmetic UI changes — locale rotation, A/B copy + * tests, visible user data, dynamic timestamps. We exclude visible + * `text` content for this reason. A label changing from + * "Continue" to "Submit" must NOT bust the cache (the replay layer + * finds the button by role + position, then the LLM recovery layer + * handles a real shape change if any). + * + * 2. 
SENSITIVE to structural change — a new button appearing, an + * input becoming non-editable, a screen transitioning to a + * different layout. These are the events that invalidate a + * recorded action sequence. + * + * Net effect: localization or copy churn doesn't trigger an LLM call, + * but real UI shape change does. + */ + +import { createHash } from "crypto"; +import type { Snapshot, SnapshotNode } from "./agent-device-cli"; + +function project(node: SnapshotNode): string { + return [ + node.kind, + node.text ? "T1" : "T0", + node.editable ? "E1" : "E0", + node.enabled ? "N1" : "N0", + node.scrollable ? "S1" : "S0", + ].join("|"); +} + +/** + * Transient nodes the signature must ignore. + * + * React Native dev-mode renders an inline "!, " bubble for + * runtime warnings (StrictMode, dev-only assertions, etc.). These + * appear and disappear between runs depending on bundler timing and + * warning suppression state — same screen, different node count. + * Runs 25659967543 and 25662443061 produced different signatures on + * an identical SignIn screen because one had 3 extra dev-warning + * nodes the other didn't, and cache replay never landed. + * + * These warnings are dev-only, never reach release builds, and never + * mean anything to a user — exactly the kind of cosmetic node the + * structural signature should disregard. 
/**
 * Returns true when `node` matches one of the known dev-warning shapes:
 * a `group` whose text starts with "!, ", or a `text` node that is exactly
 * "!" or starts with one of the two warning strings below. Nodes without
 * text never match.
 */
function isTransientDevWarning(node: SnapshotNode): boolean {
  if (!node.text) {
    return false;
  }
  if (node.kind === "group" && node.text.startsWith("!, ")) {
    return true;
  }
  if (node.kind === "text" && node.text === "!") {
    return true;
  }
  if (
    node.kind === "text" &&
    node.text.startsWith("Open debugger to view warnings")
  ) {
    return true;
  }
  if (
    node.kind === "text" &&
    node.text.startsWith("The result of getSnapshot")
  ) {
    return true;
  }
  return false;
}

/**
 * Computes the structural signature of `snap`: every node that is not a
 * transient dev warning is projected (via `project`, defined earlier in
 * this file) to one row, the rows are joined with newlines, and the first
 * 16 hex characters of the SHA-256 digest are returned.
 */
export function snapshotSignature(snap: Snapshot): string {
  const projected = snap.nodes
    .filter((n) => !isTransientDevWarning(n))
    .map(project)
    .join("\n");
  return createHash("sha256").update(projected).digest("hex").slice(0, 16);
}

/**
 * Locator that survives across runs even though `@eN` refs do not.
 * The runner re-resolves to a concrete `@ref` against the live
 * snapshot at replay time.
 *
 * Example: `{kind: "text-field", index: 0, editable: true}` →
 * "the first editable text-field in the current snapshot".
 */
export type RoleLocator = {
  kind: string;
  index: number;
  editable?: boolean;
};

/**
 * Single source of truth for "does this node belong to the candidate list
 * of a locator with this kind/editable pair". When `editable` is
 * undefined the editable flag is not constrained.
 */
function matchesRole(
  node: SnapshotNode,
  kind: string,
  editable: boolean | undefined,
): boolean {
  return (
    node.kind === kind && (editable === undefined || node.editable === editable)
  );
}

/**
 * Builds the `RoleLocator` for the node carrying `ref`.
 *
 * `index` is the node's position within exactly the candidate list that
 * `locatorToRef` builds for the returned locator (same kind; editable
 * nodes only when the target itself is editable). Counting over all
 * same-kind nodes instead would yield an index that `locatorToRef`
 * resolves to a different node, or to null, whenever a non-editable node
 * of the same kind precedes an editable target.
 *
 * @returns the locator, or null when no node in `snap` has `ref`.
 */
export function refToLocator(snap: Snapshot, ref: string): RoleLocator | null {
  const target = snap.nodes.find((n) => n.ref === ref);
  if (!target) {
    return null;
  }
  // Non-editable targets leave `editable` undefined so the locator does
  // not constrain on it (same shape the cache file already records).
  const editable = target.editable || undefined;
  const index = snap.nodes
    .filter((n) => matchesRole(n, target.kind, editable))
    .findIndex((n) => n.ref === ref);
  if (index < 0) {
    return null;
  }
  return { kind: target.kind, index, editable };
}

/**
 * Resolves `loc` against `snap`: collects the nodes of `loc.kind`
 * (restricted by `loc.editable` when it is set) in snapshot order and
 * returns the ref of the one at `loc.index`.
 *
 * @returns the ref, or null when fewer than `loc.index + 1` nodes match.
 */
export function locatorToRef(snap: Snapshot, loc: RoleLocator): string | null {
  const matches = snap.nodes.filter((n) =>
    matchesRole(n, loc.kind, loc.editable),
  );
  return matches[loc.index]?.ref ?? null;
}
null; +} diff --git a/.github/workflows/smokeAndroidLLM.yml b/.github/workflows/smokeAndroidLLM.yml new file mode 100644 index 000000000000..4c32238f5dba --- /dev/null +++ b/.github/workflows/smokeAndroidLLM.yml @@ -0,0 +1,156 @@ +name: Android Smoke (agent-device · Phase 1, LLM-driven) + +# Phase-1 build-health canary: same emulator + APK + boot dance as +# Phase 0, but the test steps are now plain English and an LLM driver +# (Claude Sonnet) figures out which agent-device calls to make. A +# committed replay cache at tests/smoke/cache/.json keeps the +# happy path deterministic and ~$0 in API spend; cache misses fall +# back to the LLM, and final-tier failures (API down, LLM gives up) +# fall back to a deterministic Phase-0-style bash recipe so an +# Anthropic outage doesn't fail the build. +# +# Initial rollout: continue-on-error: true so this is non-blocking +# while we compare reliability against Phase 0 over a 2-week window. +# Once flake rate <= Phase 0's, flip to required and retire Phase 0. + +on: + pull_request: + types: [opened, synchronize] + branches-ignore: [staging, production] + # Don't ignore tests/ or .github/ — Phase 1 fires on changes to + # the test cases, the runner scripts, and the workflow itself. + paths-ignore: + - docs/** + - help/** + - contributingGuides/** + - "**.md" + workflow_dispatch: + +concurrency: + group: smoke-android-llm-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +jobs: + smoke: + name: Android emulator smoke (LLM-driven) + if: ${{ github.actor != 'OSBotify' }} + # Non-blocking during the rollout window. The recommendation in the + # Phase 1 plan is to flip this to false (or remove the line) after + # 2 weeks if Phase 1's flake rate <= Phase 0's. + continue-on-error: true + runs-on: blacksmith-4vcpu-ubuntu-2404 + timeout-minutes: 35 + env: + AGENT_DEVICE_VERSION: "0.14.7" + # Hard kill-switch: total input+output tokens accumulated across + # the run. 
Bounds runaway spend if a prompt or tool design + # accidentally explodes context. ~$1 worst-case at sonnet 4.6 + # rates without prompt cache; in practice with prompt cache the + # happy path uses 5-10x less. + LLM_TOKEN_BUDGET: "200000" + ANTHROPIC_MODEL: "claude-sonnet-4-6" + + steps: + - name: Checkout + # v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + with: + submodules: true + token: ${{ secrets.OS_BOTIFY_TOKEN }} + + - name: Verify KVM / fix permissions if needed + run: | + if ! ls -la /dev/kvm 2>/dev/null; then + echo "::error::No /dev/kvm on this runner — emulator will fall back to TCG and the job will time out" + exit 1 + fi + if [ ! -w /dev/kvm ]; then + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \ + | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + fi + + - name: Setup Java + uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 + with: + distribution: temurin + java-version: "17" + + - name: Setup Node + uses: ./.github/actions/composite/setupNode + with: + IS_HYBRID_BUILD: "false" + + - name: Install agent-device CLI + run: npm install -g "agent-device@${AGENT_DEVICE_VERSION}" + + - name: Configure AWS credentials (Rock S3 cache) + uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Configure MapBox SDK + run: ./scripts/setup-mapbox-sdk.sh ${{ secrets.MAPBOX_SDK_DOWNLOAD_TOKEN }} + + - name: Install Android CMake 3.30.5 (Hermes pins this exact version) + run: | + yes | "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --licenses > /dev/null 2>&1 || true + "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --install "cmake;3.30.5" 2>&1 | tail -5 + + - name: Build / fetch developmentDebug APK via Rock + env: + 
STANDALONE_NEW_DOT: "true" + run: npx rock build:android --variant developmentDebug + + - name: AVD cache + uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb + id: avd-cache + with: + path: | + ~/.android/avd/* + ~/.android/adb* + ~/.android/adbkey + ~/.android/adbkey.pub + key: avd-pixel8-api35-x86_64-v1-${{ hashFiles('.github/workflows/smokeAndroidLLM.yml') }} + + - name: Prime AVD snapshot (cache miss only) + if: steps.avd-cache.outputs.cache-hit != 'true' + uses: reactivecircus/android-emulator-runner@v2 + with: + api-level: 35 + target: google_apis + arch: x86_64 + profile: pixel_8 + force-avd-creation: false + emulator-options: -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim + disable-animations: false + script: | + adb wait-for-device + until [ -n "$(adb shell getprop sys.boot_completed | tr -d '\r')" ]; do sleep 2; done + echo "AVD primed" + + - name: Run smoke (LLM-driven) + uses: reactivecircus/android-emulator-runner@v2 + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + with: + api-level: 35 + target: google_apis + arch: x86_64 + profile: pixel_8 + force-avd-creation: false + emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim + disable-animations: true + script: npm run smoke:android:llm + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f + with: + name: smoke-android-llm-${{ github.run_id }}-${{ github.run_attempt }} + path: artifacts/ + if-no-files-found: warn + retention-days: 14 diff --git a/package.json b/package.json index 3681ee193d4e..c526ddc80729 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,7 @@ "createDocsRoutes": "ts-node .github/scripts/createDocsRoutes.ts", "generateAllowedUrls": "ts-node .github/scripts/generateAllowedUrls.ts", "detectRedirectCycle": "ts-node .github/scripts/detectRedirectCycle.ts", + "smoke:android:llm": "ts-node .github/scripts/agent-device-llm-driver.ts", 
"ios-build": "bundle exec fastlane ios build_unsigned", "ios-hybrid-build": "bundle exec fastlane ios build_unsigned_hybrid", "android-build": "bundle exec fastlane android build_local", diff --git a/tests/smoke/android-signin.testcase.txt b/tests/smoke/android-signin.testcase.txt new file mode 100644 index 000000000000..b0b9add4c232 --- /dev/null +++ b/tests/smoke/android-signin.testcase.txt @@ -0,0 +1,24 @@ +# Phase-1 LLM-driven Android smoke — SignIn flow. +# +# Each step is plain English the LLM reads to decide what UI actions to +# take. The optional `expect:` line is a machine-checked postcondition +# evaluated by the runner (NOT the LLM) after the step's tool calls +# complete; it is what gives the canary a hard pass/fail signal +# independent of the LLM's self-assessment. +# +# Expect predicates supported (see .github/scripts/agent-device-expect.ts): +# snapshot.contains_text("...") +# snapshot.field_with_text("...").exists +# appstate.foreground == "..." + +1. Wait for the app to fully load and the SignIn screen to appear. + expect: snapshot.contains_text("Phone or email") + +2. Enter "rustam.zeinalov@callstack.com" into the email/phone field. + expect: snapshot.field_with_text("rustam.zeinalov@callstack.com").exists + +3. Press the Continue button. + expect: appstate.foreground == "com.expensify.chat.dev" + +4. Wait for the magic-code screen to appear. 
+ expect: snapshot.contains_text("Magic code") diff --git a/tests/smoke/cache/android-signin.testcase.json b/tests/smoke/cache/android-signin.testcase.json new file mode 100644 index 000000000000..4dffe68f81f7 --- /dev/null +++ b/tests/smoke/cache/android-signin.testcase.json @@ -0,0 +1,74 @@ +{ + "version": 1, + "model": "claude-sonnet-4-6", + "testCaseHash": "377c89ecd3182b95", + "steps": [ + { + "stepNumber": 1, + "stepTextHash": "eefe04289ed44849", + "preSignature": "bada22fec79afdc7", + "postSignature": "bada22fec79afdc7", + "actions": [], + "expect": "snapshot.contains_text(\"Phone or email\")", + "recordedAt": "2026-05-11T10:29:23.672Z", + "runId": "25659967543" + }, + { + "stepNumber": 2, + "stepTextHash": "988a4a6e077e6dc6", + "preSignature": "bada22fec79afdc7", + "postSignature": "04ba1966c1ae1c5f", + "actions": [ + { + "tool": "fill", + "locator": { + "kind": "text-field", + "index": 0, + "editable": true + }, + "text": "rustam.zeinalov@callstack.com" + } + ], + "expect": "snapshot.field_with_text(\"rustam.zeinalov@callstack.com\").exists", + "recordedAt": "2026-05-11T10:29:23.673Z", + "runId": "25659967543" + }, + { + "stepNumber": 3, + "stepTextHash": "a071b334a6b8c0f4", + "preSignature": "04ba1966c1ae1c5f", + "postSignature": "33d1e5d0787b275a", + "actions": [ + { + "tool": "press", + "locator": { + "kind": "button", + "index": 0 + } + }, + { + "tool": "dismiss_keyboard" + } + ], + "expect": "appstate.foreground == \"com.expensify.chat.dev\"", + "recordedAt": "2026-05-11T10:29:23.673Z", + "runId": "25659967543" + }, + { + "stepNumber": 4, + "stepTextHash": "a1059919be2f42c9", + "preSignature": "33d1e5d0787b275a", + "postSignature": "33d1e5d0787b275a", + "actions": [ + { + "tool": "wait_for", + "predicate": "snapshot.contains_text(\"Magic code\")", + "timeoutMs": 60000 + } + ], + "expect": "snapshot.contains_text(\"Magic code\")", + "recordedAt": "2026-05-11T10:29:23.673Z", + "runId": "25659967543" + } + ] +}