diff --git a/.github/scripts/agent-device-cli.ts b/.github/scripts/agent-device-cli.ts
new file mode 100644
index 000000000000..c70c76b3938f
--- /dev/null
+++ b/.github/scripts/agent-device-cli.ts
@@ -0,0 +1,207 @@
+/*
+ * Thin TypeScript wrapper around the `agent-device` CLI.
+ *
+ * Why this exists: the CLI emits accessibility-tree snapshots as
+ * human-readable text (`@e4 [text-field] "Phone or email," [editable]`).
+ * That format is fine for humans grepping artifacts but bad for an LLM
+ * because:
+ *   1. The LLM has to re-tokenize the structure on every turn — wasteful.
+ *   2. Subtle whitespace/quoting differences across platforms (Android's
+ *      trailing comma vs iOS's no comma) leak into the LLM's reasoning.
+ *   3. Phantom hallucinated refs are harder to detect against free text.
+ *
+ * We parse once here, hand the LLM a typed JSON array, and keep the raw
+ * text in the artifact for post-mortem.
+ */
+
+import { execFileSync } from "child_process";
+
+/**
+ * One element in the parsed accessibility tree. The optional fields are
+ * absent when the underlying line lacked them; do NOT default to empty
+ * strings — the LLM uses presence/absence as a signal (e.g. a button with
+ * no text label is suspicious).
+ */
+export type SnapshotNode = {
+  /** Element handle including the leading `@`, e.g. `@e4`. */
+  ref: string;
+  /** Bracketed element kind taken from the line, e.g. `text-field`. */
+  kind: string;
+  /** Quoted label with one trailing comma removed; undefined without a label. */
+  text?: string;
+  /** True when the line carries an `[editable]` flag. */
+  editable: boolean;
+  /** False only when the line carries a `[disabled]` flag. */
+  enabled: boolean;
+  /** True when the line carries a `[scrollable]` flag. */
+  scrollable: boolean;
+};
+
+/**
+ * Result of parsing one text snapshot emitted by the CLI.
+ */
+export type Snapshot = {
+  /** Remainder of the `Page:` header line, when one was seen. */
+  page?: string;
+  /** Remainder of the `App:` header line, when one was seen. */
+  app?: string;
+  /** Nodes parsed from `@eN [kind] ...` lines, in input order. */
+  nodes: SnapshotNode[];
+  /**
+   * Count announced by the `Snapshot: N` header line (0 when absent).
+   * Can differ from `nodes.length` when the parser dropped lines.
+   */
+  nodeCount: number;
+  /** The unmodified text that was parsed. */
+  raw: string;
+};
+
+/**
+ * Result of parsing the CLI's `appstate` output.
+ */
+export type AppState = {
+  /** First non-whitespace token after `Foreground app:`, when present. */
+  foregroundApp?: string;
+  /** First non-whitespace token after `Activity:`, when present. */
+  activity?: string;
+  /** The unmodified text that was parsed. */
+  raw: string;
+};
+
+/*
+ * Session name passed as `--session` on every `agent-device` call made
+ * by this module. Overridable through AGENT_DEVICE_SESSION; defaults
+ * to "ci".
+ */
+const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci";
+
+/*
+ * Bound every CLI invocation so a hung emulator can't wedge the smoke.
+ * 30s is generous for read-only commands (snapshot/screenshot/appstate).
+ * `fill` is special: typing a 30-char string into an editable on a
+ * 2-core ubuntu-latest under load was observed to exceed 30s (the
+ * CLI partial-typed and exited non-zero on timeout — visible at the
+ * device level via screenshot but the runner threw before recording
+ * the action). 90s gives ~3x headroom.
+ */
+const CLI_TIMEOUT_MS = 30_000;
+const CLI_FILL_TIMEOUT_MS = 90_000;
+
+/**
+ * Invoke `agent-device` synchronously and return its stdout as UTF-8.
+ *
+ * The call is bounded by CLI_FILL_TIMEOUT_MS when the first argument is
+ * `fill` and by CLI_TIMEOUT_MS otherwise; output is capped at 8 MiB.
+ * Whatever `execFileSync` throws propagates to the caller.
+ *
+ * @param args Argument vector handed to the CLI, subcommand first.
+ * @returns The CLI's stdout.
+ */
+function run(args: string[]): string {
+  const [subcommand] = args;
+  const isFill = subcommand === "fill";
+  const options = {
+    encoding: "utf8" as const,
+    timeout: isFill ? CLI_FILL_TIMEOUT_MS : CLI_TIMEOUT_MS,
+    maxBuffer: 8 * 1024 * 1024,
+  };
+  return execFileSync("agent-device", args, options);
+}
+
+/**
+ * Best-effort variant of `run`: never throws.
+ *
+ * @param args Argument vector handed to the CLI, subcommand first.
+ * @returns `{ ok: true, stdout }` on success; on failure `ok` is false,
+ *   `stdout` is empty and `error` holds the failure. A thrown value that
+ *   is not an `Error` is wrapped in one so `error.message` is always
+ *   safe to read.
+ */
+function tryRun(args: string[]): {
+  stdout: string;
+  ok: boolean;
+  error?: Error;
+} {
+  try {
+    return { stdout: run(args), ok: true };
+  } catch (e: unknown) {
+    /* Narrow instead of casting: a catch binding is `unknown`. */
+    const error = e instanceof Error ? e : new Error(String(e));
+    return { stdout: "", ok: false, error };
+  }
+}
+
+/**
+ * Parse a single snapshot line of the form:
+ *   `@e4 [text-field] "Phone or email," [editable]`
+ *   `@e5 [button] "Continue"`
+ *   `@e2 [scroll-area] [scrollable]`
+ *
+ * The `agent-device` CLI's text format isn't a stable contract, so this
+ * parser is deliberately permissive: anything that doesn't fit the shape
+ * is dropped (and counted in nodeCount via the header line, not by
+ * counting parsed children — so we don't quietly hide drift).
+ *
+ * Flags are looked up only in the text that FOLLOWS the quoted label, so
+ * a label such as `"Tap [disabled] rows"` cannot flip `enabled`.
+ * The `text` key is omitted entirely when the line has no label.
+ *
+ * @param line One already-trimmed line of snapshot output.
+ * @returns The parsed node, or null when the line does not start with
+ *   `@eN [kind]`.
+ */
+function parseNodeLine(line: string): SnapshotNode | null {
+  const refMatch = line.match(/^@(e\d+)\s+\[([a-z-]+)\]/);
+  if (!refMatch) {
+    return null;
+  }
+  const [, refIndex, kind] = refMatch;
+  const after = line.slice(refMatch[0].length).trim();
+
+  /* Optional double-quoted label; backslash escapes are kept verbatim. */
+  const textMatch = after.match(/^"((?:[^"\\]|\\.)*)"/);
+  /* Android appends a trailing comma inside the quotes; iOS does not. */
+  const text = textMatch ? textMatch[1].replace(/,$/, "") : undefined;
+
+  /* Everything after the label (or everything, when there is none). */
+  const flagSource = textMatch ? after.slice(textMatch[0].length) : after;
+  const flags = flagSource.toLowerCase();
+
+  return {
+    ref: `@${refIndex}`,
+    kind,
+    ...(text === undefined ? {} : { text }),
+    editable: flags.includes("[editable]"),
+    enabled: !flags.includes("[disabled]"),
+    scrollable: flags.includes("[scrollable]"),
+  };
+}
+
+/**
+ * Turn raw snapshot text into a `Snapshot`.
+ *
+ * Each line is classified in this order: `Page:` header, `App:` header,
+ * `Snapshot: N` count header, otherwise a candidate node line (trimmed
+ * before parsing). Lines that match nothing are ignored. When a header
+ * repeats, the last occurrence wins.
+ *
+ * @param raw Text as printed by the CLI.
+ * @returns Parsed headers and nodes, plus `raw` unchanged.
+ */
+export function parseSnapshot(raw: string): Snapshot {
+  const PAGE_PREFIX = "Page:";
+  const APP_PREFIX = "App:";
+
+  let page: string | undefined;
+  let app: string | undefined;
+  let nodeCount = 0;
+  const nodes: SnapshotNode[] = [];
+
+  raw.split("\n").forEach((line) => {
+    if (line.startsWith(PAGE_PREFIX)) {
+      page = line.slice(PAGE_PREFIX.length).trim();
+    } else if (line.startsWith(APP_PREFIX)) {
+      app = line.slice(APP_PREFIX.length).trim();
+    } else {
+      const announced = /^Snapshot:\s*(\d+)/.exec(line);
+      if (announced) {
+        nodeCount = Number(announced[1]);
+      } else {
+        const parsed = parseNodeLine(line.trim());
+        if (parsed) {
+          nodes.push(parsed);
+        }
+      }
+    }
+  });
+
+  return { page, app, nodes, nodeCount, raw };
+}
+
+/**
+ * Extract the foreground app and activity from `appstate` output.
+ *
+ * For each label the first non-whitespace token that follows it is
+ * taken; a label that does not occur yields `undefined`.
+ *
+ * @param raw Text as printed by the CLI.
+ * @returns The extracted fields plus `raw` unchanged.
+ */
+export function parseAppState(raw: string): AppState {
+  const tokenAfter = (label: RegExp): string | undefined =>
+    label.exec(raw)?.[1];
+
+  const foregroundApp = tokenAfter(/Foreground app:\s*(\S+)/);
+  const activity = tokenAfter(/Activity:\s*(\S+)/);
+  return { foregroundApp, activity, raw };
+}
+
+/* ---- public surface used by the runner ------------------------------- */
+
+/**
+ * Run `agent-device snapshot -i` for the configured session and parse
+ * the output. Throws whatever the CLI invocation throws.
+ */
+export function snapshot(): Snapshot {
+  const rawTree = run(["snapshot", "-i", "--session", SESSION]);
+  return parseSnapshot(rawTree);
+}
+
+/**
+ * Ask the CLI to write a screenshot to `path` and return that same path.
+ *
+ * Despite the name, no base64 encoding happens here: the CLI writes to
+ * disk and the runner reads + base64-encodes the file itself, which
+ * keeps this wrapper free of `fs`.
+ *
+ * @param path Destination file for the screenshot.
+ * @returns `path`, unchanged.
+ */
+export function screenshotBase64(path: string): string {
+  const argv = ["screenshot", path, "--session", SESSION];
+  run(argv);
+  return path;
+}
+
+/**
+ * Run `agent-device appstate` for the configured session and parse the
+ * output. Throws whatever the CLI invocation throws.
+ */
+export function appstate(): AppState {
+  const rawState = run(["appstate", "--session", SESSION]);
+  return parseAppState(rawState);
+}
+
+/**
+ * Type `text` into the element identified by `ref` via
+ * `agent-device fill`. Runs under the longer fill timeout and throws
+ * when the CLI invocation fails.
+ */
+export function fill(ref: string, text: string): void {
+  const argv = ["fill", ref, text, "--session", SESSION];
+  run(argv);
+}
+
+/**
+ * Press the element identified by `ref` via `agent-device press`.
+ * Throws when the CLI invocation fails.
+ */
+export function press(ref: string): void {
+  const argv = ["press", ref, "--session", SESSION];
+  run(argv);
+}
+
+/**
+ * Close the configured session, ignoring any failure.
+ *
+ * Safe to call repeatedly: the result of the CLI call is discarded, so
+ * closing when there is no session is a no-op.
+ */
+export function closeSession(): void {
+  const argv = ["close", "--session", SESSION];
+  tryRun(argv);
+}
+
+/**
+ * Send a key event to the device with `adb shell input keyevent`.
+ *
+ * @deprecated Prefer `platform.back()` / `platform.dismissKeyboard()`
+ * from `./agent-device-platform`. Kept exported because the skill-
+ * bundled `replay-only.ts` helper (on a separate branch) still
+ * imports it; the upstream-bound driver no longer calls this
+ * directly — keyevent dispatch is now platform-specific.
+ *
+ * @param keyEvent Numeric key code passed to `input keyevent`.
+ */
+export function adbKey(keyEvent: number): void {
+  const argv = ["shell", "input", "keyevent", `${keyEvent}`];
+  execFileSync("adb", argv, { encoding: "utf8", timeout: CLI_TIMEOUT_MS });
+}
+
+/**
+ * Collect the nodes whose label contains `needle`, ignoring case.
+ *
+ * Nodes without a label never match. Pure: works on the snapshot that
+ * is already in memory and does not touch the device.
+ *
+ * @param snap Parsed snapshot to search.
+ * @param needle Substring to look for.
+ * @returns Matching nodes in snapshot order.
+ */
+export function findInSnapshot(snap: Snapshot, needle: string): SnapshotNode[] {
+  const wanted = needle.toLowerCase();
+  const hits: SnapshotNode[] = [];
+  for (const candidate of snap.nodes) {
+    const label = candidate.text;
+    if (label != null && label.toLowerCase().includes(wanted)) {
+      hits.push(candidate);
+    }
+  }
+  return hits;
+}
diff --git a/.github/scripts/agent-device-expect.ts b/.github/scripts/agent-device-expect.ts
new file mode 100644
index 000000000000..b50b8d97de86
--- /dev/null
+++ b/.github/scripts/agent-device-expect.ts
@@ -0,0 +1,87 @@
+/*
+ * `expect:` DSL — machine-checked postcondition for each test step.
+ *
+ * Why a tiny DSL instead of letting the LLM self-report success:
+ * `step_complete(rationale)` is an LLM claim, not evidence. A canary
+ * that trusts an LLM's claim is a canary the LLM can lie to. The
+ * `expect:` clause is evaluated by deterministic TypeScript code
+ * against the post-state snapshot/appstate. The step fails red if
+ * `expect:` fails, regardless of what the LLM said.
+ *
+ * Grammar (intentionally small — extend only when a real test step
+ * can't be expressed):
+ *   snapshot.contains_text("...")
+ *   snapshot.field_with_text("...").exists
+ *   appstate.foreground == "..."
+ *
+ * String literal: double-quoted, backslash-escapable. No interpolation,
+ * no regex, no boolean ops. If a step needs more, write a second step.
+ */
+
+import type { AppState, Snapshot } from "./agent-device-cli";
+
+/**
+ * Outcome of evaluating one `expect:` clause: either a pass, or a
+ * failure carrying a human-readable reason.
+ */
+export type ExpectResult = { ok: true } | { ok: false; reason: string };
+
+/*
+ * Regex source for a double-quoted literal. Group 1 is the raw content
+ * between the quotes; backslash escapes are matched but not decoded.
+ */
+const STR = String.raw`"((?:[^"\\]|\\.)*)"`;
+
+/** True when `text` is set and contains `needleLower`, ignoring case. */
+function textIncludes(text: string | undefined, needleLower: string): boolean {
+  return text != null && text.toLowerCase().includes(needleLower);
+}
+
+/** Build the failing variant of `ExpectResult`. */
+function failed(reason: string): ExpectResult {
+  return { ok: false, reason };
+}
+
+/*
+ * One entry per grammar production. `re` is anchored at both ends and
+ * captures the string literal in group 1; `eval` decides the clause
+ * against the post-state snapshot / app state.
+ */
+const PATTERNS: Array<{
+  re: RegExp;
+  eval: (m: RegExpMatchArray, snap: Snapshot, app: AppState) => ExpectResult;
+}> = [
+  {
+    /* snapshot.contains_text("...") — any node label contains the text. */
+    re: new RegExp(`^snapshot\\.contains_text\\(${STR}\\)$`),
+    eval: (m, snap) => {
+      const literal = m[1];
+      const wanted = literal.toLowerCase();
+      if (snap.nodes.some((node) => textIncludes(node.text, wanted))) {
+        return { ok: true };
+      }
+      return failed(
+        `no node contains text ${JSON.stringify(literal)} (snapshot has ${snap.nodes.length} nodes)`,
+      );
+    },
+  },
+  {
+    /* snapshot.field_with_text("...").exists — editable nodes only. */
+    re: new RegExp(`^snapshot\\.field_with_text\\(${STR}\\)\\.exists$`),
+    eval: (m, snap) => {
+      const literal = m[1];
+      const wanted = literal.toLowerCase();
+      const found = snap.nodes.some(
+        (node) => node.editable && textIncludes(node.text, wanted),
+      );
+      if (found) {
+        return { ok: true };
+      }
+      return failed(
+        `no editable field contains text ${JSON.stringify(literal)}`,
+      );
+    },
+  },
+  {
+    /* appstate.foreground == "..." — exact, case-sensitive comparison. */
+    re: new RegExp(`^appstate\\.foreground\\s*==\\s*${STR}$`),
+    eval: (m, _snap, app) => {
+      const expected = m[1];
+      if (app.foregroundApp === expected) {
+        return { ok: true };
+      }
+      return failed(
+        `foreground app is ${app.foregroundApp ?? "(unknown)"}, expected ${expected}`,
+      );
+    },
+  },
+];
+
+/**
+ * Evaluate one `expect:` clause against the post-state.
+ *
+ * The clause is trimmed and tried against each grammar production in
+ * declaration order; the first production whose regex matches decides
+ * the result. A clause that matches no production fails with an
+ * "unrecognized" reason that quotes the clause as given (untrimmed).
+ *
+ * @param clause Text that followed `expect:` in the test case.
+ * @param snap Snapshot taken after the step's actions.
+ * @param app App state taken after the step's actions.
+ * @returns Pass, or fail with a reason.
+ */
+export function evaluateExpect(
+  clause: string,
+  snap: Snapshot,
+  app: AppState,
+): ExpectResult {
+  const normalized = clause.trim();
+  for (const { re, eval: decide } of PATTERNS) {
+    const groups = normalized.match(re);
+    if (groups !== null) {
+      return decide(groups, snap, app);
+    }
+  }
+  return { ok: false, reason: `unrecognized expect clause: ${clause}` };
+}
diff --git a/.github/scripts/agent-device-llm-client.ts b/.github/scripts/agent-device-llm-client.ts
new file mode 100644
index 000000000000..96d449d9b63f
--- /dev/null
+++ b/.github/scripts/agent-device-llm-client.ts
@@ -0,0 +1,261 @@
+/*
+ * Thin client for the Anthropic /v1/messages endpoint.
+ *
+ * Decisions baked in:
+ *   - Direct `fetch` instead of `@anthropic-ai/sdk` to avoid a new
+ *     dependency on a CI-only path. Node 20 has fetch built in.
+ *   - Prompt caching (`cache_control: {type: "ephemeral"}`) on the
+ *     system message and the last tool definition. The system + tool
+ *     surface is static across the run, so cache hit rate after step 1
+ *     is ~100%, cutting per-call cost by 5-10x. The 5-minute TTL fits
+ *     a single CI run with margin.
+ *   - Bounded exponential backoff with jitter for 429/500/502/503/529.
+ *     The runner's caller decides what to do on final failure (typically
+ *     fall back to a deterministic bash-style assertion); this client
+ *     never silently degrades.
+ *   - Token budget kill-switch: total input+output tokens accumulated
+ *     across the run; throw if exceeded. Bounds runaway spend if a
+ *     prompt or tool design accidentally explodes context.
+ */
+
+/**
+ * One tool definition sent in the request body's `tools` array.
+ */
+export type AnthropicTool = {
+  name: string;
+  description: string;
+  /** JSON-schema-like description of the tool's input object. */
+  input_schema: Record<string, unknown>;
+  /**
+   * Prompt-cache marker. Callers normally leave this unset: the client
+   * adds it to the last tool of every request.
+   */
+  cache_control?: { type: "ephemeral" };
+};
+
+/**
+ * One conversation turn sent in the request body's `messages` array.
+ * `content` is a list of typed blocks discriminated by `type`.
+ */
+export type AnthropicMessage = {
+  role: "user" | "assistant";
+  content: Array<
+    /* Plain text. */
+    | { type: "text"; text: string }
+    /* Inline PNG image, base64-encoded. */
+    | {
+        type: "image";
+        source: { type: "base64"; media_type: "image/png"; data: string };
+      }
+    /* A tool invocation; `id` is what a later tool_result refers to. */
+    | {
+        type: "tool_use";
+        id: string;
+        name: string;
+        input: Record<string, unknown>;
+      }
+    /* Outcome of the tool_use whose id equals `tool_use_id`. */
+    | {
+        type: "tool_result";
+        tool_use_id: string;
+        content: string;
+        is_error?: boolean;
+      }
+  >;
+};
+
+/**
+ * The subset of the /v1/messages response body this client relies on.
+ * The body is cast to this type without runtime validation.
+ */
+export type AnthropicResponse = {
+  id: string;
+  /* Known values are listed; `string` keeps the union open-ended. */
+  stop_reason:
+    | "end_turn"
+    | "tool_use"
+    | "max_tokens"
+    | "stop_sequence"
+    | string;
+  content: Array<
+    | { type: "text"; text: string }
+    | {
+        type: "tool_use";
+        id: string;
+        name: string;
+        input: Record<string, unknown>;
+      }
+  >;
+  /*
+   * Token counters for this single call. The two cache_* fields are
+   * optional here.
+   * NOTE(review): confirm against the API reference whether cached
+   * prompt tokens are included in or separate from `input_tokens`.
+   */
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_read_input_tokens?: number;
+    cache_creation_input_tokens?: number;
+  };
+};
+
+/**
+ * Constructor options for `AnthropicClient`.
+ */
+export type ClientOptions = {
+  /** Sent as the `x-api-key` request header. */
+  apiKey: string;
+  /** Placed in the request body's `model` field. */
+  model: string;
+  /** Ceiling on tokens accumulated across all calls of one client. */
+  tokenBudget: number;
+  /**
+   * Optional sink for trace entries (`request`, `response`, `retry`).
+   * The client only invokes the callback; where entries end up (e.g.
+   * artifacts/llm-trace.jsonl for post-mortem) is the caller's choice.
+   */
+  traceWriter?: (entry: Record<string, unknown>) => void;
+};
+
+/* Value of the `anthropic-version` request header. */
+const ANTHROPIC_VERSION = "2023-06-01";
+/*
+ * Base delay before each retry, in order. Three entries means at most
+ * three retries, i.e. four attempts in total; jitter is applied on top.
+ */
+const RETRY_DELAYS_MS = [1_000, 3_000, 9_000];
+/* HTTP statuses that are retried; any other failure is thrown at once. */
+const RETRYABLE_STATUS = new Set([429, 500, 502, 503, 529]);
+
+/**
+ * Thrown when the tokens accumulated by a client pass its budget.
+ * Never retried: the run is expected to stop.
+ */
+export class TokenBudgetExceededError extends Error {
+  /**
+   * @param used Tokens accumulated so far, including the call that
+   *   crossed the limit.
+   * @param budget The configured ceiling.
+   */
+  constructor(
+    public readonly used: number,
+    public readonly budget: number,
+  ) {
+    super(`token budget exceeded: ${used} > ${budget}`);
+    /* Without this, logs and stack traces show a bare "Error". */
+    this.name = "TokenBudgetExceededError";
+  }
+}
+
+/**
+ * Thrown when the API answers with a non-2xx status.
+ *
+ * `status` and the full response `body` are kept on the instance; the
+ * message only quotes the first 200 characters of the body.
+ */
+export class AnthropicCallFailedError extends Error {
+  constructor(
+    public readonly status: number,
+    public readonly body: string,
+  ) {
+    super(`Anthropic API failed with status ${status}: ${body.slice(0, 200)}`);
+    /* Without this, logs and stack traces show a bare "Error". */
+    this.name = "AnthropicCallFailedError";
+  }
+}
+
+/**
+ * Minimal /v1/messages client with prompt caching, bounded retries and
+ * a cumulative token budget.
+ *
+ * One instance is meant to serve a whole run: the token counter only
+ * grows, and `call` throws `TokenBudgetExceededError` as soon as the
+ * running total passes `opts.tokenBudget`.
+ */
+export class AnthropicClient {
+  private tokensUsed = 0;
+
+  constructor(private readonly opts: ClientOptions) {}
+
+  /** Tokens charged against the budget so far, cached ones included. */
+  getTokensUsed(): number {
+    return this.tokensUsed;
+  }
+
+  /**
+   * Send one request, retrying retryable HTTP failures.
+   *
+   * @param args.system System prompt; always sent with a cache marker.
+   * @param args.tools Tool definitions; the last one gets a cache marker.
+   * @param args.messages Conversation so far.
+   * @param args.maxTokens Value for `max_tokens`; defaults to 1024.
+   * @returns The parsed response of the first successful attempt.
+   * @throws TokenBudgetExceededError when accounting for the response
+   *   pushes the running total over the budget (never retried).
+   * @throws AnthropicCallFailedError for a non-retryable status, or for
+   *   a retryable one once all retries are used up.
+   * @throws Any other error raised by `fetch` or JSON decoding, at once.
+   */
+  async call(args: {
+    system: string;
+    tools: AnthropicTool[];
+    messages: AnthropicMessage[];
+    maxTokens?: number;
+  }): Promise<AnthropicResponse> {
+    /*
+     * Mark system + last tool as cacheable. Anthropic caches the
+     * contiguous prefix UP TO each `cache_control` marker, so two
+     * markers means "cache through end of system" and "cache
+     * through end of tools" as separate cached prefixes.
+     */
+    const cachedTools = args.tools.map((t, i) =>
+      i === args.tools.length - 1
+        ? { ...t, cache_control: { type: "ephemeral" as const } }
+        : t,
+    );
+
+    const body = {
+      model: this.opts.model,
+      max_tokens: args.maxTokens ?? 1024,
+      temperature: 0,
+      system: [
+        {
+          type: "text",
+          text: args.system,
+          cache_control: { type: "ephemeral" },
+        },
+      ],
+      tools: cachedTools,
+      messages: args.messages,
+    };
+
+    /*
+     * Verbose diagnostic mode: capture the full message thread + tool_use
+     * calls in the trace. Trade-off is artifact size and a small risk
+     * of leaking content the user typed; disabled unless DEBUG_LLM=1.
+     */
+    const verbose = (process.env.DEBUG_LLM ?? "") === "1";
+    if (verbose) {
+      const lastUser = args.messages
+        .slice()
+        .reverse()
+        .find((m) => m.role === "user");
+      const lastText = lastUser?.content.find(
+        (c): c is { type: "text"; text: string } => c.type === "text",
+      );
+      this.opts.traceWriter?.({
+        type: "request",
+        message_count: args.messages.length,
+        last_user_text: lastText?.text.slice(0, 1500) ?? null,
+        tool_uses_in_thread: args.messages.flatMap((m) =>
+          m.content
+            .filter(
+              (
+                c,
+              ): c is {
+                type: "tool_use";
+                id: string;
+                name: string;
+                input: Record<string, unknown>;
+              } => c.type === "tool_use",
+            )
+            .map((c) => ({ id: c.id, name: c.name, input: c.input })),
+        ),
+      });
+    }
+
+    /* One initial attempt plus one retry per RETRY_DELAYS_MS entry. */
+    let lastError: Error | undefined;
+    for (let attempt = 0; attempt <= RETRY_DELAYS_MS.length; attempt++) {
+      try {
+        const response = await this.callOnce(body);
+        /* May throw TokenBudgetExceededError; rethrown by the catch. */
+        this.accountForUsage(response.usage);
+        const baseEntry = {
+          type: "response",
+          attempt,
+          stop_reason: response.stop_reason,
+          usage: response.usage,
+        } as Record<string, unknown>;
+        if (verbose) {
+          baseEntry.tool_uses = response.content
+            .filter((c) => c.type === "tool_use")
+            .map((c) => ({
+              id: (c as { id: string }).id,
+              name: (c as { name: string }).name,
+              input: (c as { input: unknown }).input,
+            }));
+          baseEntry.text_preview = response.content
+            .filter((c) => c.type === "text")
+            .map((c) => (c as { text: string }).text.slice(0, 800));
+        }
+        this.opts.traceWriter?.(baseEntry);
+        return response;
+      } catch (e) {
+        lastError = e as Error;
+        if (e instanceof TokenBudgetExceededError) {
+          throw e;
+        }
+        const retryable =
+          e instanceof AnthropicCallFailedError &&
+          RETRYABLE_STATUS.has(e.status);
+        if (!retryable || attempt >= RETRY_DELAYS_MS.length) {
+          throw e;
+        }
+        /* Base delay with +/-30% jitter, never negative. */
+        const base = RETRY_DELAYS_MS[attempt];
+        const jitter = base * 0.3 * (Math.random() * 2 - 1);
+        const wait = Math.max(0, Math.round(base + jitter));
+        this.opts.traceWriter?.({
+          type: "retry",
+          attempt,
+          status: (e as AnthropicCallFailedError).status,
+          waitMs: wait,
+        });
+        await new Promise((r) => setTimeout(r, wait));
+      }
+    }
+    throw lastError ?? new Error("unreachable");
+  }
+
+  /**
+   * Perform a single POST to /v1/messages.
+   *
+   * @throws AnthropicCallFailedError carrying status and body text when
+   *   the response status is not 2xx.
+   */
+  private async callOnce(body: object): Promise<AnthropicResponse> {
+    const res = await fetch("https://api.anthropic.com/v1/messages", {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+        "x-api-key": this.opts.apiKey,
+        "anthropic-version": ANTHROPIC_VERSION,
+      },
+      body: JSON.stringify(body),
+    });
+    if (!res.ok) {
+      throw new AnthropicCallFailedError(res.status, await res.text());
+    }
+    /* Trusted cast: the body is not validated against the type. */
+    return (await res.json()) as AnthropicResponse;
+  }
+
+  /**
+   * Add one call's usage to the running total and enforce the budget.
+   *
+   * @throws TokenBudgetExceededError when the total passes the budget.
+   */
+  private accountForUsage(usage: AnthropicResponse["usage"]): void {
+    /*
+     * With prompt caching the API reports the cached part of the prompt
+     * separately: `input_tokens` covers only the uncached remainder,
+     * while cache hits and cache writes arrive in
+     * `cache_read_input_tokens` / `cache_creation_input_tokens`.
+     * Cache reads cost roughly 10% of normal input tokens, but for
+     * budget-protection purposes we count them at face value —
+     * budgets are about runaway prompt design, not pricing.
+     */
+    const cachedInput =
+      (usage.cache_read_input_tokens ?? 0) +
+      (usage.cache_creation_input_tokens ?? 0);
+    this.tokensUsed += usage.input_tokens + cachedInput + usage.output_tokens;
+    if (this.tokensUsed > this.opts.tokenBudget) {
+      throw new TokenBudgetExceededError(
+        this.tokensUsed,
+        this.opts.tokenBudget,
+      );
+    }
+  }
+}
diff --git a/.github/scripts/agent-device-llm-driver.ts b/.github/scripts/agent-device-llm-driver.ts
new file mode 100644
index 000000000000..be8598218f90
--- /dev/null
+++ b/.github/scripts/agent-device-llm-driver.ts
@@ -0,0 +1,1375 @@
+/*
+ * Phase-1 LLM-driven Android smoke runner.
+ *
+ * Lifecycle inside the workflow's emulator-runner `script:` block:
+ *
+ *   1. Boot dance (deterministic, NOT LLM-driven):
+ *      - close any stale agent-device session
+ *      - locate dev APK from android/app/build/outputs/...
+ *      - adb install
+ *      - adb reverse tcp:8081 tcp:8081  (Metro reachable from emulator)
+ *      - npm start &  (Metro background)
+ *      - poll /status until packager-status:running
+ *      - agent-device open --relaunch  (cold start)
+ *
+ *   2. Test-case execution:
+ *      - parse test case (numbered steps + optional `expect:` lines)
+ *      - per step: cache-first / LLM-fallback / bash-fallback ladder
+ *      - assert post-state via `expect:` evaluator
+ *      - write artifacts (screenshots, snapshots, llm-trace, cache-diff)
+ *
+ *   3. Cleanup (always — even on signal/error):
+ *      - dump logcat once
+ *      - close agent-device session (so re-runs aren't tripped by the
+ *        "session already bound" guard)
+ *      - kill background jobs (Metro)
+ *
+ * Why a TS runner instead of Python or Bash:
+ *   - The repo already runs ts-node in CI (precedent: createDocsRoutes.ts).
+ *   - Reusing the snapshot parser + signature + expect DSL across
+ *     replay / LLM / bash paths means one source of truth for what
+ *     "the SignIn screen is on screen" means — a divergence between
+ *     "what bash sees" and "what the LLM sees" would be a class of
+ *     bugs we don't want.
+ */
+
+import { execFileSync } from "child_process";
+import fs from "fs";
+import path from "path";
+import * as adCli from "./agent-device-cli";
+import type { Snapshot, AppState } from "./agent-device-cli";
+import {
+  snapshotSignature,
+  refToLocator,
+  locatorToRef,
+} from "./agent-device-snapshot-signature";
+import { evaluateExpect } from "./agent-device-expect";
+import * as cache from "./agent-device-replay-cache";
+import type { CachedAction, CacheV1 } from "./agent-device-replay-cache";
+import {
+  AnthropicClient,
+  TokenBudgetExceededError,
+  AnthropicCallFailedError,
+} from "./agent-device-llm-client";
+import type {
+  AnthropicTool,
+  AnthropicMessage,
+} from "./agent-device-llm-client";
+import {
+  detectPlatform,
+  startMetro,
+  locateBundle,
+  backgroundPids,
+} from "./agent-device-platform";
+import type { Platform } from "./agent-device-platform";
+
+/* ---- config ----------------------------------------------------------- */
+
+/* Model identifier sent to the API and recorded in the replay cache. */
+const MODEL = process.env.ANTHROPIC_MODEL ?? "claude-sonnet-4-6";
+
+/* Budget used when LLM_TOKEN_BUDGET is unset, empty or unusable. */
+const DEFAULT_TOKEN_BUDGET = 200_000;
+
+/**
+ * Turn the LLM_TOKEN_BUDGET environment value into a usable budget.
+ *
+ * A plain `Number(raw)` is not enough: an empty string (what CI yields
+ * for an unset variable) becomes 0, and garbage becomes NaN — and
+ * `used > NaN` is never true, which silently disables the kill-switch.
+ *
+ * @param raw The environment value, if any.
+ * @returns `raw` as a number when it is finite and positive, otherwise
+ *   DEFAULT_TOKEN_BUDGET (with a warning when `raw` was non-empty).
+ */
+function parseTokenBudget(raw: string | undefined): number {
+  if (raw === undefined || raw.trim() === "") {
+    return DEFAULT_TOKEN_BUDGET;
+  }
+  const parsed = Number(raw);
+  if (Number.isFinite(parsed) && parsed > 0) {
+    return parsed;
+  }
+  console.warn(
+    `::warning::ignoring invalid LLM_TOKEN_BUDGET=${JSON.stringify(raw)}; using ${DEFAULT_TOKEN_BUDGET}`,
+  );
+  return DEFAULT_TOKEN_BUDGET;
+}
+
+const TOKEN_BUDGET = parseTokenBudget(process.env.LLM_TOKEN_BUDGET);
+/* agent-device session name; same variable and default as the CLI wrapper. */
+const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci";
+/* Directory that receives every artifact written by the runner. */
+const ARTIFACTS_DIR = process.env.ARTIFACTS_DIR ?? "artifacts";
+/* Test case file: first CLI argument, else the Android sign-in case. */
+const TEST_CASE_PATH =
+  process.argv[2] ?? "tests/smoke/android-signin.testcase.txt";
+/*
+ * Location of the committed replay cache: LLM_CACHE_PATH when set,
+ * otherwise derived from the test case path.
+ */
+const CACHE_PATH =
+  process.env.LLM_CACHE_PATH ?? deriveCachePath(TEST_CASE_PATH);
+/* Platform adapter used by the boot dance (install, launch, dialogs). */
+const platform: Platform = detectPlatform();
+/* Upper bound for the Metro readiness polling loop. */
+const METRO_READY_TIMEOUT_MS = 120_000;
+/*
+ * 600s gives ~2× margin over Phase 0's observed 294s (warm AVD). The
+ * first run on a fresh AVD-cache key is closer to a cold boot since
+ * the prime+run happens in two separate emulator-runner invocations
+ * and the snapshot-load overhead lands inside this budget.
+ */
+const SIGNIN_LOAD_TIMEOUT_MS = 600_000;
+/* Minimum spacing between boot-probe snapshot dumps during that wait. */
+const BOOT_PROBE_INTERVAL_MS = 30_000;
+/*
+ * Per-step limits for the LLM loop.
+ * NOTE(review): the code that enforces the four limits below is outside
+ * this view; meanings are inferred from the names — confirm there.
+ */
+const STEP_WALL_CLOCK_BUDGET_MS = 60_000;
+const MAX_STATE_CHANGING_ACTIONS = 4;
+const SCREENSHOT_BUDGET_PER_RUN = 2;
+const TEXT_LENGTH_CAP = 200;
+/*
+ * DEBUG_LLM=1 makes both the LLM client (request/response bodies)
+ * and the runner (per-tool-dispatch entries) emit verbose entries to
+ * llm-trace.jsonl + stdout. Off by default to keep normal-run
+ * artifacts and CI stdout slim.
+ */
+const DEBUG_LLM = process.env.DEBUG_LLM === "1";
+
+/* ---- types ------------------------------------------------------------ */
+
+/**
+ * One numbered step parsed from the test case file.
+ */
+type Step = {
+  /** The number written before the dot, e.g. 3 for `3. Press ...`. */
+  number: number;
+  /** Everything after `N. ` on the step line. */
+  text: string;
+  /** Clause from the step's `expect:` line, or null when it has none. */
+  expect: string | null;
+  /** The step line plus any `expect:` lines, joined with newlines. */
+  raw: string;
+};
+
+/*
+ * Same shape as the `tool_result` content variant of the LLM client's
+ * message type: the outcome reported for the tool_use whose id equals
+ * `tool_use_id`.
+ */
+type ToolResultBlock = {
+  type: "tool_result";
+  tool_use_id: string;
+  content: string;
+  is_error?: boolean;
+};
+/*
+ * Content blocks this runner builds itself: text, inline base64 PNG, or
+ * a tool result.
+ * NOTE(review): presumably used for user-turn content in the LLM loop;
+ * the consuming code is outside this view — confirm there.
+ */
+type ContentBlock =
+  | { type: "text"; text: string }
+  | {
+      type: "image";
+      source: { type: "base64"; media_type: "image/png"; data: string };
+    }
+  | ToolResultBlock;
+
+/*
+ * A cacheable action plus the element ref it was executed against, when
+ * there was one. The extra field is removed again (via stripExecutedRef)
+ * before a step's actions are recorded.
+ */
+type ExecutedAction = CachedAction & { ref?: string };
+
+/* ---- entry point ------------------------------------------------------ */
+
+/**
+ * Run the whole smoke: parse the test case, boot the app, execute every
+ * step in order and write the cache artifacts.
+ *
+ * The first failing step ends the run through `fail`. On success the
+ * recorded cache and its diff against the committed cache are always
+ * written, and a warning is logged when the two differ.
+ */
+async function main(): Promise<void> {
+  fs.mkdirSync(ARTIFACTS_DIR, { recursive: true });
+  registerCleanup();
+
+  log(
+    `runner=${MODEL} test_case=${TEST_CASE_PATH} cache=${CACHE_PATH} budget=${TOKEN_BUDGET}`,
+  );
+
+  const source = fs.readFileSync(TEST_CASE_PATH, "utf8");
+  const testCaseHash = cache.hashText(source);
+  const steps = parseTestCase(source);
+  if (steps.length === 0) {
+    fail("test case has no steps");
+  }
+
+  const committed = cache.loadCache(CACHE_PATH, MODEL, testCaseHash);
+  const recorded: CacheV1 = {
+    version: 1,
+    model: MODEL,
+    testCaseHash,
+    steps: [],
+  };
+
+  await bootApp();
+
+  /* Every trace entry becomes one JSON line in llm-trace.jsonl. */
+  const tracePath = path.join(ARTIFACTS_DIR, "llm-trace.jsonl");
+  const appendTrace = (entry: Record<string, unknown>): void => {
+    fs.appendFileSync(tracePath, `${JSON.stringify(entry)}\n`);
+  };
+
+  /* Without an API key there is no client; steps then skip the LLM. */
+  let llm: AnthropicClient | null = null;
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (apiKey) {
+    llm = new AnthropicClient({
+      apiKey,
+      model: MODEL,
+      tokenBudget: TOKEN_BUDGET,
+      traceWriter: appendTrace,
+    });
+  } else {
+    log(
+      "::warning::ANTHROPIC_API_KEY missing — every step will use bash fallback",
+    );
+  }
+
+  const tally = { cacheHits: 0, llmRuns: 0, bashRuns: 0 };
+  const stats = {
+    onCacheHit: () => {
+      tally.cacheHits += 1;
+    },
+    onLLMRun: () => {
+      tally.llmRuns += 1;
+    },
+    onBashRun: () => {
+      tally.bashRuns += 1;
+    },
+  };
+
+  for (const step of steps) {
+    const outcome = await executeStep(step, {
+      committed,
+      testCaseHash,
+      llm,
+      recorded,
+      stats,
+    });
+    if (!outcome.ok) {
+      fail(`step ${step.number} failed: ${outcome.reason}`);
+    }
+  }
+
+  /*
+   * Always write the recorded cache diff, even if it's identical.
+   * Reviewers want to see a clean (no-op) diff to know the canary
+   * ran end-to-end without UI drift.
+   */
+  const diffText = cache.diff(committed, recorded);
+  const diffBody = diffText ? diffText : "(no drift — cache up to date)";
+  fs.writeFileSync(path.join(ARTIFACTS_DIR, "cache-diff.txt"), `${diffBody}\n`);
+  cache.writeCache(path.join(ARTIFACTS_DIR, "cache-recorded.json"), recorded);
+
+  const tokens = llm?.getTokensUsed() ?? 0;
+  log(
+    `::notice::smoke OK — cache_hits=${tally.cacheHits} llm_runs=${tally.llmRuns} bash_runs=${tally.bashRuns} tokens=${tokens}`,
+  );
+
+  if (diffText) {
+    log(
+      `::warning::cache drift detected — copy artifacts/cache-recorded.json to tests/smoke/cache/<${"test"}>.json and commit`,
+    );
+  }
+}
+
+/* ---- test case parser ------------------------------------------------- */
+
+/**
+ * Parse the test case text into its numbered steps.
+ *
+ * Rules, applied per line after stripping trailing whitespace:
+ *   - blank lines and lines whose first non-blank character is `#` are
+ *     skipped;
+ *   - `N. text` at column 0 starts a new step;
+ *   - `expect: clause` (optionally indented) attaches to the current
+ *     step, replacing any earlier clause, and is appended to its `raw`;
+ *   - an `expect:` line before the first step, and any other line, is
+ *     ignored.
+ *
+ * @param raw Contents of the test case file.
+ * @returns The steps in file order; empty when there are none.
+ */
+function parseTestCase(raw: string): Step[] {
+  const stepLine = /^(\d+)\.\s+(.*)$/;
+  const expectLine = /^\s*expect:\s*(.+)$/;
+  const parsed: Step[] = [];
+
+  for (const physical of raw.split("\n")) {
+    const line = physical.trimEnd();
+    const bare = line.trimStart();
+    if (bare === "" || bare.startsWith("#")) {
+      continue;
+    }
+
+    const asStep = stepLine.exec(line);
+    if (asStep !== null) {
+      parsed.push({
+        number: Number(asStep[1]),
+        text: asStep[2],
+        expect: null,
+        raw: line,
+      });
+      continue;
+    }
+
+    /* The step being built is always the most recently pushed one. */
+    const current = parsed[parsed.length - 1];
+    const asExpect = expectLine.exec(line);
+    if (asExpect !== null && current !== undefined) {
+      current.expect = asExpect[1];
+      current.raw = `${current.raw}\n${line}`;
+    }
+  }
+
+  return parsed;
+}
+
+/* ---- boot dance (matches Phase 0's bash) ------------------------------ */
+
+/**
+ * Deterministic boot sequence (no LLM involved): close any stale
+ * session, install the app bundle, set up networking, start Metro and
+ * wait for it, launch the app, then poll snapshots until a node whose
+ * text contains "phone or email" appears.
+ *
+ * Returns once that node is seen. When SIGNIN_LOAD_TIMEOUT_MS elapses
+ * first, timeout diagnostics are written to the artifacts directory and
+ * `fail` is called.
+ */
+async function bootApp(): Promise<void> {
+  log(`boot: platform=${platform.name}`);
+  log("boot: closing stale session");
+  adCli.closeSession();
+
+  log("boot: locating app bundle");
+  const bundle = locateBundle(platform);
+  /* NOTE(review): relies on fail() not returning, so bundle is set below. */
+  if (!bundle) {
+    fail(
+      `no app bundle (*${platform.appBundleSuffix}) found under ${platform.appBundleDir} — build step likely failed`,
+    );
+  }
+  log(`boot: installing ${bundle}`);
+  platform.install(bundle);
+
+  log("boot: setupNetworking");
+  platform.setupNetworking();
+
+  platform.preBootHardening();
+
+  log("boot: starting Metro");
+  startMetro(path.join(ARTIFACTS_DIR, "metro.log"));
+
+  await waitForMetro();
+
+  log("boot: agent-device open --relaunch");
+  platform.launch();
+
+  /*
+   * Bounded wait for the SignIn UI to hydrate. The LLM can technically
+   * poll for it itself in step 1, but on slow runners that would burn
+   * LLM budget on what's effectively boot-blocking emulator wait time.
+   * We dump a probe snapshot every 30s during the wait so post-mortem
+   * can see *what* the app was showing if the wait times out — the
+   * first run of this workflow had no such artifacts and the failure
+   * was undebuggable from the upload.
+   */
+  log("boot: waiting for SignIn UI");
+  const start = Date.now();
+  let probeIdx = 0;
+  /* 0 makes the very first unsuccessful poll write a probe at once. */
+  let lastProbeAt = 0;
+  while (Date.now() - start < SIGNIN_LOAD_TIMEOUT_MS) {
+    let snap;
+    try {
+      snap = adCli.snapshot();
+    } catch (e) {
+      /*
+       * Don't let a single transient snapshot timeout kill the wait —
+       * the emulator may be under heavy load and the next poll will
+       * probably succeed.
+       */
+      log(
+        `boot: snapshot threw (${(e as Error).message.slice(0, 80)}); retrying`,
+      );
+      await sleep(2_000);
+      continue;
+    }
+    /* Ready signal: any node label containing "phone or email". */
+    if (
+      snap.nodes.some((n) => n.text?.toLowerCase().includes("phone or email"))
+    ) {
+      log(
+        `boot: SignIn ready after ${Math.round((Date.now() - start) / 1000)}s`,
+      );
+      return;
+    }
+    /*
+     * Blocking-dialog recovery. Platform-specific detection +
+     * dismissal hides behind `tryDismissBlockingDialog`. Android:
+     * Pixel Launcher ANR dialog (Close app / Wait). iOS (PR B):
+     * system permission alerts. Either way, dismissed → force-
+     * relaunch the app so we don't poll against a half-initialised
+     * activity stuck behind the dismissed dialog.
+     */
+    if (platform.tryDismissBlockingDialog(snap)) {
+      log("boot: blocking dialog dismissed + app force-relaunched");
+      /* Give the process a moment to come back up before re-snapshotting. */
+      await sleep(3_000);
+      continue;
+    }
+    /* Rate-limited probe dump; file name carries index and elapsed time. */
+    if (Date.now() - lastProbeAt >= BOOT_PROBE_INTERVAL_MS) {
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      fs.writeFileSync(
+        path.join(
+          ARTIFACTS_DIR,
+          `boot-probe-${String(probeIdx).padStart(2, "0")}-t${elapsed}s.txt`,
+        ),
+        snap.raw,
+      );
+      probeIdx++;
+      lastProbeAt = Date.now();
+    }
+    await sleep(6_000);
+  }
+  /*
+   * Capture as much state as we can BEFORE failing so a re-run isn't
+   * required to debug. The cleanup trap will still write logcat after.
+   */
+  try {
+    const snap = adCli.snapshot();
+    fs.writeFileSync(
+      path.join(ARTIFACTS_DIR, "boot-timeout-snapshot.txt"),
+      snap.raw,
+    );
+    const app = adCli.appstate();
+    fs.writeFileSync(
+      path.join(ARTIFACTS_DIR, "boot-timeout-appstate.txt"),
+      app.raw,
+    );
+    adCli.screenshotBase64(path.join(ARTIFACTS_DIR, "boot-timeout.png"));
+  } catch (e) {
+    /* Diagnostics are best-effort: the first failure skips the rest. */
+    log(`boot: timeout-diagnostics capture failed: ${(e as Error).message}`);
+  }
+  fail(`SignIn UI not ready within ${SIGNIN_LOAD_TIMEOUT_MS / 1000}s`);
+}
+
+/**
+ * Poll Metro's `/status` endpoint every 2s until the response contains
+ * `packager-status:running`; call `fail` when that has not happened
+ * within METRO_READY_TIMEOUT_MS.
+ *
+ * Each probe is bounded on its own (curl `--max-time` plus a slightly
+ * larger process timeout). Without that, a connection that is accepted
+ * but never answered would block the synchronous call indefinitely and
+ * the overall deadline would never be checked again.
+ */
+async function waitForMetro(): Promise<void> {
+  const probeTimeoutSeconds = 5;
+  const start = Date.now();
+  while (Date.now() - start < METRO_READY_TIMEOUT_MS) {
+    try {
+      const out = execFileSync(
+        "curl",
+        [
+          "-sf",
+          "--max-time",
+          String(probeTimeoutSeconds),
+          "http://localhost:8081/status",
+        ],
+        {
+          encoding: "utf8",
+          /* Backstop in case curl itself wedges past --max-time. */
+          timeout: (probeTimeoutSeconds + 1) * 1_000,
+        },
+      );
+      if (out.includes("packager-status:running")) {
+        log(
+          `boot: Metro ready after ${Math.round((Date.now() - start) / 1000)}s`,
+        );
+        return;
+      }
+    } catch {
+      /* Metro not up yet (refused, HTTP error, or probe timed out). */
+    }
+    await sleep(2_000);
+  }
+  fail(
+    `Metro did not reach packager-status:running within ${METRO_READY_TIMEOUT_MS / 1000}s`,
+  );
+}
+
+/* ---- per-step orchestration ------------------------------------------- */
+
+/**
+ * Everything a single step execution needs from the surrounding run.
+ */
+type StepCtx = {
+  /** Committed replay cache that is consulted for a cache hit. */
+  committed: CacheV1;
+  /** Hash of the whole test case file. */
+  testCaseHash: string;
+  /** LLM client, or null when no API key is configured. */
+  llm: AnthropicClient | null;
+  /** Cache being recorded by this run; each passing step appends to it. */
+  recorded: CacheV1;
+  /** Counters notified when a step is satisfied by cache, LLM or bash. */
+  stats: {
+    onCacheHit: () => void;
+    onLLMRun: () => void;
+    onBashRun: () => void;
+  };
+};
+
+/**
+ * Execute a single test step and record what was done.
+ *
+ * Order of attempts:
+ *   1. Replay the cached actions when the committed cache has an entry
+ *      for this step number and pre-state signature, then verify the
+ *      post-state.
+ *   2. Otherwise (or when replay / verification fails) ask the LLM, if
+ *      a client is configured.
+ *   3. When no actions were produced, run the bash fallback.
+ *
+ * The pre- and post-snapshots are written to ARTIFACTS_DIR, and on
+ * success one entry is appended to `ctx.recorded.steps`.
+ *
+ * @param step The step to run.
+ * @param ctx  Shared run state (caches, optional LLM client, stats hooks).
+ * @returns `{ ok: true }` when the step passed, otherwise the failure reason.
+ */
+async function executeStep(
+  step: Step,
+  ctx: StepCtx,
+): Promise<{ ok: true } | { ok: false; reason: string }> {
+  const preSnap = adCli.snapshot();
+  const preSig = snapshotSignature(preSnap);
+  const stepKey = `step ${step.number}`;
+  log(`::group::${stepKey} — ${step.text}`);
+
+  /*
+   * try/finally guarantees the log group opened above is closed on
+   * every exit path — including the token-budget early return and any
+   * exception thrown by a helper — so later log output is never
+   * swallowed into this step's collapsed group.
+   */
+  try {
+    fs.writeFileSync(
+      path.join(ARTIFACTS_DIR, `step-${step.number}-pre.txt`),
+      preSnap.raw,
+    );
+
+    const cached = cache.lookup(ctx.committed, step.number, preSig);
+    if (cached) {
+      log(`${stepKey}: cache hit (pre_sig=${preSig})`);
+      const replay = await replayCachedActions(cached.actions);
+      if (replay.ok) {
+        const post = await verifyPostState(step, cached.postSignature);
+        if (post.ok) {
+          ctx.stats.onCacheHit();
+          /* Carry the committed entry forward unchanged. */
+          ctx.recorded.steps.push(cached);
+          fs.writeFileSync(
+            path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`),
+            post.snap.raw,
+          );
+          return { ok: true };
+        }
+        log(`${stepKey}: cache drift — ${post.reason}; falling through to LLM`);
+      } else {
+        log(
+          `${stepKey}: replay failed — ${replay.reason}; falling through to LLM`,
+        );
+      }
+    }
+
+    let actions: ExecutedAction[] = [];
+    if (ctx.llm) {
+      try {
+        const llmResult = await runLLMStep(step, ctx.llm);
+        if (!llmResult.ok) {
+          log(
+            `${stepKey}: LLM gave up — ${llmResult.reason}; trying bash fallback`,
+          );
+        } else {
+          ctx.stats.onLLMRun();
+          actions = llmResult.actions;
+        }
+      } catch (e) {
+        /* Budget exhaustion is final: do not mask it with the fallback. */
+        if (e instanceof TokenBudgetExceededError) {
+          return { ok: false, reason: e.message };
+        }
+        log(
+          `${stepKey}: LLM call failed (${(e as Error).message}); trying bash fallback`,
+        );
+      }
+    }
+
+    /*
+     * An empty action list covers: no LLM client, LLM failure, and an
+     * LLM success that recorded no replayable actions.
+     */
+    if (!actions.length) {
+      const bashResult = await runBashFallback(step);
+      if (!bashResult.ok) {
+        return { ok: false, reason: bashResult.reason };
+      }
+      ctx.stats.onBashRun();
+      actions = bashResult.actions;
+      /*
+       * Settle gap: agent-device fill returns once it has dispatched
+       * the typing command, but the on-device EditText needs a beat for
+       * React Native's onChange to fire and the accessibility tree to
+       * re-publish the new text. Without this, verifyPostState below
+       * takes a snapshot before the typed text has propagated and the
+       * expect predicate fails on what's transient lag, not a real
+       * problem.
+       */
+      await sleep(500);
+    }
+
+    /* Fresh run: there is no recorded post-signature to compare against. */
+    const post = await verifyPostState(step, null);
+    if (!post.ok) {
+      return { ok: false, reason: post.reason };
+    }
+    fs.writeFileSync(
+      path.join(ARTIFACTS_DIR, `step-${step.number}-post.txt`),
+      post.snap.raw,
+    );
+
+    ctx.recorded.steps.push({
+      stepNumber: step.number,
+      stepTextHash: cache.hashText(step.text),
+      preSignature: preSig,
+      postSignature: snapshotSignature(post.snap),
+      actions: actions.map(stripExecutedRef),
+      expect: step.expect,
+      recordedAt: new Date().toISOString(),
+      runId: process.env.GITHUB_RUN_ID ?? "local",
+    });
+    return { ok: true };
+  } finally {
+    log(`::endgroup::`);
+  }
+}
+
+/**
+ * Produce the cacheable form of an executed action: a shallow copy with
+ * the `ref` property (when present) left out and everything else kept.
+ */
+function stripExecutedRef(a: ExecutedAction): CachedAction {
+  const withOptionalRef = a as ExecutedAction & { ref?: string };
+  const { ref: _droppedRef, ...cacheable } = withOptionalRef;
+  return cacheable;
+}
+
+/**
+ * Check the live UI after a step's actions have run.
+ *
+ * Takes a fresh snapshot and appstate, then:
+ *   - when the step declares `expect`, that predicate decides pass/fail;
+ *     a mismatch against `expectedSignature` is only logged as a warning,
+ *     because the structural hash can drift on cosmetic re-renders,
+ *     animation timing, or node-ordering changes that don't affect what
+ *     the user actually sees;
+ *   - when there is no `expect`, a mismatch against `expectedSignature`
+ *     fails the step, so a cache-hit path still has some post-state check.
+ *
+ * @param step              Step whose optional `expect` is evaluated.
+ * @param expectedSignature Recorded post-state signature, or `null` when
+ *                          there is nothing to compare against.
+ * @returns The snapshot taken here on success, otherwise a reason.
+ */
+async function verifyPostState(
+  step: Step,
+  expectedSignature: string | null,
+): Promise<{ ok: true; snap: Snapshot } | { ok: false; reason: string }> {
+  const snap = adCli.snapshot();
+  const app = adCli.appstate();
+  const hasExpect = Boolean(step.expect);
+
+  if (step.expect) {
+    const verdict = evaluateExpect(step.expect, snap, app);
+    if (!verdict.ok) {
+      return { ok: false, reason: `expect failed: ${verdict.reason}` };
+    }
+  }
+
+  if (expectedSignature) {
+    const observed = snapshotSignature(snap);
+    if (observed !== expectedSignature) {
+      if (!hasExpect) {
+        return {
+          ok: false,
+          reason: `post-state signature drift (recorded ${expectedSignature}, observed ${observed})`,
+        };
+      }
+      /* Expect already passed above, so drift is advisory only. */
+      log(
+        `::warning::post-signature drift but expect passed (recorded ${expectedSignature}, observed ${observed}) — accepting`,
+      );
+    }
+  }
+
+  return { ok: true, snap };
+}
+
+/* ---- cache replay ----------------------------------------------------- */
+
+/**
+ * Replay cached actions in order, stopping at the first failure.
+ *
+ * @param actions Actions recorded by an earlier run.
+ * @returns `{ ok: true }` when every action dispatched successfully,
+ *          otherwise the first failing action's result.
+ */
+async function replayCachedActions(
+  actions: CachedAction[],
+): Promise<{ ok: true } | { ok: false; reason: string }> {
+  for (const cachedAction of actions) {
+    const outcome = await dispatchCachedAction(cachedAction);
+    if (outcome.ok) {
+      /*
+       * Short pause after each successful action: a fill immediately
+       * followed by a press can otherwise land the press before React
+       * has propagated the fill, even on warm runners.
+       */
+      await sleep(150);
+      continue;
+    }
+    return outcome;
+  }
+  return { ok: true };
+}
+
+/**
+ * Replay one cached action against the live device.
+ *
+ * `wait`, `wait_for`, `back` and `dismiss_keyboard` need no element.
+ * Everything else takes a fresh snapshot and resolves the cached
+ * locator to a ref in it before acting, since cached actions carry a
+ * locator rather than a ref.
+ *
+ * Errors thrown by the CLI / platform calls are converted into an
+ * `{ ok: false }` result, so the caller can report "replay failed" and
+ * fall through to the LLM instead of the exception crashing the runner.
+ *
+ * @param action The cached action to perform.
+ * @returns `{ ok: true }` on success, otherwise a reason string.
+ */
+async function dispatchCachedAction(
+  action: CachedAction,
+): Promise<{ ok: true } | { ok: false; reason: string }> {
+  try {
+    if (action.tool === "wait") {
+      await sleep(action.ms);
+      return { ok: true };
+    }
+    if (action.tool === "wait_for") {
+      return await runWaitFor(action.predicate, action.timeoutMs);
+    }
+    if (action.tool === "back") {
+      platform.back();
+      return { ok: true };
+    }
+    if (action.tool === "dismiss_keyboard") {
+      platform.dismissKeyboard();
+      return { ok: true };
+    }
+
+    /* Element-targeting tools: map the locator onto the current tree. */
+    const snap = adCli.snapshot();
+    const ref = locatorToRef(snap, action.locator);
+    if (!ref) {
+      return {
+        ok: false,
+        reason: `cached locator did not resolve: ${JSON.stringify(action.locator)}`,
+      };
+    }
+    if (action.tool === "fill") {
+      adCli.fill(ref, action.text);
+      return { ok: true };
+    }
+    if (action.tool === "press") {
+      adCli.press(ref);
+      return { ok: true };
+    }
+    return {
+      ok: false,
+      reason: `unknown cached tool: ${(action as { tool: string }).tool}`,
+    };
+  } catch (e) {
+    const detail = e instanceof Error ? e.message : String(e);
+    return {
+      ok: false,
+      reason: `cached ${(action as { tool: string }).tool} threw: ${detail}`,
+    };
+  }
+}
+
+/**
+ * Poll the device until `predicate` holds or `timeoutMs` has elapsed.
+ *
+ * Each round takes a fresh snapshot and appstate, evaluates the
+ * predicate against them, and sleeps 250 ms before the next round.
+ *
+ * @param predicate Expect-style predicate string passed to `evaluateExpect`.
+ * @param timeoutMs Polling budget in milliseconds.
+ * @returns `{ ok: true }` once the predicate is satisfied, otherwise a
+ *          timeout reason.
+ */
+async function runWaitFor(
+  predicate: string,
+  timeoutMs: number,
+): Promise<{ ok: true } | { ok: false; reason: string }> {
+  const startedAt = Date.now();
+  const withinBudget = (): boolean => Date.now() - startedAt < timeoutMs;
+
+  while (withinBudget()) {
+    const verdict = evaluateExpect(
+      predicate,
+      adCli.snapshot(),
+      adCli.appstate(),
+    );
+    if (verdict.ok) {
+      return { ok: true };
+    }
+    await sleep(250);
+  }
+
+  return {
+    ok: false,
+    reason: `wait_for timed out after ${timeoutMs}ms (predicate: ${predicate})`,
+  };
+}
+
+/* ---- LLM step --------------------------------------------------------- */
+
+/*
+ * System prompt sent with every `llm.call` in runLLMStep. It states the
+ * snapshot format and the rules the model is asked to follow; the tool
+ * names it mentions are the ones declared in TOOLS.
+ */
+const SYSTEM_PROMPT = [
+  "You are an autonomous mobile UI test runner driving the Expensify Android app via the agent-device CLI.",
+  "You receive: the current step description in plain English, an accessibility snapshot of the live UI, and a history of your tool calls within this step.",
+  "",
+  "Snapshot format: a JSON array of `{ref, kind, text, editable, enabled, scrollable}` nodes. Each ref is a stable handle for that node within this snapshot only — re-snapshot before reusing refs from a prior turn.",
+  "",
+  "Rules:",
+  "- Never invent a ref. Always pick refs from the most recent snapshot's `nodes` array.",
+  "- After any state-changing action (fill, press, back, dismiss_keyboard, wait), call snapshot to refresh before asserting.",
+  "- Use `assert` to prove a step succeeded — `step_complete` without an `assert` first is suspicious.",
+  "- Prefer `wait_for(predicate)` over `wait(ms)`. The bare wait is a last resort; the runner logs a warning each time it is used.",
+  "- Treat label text as advisory; it may be localized. Match by intent and element kind.",
+  "- If after 2-3 unique attempts you cannot make progress, call `step_failed` with a precise reason.",
+].join("\n");
+
+/*
+ * Tool declarations passed as `tools` on every `llm.call` in runLLMStep.
+ * Each `name` here has a matching `case` in dispatchTool; the numeric
+ * bounds in the schemas (wait ≤ 2000 ms, wait_for ≤ 10000 ms) are the
+ * same limits dispatchTool clamps to.
+ */
+const TOOLS: AnthropicTool[] = [
+  {
+    name: "snapshot",
+    description:
+      "Capture a fresh accessibility tree. Returns {nodes: [...], node_count: number}. Call this after any state-changing action and before using a ref from a previous turn.",
+    input_schema: {
+      type: "object",
+      properties: {},
+      additionalProperties: false,
+    },
+  },
+  {
+    /*
+     * NOTE(review): dispatchTool's `screenshot` case performs no budget
+     * check itself — confirm where the advertised rate limit is enforced.
+     */
+    name: "screenshot",
+    description:
+      "Capture a PNG screenshot. Rate-limited to 2 calls per run; the runner may auto-attach a screenshot when a snapshot returns 0 nodes. Use this only when the snapshot is genuinely empty or when you've addressed phantom refs twice.",
+    input_schema: {
+      type: "object",
+      properties: {},
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "find",
+    description:
+      "Search the most recent snapshot for nodes whose `text` contains the given substring (case-insensitive). Side-effect-free.",
+    input_schema: {
+      type: "object",
+      properties: { needle: { type: "string" } },
+      required: ["needle"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "fill",
+    description: "Type text into the editable text-field at the given ref.",
+    input_schema: {
+      type: "object",
+      properties: { ref: { type: "string" }, text: { type: "string" } },
+      required: ["ref", "text"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "press",
+    description: "Tap the pressable element at the given ref.",
+    input_schema: {
+      type: "object",
+      properties: { ref: { type: "string" } },
+      required: ["ref"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "wait_for",
+    description:
+      'Poll snapshots until `predicate` is satisfied or `timeout_ms` elapses. Predicates: snapshot.contains_text("..."), snapshot.field_with_text("...").exists, appstate.foreground == "...".',
+    input_schema: {
+      type: "object",
+      properties: {
+        predicate: { type: "string" },
+        timeout_ms: { type: "integer", maximum: 10_000 },
+      },
+      required: ["predicate"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "wait",
+    description:
+      "Sleep for the given number of milliseconds (max 2000). Last resort — prefer wait_for. The runner logs a warning each call.",
+    input_schema: {
+      type: "object",
+      properties: { ms: { type: "integer", minimum: 1, maximum: 2_000 } },
+      required: ["ms"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "back",
+    description:
+      "Press Android back. Use to recover from an unintended screen.",
+    input_schema: {
+      type: "object",
+      properties: {},
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "dismiss_keyboard",
+    description: "Dismiss the soft keyboard.",
+    input_schema: {
+      type: "object",
+      properties: {},
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "assert",
+    description:
+      "Verify a postcondition. Returns {ok: bool, reason?: string}. Predicates as in wait_for.",
+    input_schema: {
+      type: "object",
+      properties: { predicate: { type: "string" } },
+      required: ["predicate"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "appstate",
+    description: "Return {foreground_app, activity}.",
+    input_schema: {
+      type: "object",
+      properties: {},
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "step_complete",
+    description:
+      "Mark the current step as passed. Must include a brief rationale describing what was observed (mention the assert you ran).",
+    input_schema: {
+      type: "object",
+      properties: { rationale: { type: "string" } },
+      required: ["rationale"],
+      additionalProperties: false,
+    },
+  },
+  {
+    name: "step_failed",
+    description:
+      "Mark the current step as failed. Use when 2-3 unique attempts have not produced progress, or when the screen does not match what the step expects.",
+    input_schema: {
+      type: "object",
+      properties: { reason: { type: "string" } },
+      required: ["reason"],
+      additionalProperties: false,
+    },
+  },
+];
+
+/**
+ * Drive one step with the LLM in a tool-use loop.
+ *
+ * Each round sends the step text plus the current snapshot/appstate,
+ * executes every tool call the model returns through `dispatchTool`,
+ * and feeds the results back. The loop ends when the model calls
+ * `step_complete` / `step_failed`, returns no tool calls, or the
+ * wall-clock / state-changing-action budget runs out.
+ *
+ * Exceptions from `llm.call` are not caught here and propagate to the
+ * caller; exceptions from individual tools are reported back to the
+ * model as error tool results.
+ *
+ * @param step The step being attempted.
+ * @param llm  Client used for the model calls.
+ * @returns On success, the actions recorded by `dispatchTool` during
+ *          this step; otherwise the failure reason.
+ */
+async function runLLMStep(
+  step: Step,
+  llm: AnthropicClient,
+): Promise<
+  { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string }
+> {
+  const startedAt = Date.now();
+  let snap = adCli.snapshot();
+  let app = adCli.appstate();
+  /* Count of dispatched tool calls whose name is state-changing. */
+  let stateChanging = 0;
+  /* Consecutive fill/press calls that named a ref absent from the snapshot. */
+  let phantomStreak = 0;
+  let attachScreenshotNext = false;
+  let screenshotsUsed = 0;
+  /* Keys of (tool, input, UI signature) already attempted in this step. */
+  const seen = new Set<string>();
+  const messages: AnthropicMessage[] = [];
+  const executed: ExecutedAction[] = [];
+
+  while (
+    Date.now() - startedAt < STEP_WALL_CLOCK_BUDGET_MS &&
+    stateChanging <= MAX_STATE_CHANGING_ACTIONS
+  ) {
+    /* An empty tree gives the model nothing to act on: attach a screenshot. */
+    if (snap.nodeCount === 0 && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) {
+      attachScreenshotNext = true;
+    }
+
+    const userBlocks: ContentBlock[] = [];
+    if (attachScreenshotNext && screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN) {
+      const png = takeScreenshot(
+        `step-${step.number}-shot-${screenshotsUsed}.png`,
+      );
+      screenshotsUsed++;
+      attachScreenshotNext = false;
+      userBlocks.push({
+        type: "image",
+        source: { type: "base64", media_type: "image/png", data: png },
+      });
+    }
+    userBlocks.push({
+      type: "text",
+      text: buildUserText(step, snap, app, executed),
+    });
+    messages.push({ role: "user", content: userBlocks });
+
+    const response = await llm.call({
+      system: SYSTEM_PROMPT,
+      tools: TOOLS,
+      messages,
+    });
+    const assistantContent = response.content as AnthropicMessage["content"];
+    messages.push({ role: "assistant", content: assistantContent });
+
+    const toolUses = assistantContent.filter(
+      (
+        b,
+      ): b is Extract<
+        (typeof assistantContent)[number],
+        { type: "tool_use" }
+      > => b.type === "tool_use",
+    );
+    if (!toolUses.length) {
+      return { ok: false, reason: "LLM returned no tool calls" };
+    }
+
+    const toolResults: ToolResultBlock[] = [];
+    for (const tu of toolUses) {
+      /*
+       * Dedup key: same tool, same input, same UI signature. A repeat is
+       * answered with an error result instead of being executed again.
+       * This applies to every tool name, not only state-changing ones.
+       */
+      const sigKey = `${tu.name}:${JSON.stringify(tu.input)}:${snapshotSignature(snap)}`;
+      if (seen.has(sigKey)) {
+        toolResults.push({
+          type: "tool_result",
+          tool_use_id: tu.id,
+          content:
+            "You already performed this exact action against this exact UI state and it produced no observable change. Try a different approach or call step_failed.",
+          is_error: true,
+        });
+        continue;
+      }
+      seen.add(sigKey);
+
+      try {
+        const out = await dispatchTool(tu.name, tu.input, {
+          snap,
+          app,
+          onSnap: (s) => {
+            snap = s;
+          },
+          onApp: (a) => {
+            app = a;
+          },
+          executed,
+          stepNumber: step.number,
+          onPhantom: () => {
+            phantomStreak++;
+            /* Two phantom refs in a row: attach a screenshot next round. */
+            if (
+              phantomStreak >= 2 &&
+              screenshotsUsed < SCREENSHOT_BUDGET_PER_RUN
+            ) {
+              attachScreenshotNext = true;
+            }
+          },
+          resetPhantom: () => {
+            phantomStreak = 0;
+          },
+        });
+        /*
+         * Counted whenever dispatchTool returns, including error results
+         * such as a phantom ref; not counted when it throws.
+         */
+        if (isStateChangingTool(tu.name)) {
+          stateChanging++;
+        }
+        if (out.terminal === "complete") {
+          return { ok: true, actions: executed };
+        }
+        if (out.terminal === "failed") {
+          return {
+            ok: false,
+            reason: out.reason ?? "step_failed without reason",
+          };
+        }
+        toolResults.push({
+          type: "tool_result",
+          tool_use_id: tu.id,
+          content: out.content,
+          is_error: out.isError,
+        });
+      } catch (e) {
+        /* Report the tool failure to the model rather than aborting the step. */
+        toolResults.push({
+          type: "tool_result",
+          tool_use_id: tu.id,
+          content: `tool error: ${(e as Error).message}`,
+          is_error: true,
+        });
+      }
+    }
+
+    messages.push({ role: "user", content: toolResults });
+
+    /*
+     * Refresh snap + appstate after every batch of tool calls that
+     * included a fill, press, or wait. Without this the LLM keeps
+     * seeing the pre-step snapshot even after its fill/press took
+     * effect, so identical fills get caught by the seen-hash dedup and
+     * the LLM burns its budget retrying actions it already performed.
+     * dispatchTool's snapshot/wait_for/back/dismiss_keyboard cases
+     * already refresh the snapshot through onSnap; fill, press, and
+     * wait do not.
+     */
+    if (
+      toolUses.some(
+        (tu) => tu.name === "fill" || tu.name === "press" || tu.name === "wait",
+      )
+    ) {
+      try {
+        snap = adCli.snapshot();
+        app = adCli.appstate();
+      } catch (e) {
+        /* Transient — next loop iteration will retry implicitly. */
+        log(
+          `runLLMStep: post-action snap refresh threw (${(e as Error).message.slice(0, 80)}); continuing with stale snap`,
+        );
+      }
+    }
+  }
+
+  return {
+    ok: false,
+    reason: "wall-clock or distinct-action budget exhausted",
+  };
+}
+
+/**
+ * Report whether a tool name counts against the per-step budget of
+ * state-changing actions.
+ *
+ * @param name Tool name as returned by the LLM.
+ * @returns `true` for fill, press, back, dismiss_keyboard, wait and
+ *          wait_for; `false` for anything else.
+ */
+function isStateChangingTool(name: string): boolean {
+  switch (name) {
+    case "fill":
+    case "press":
+    case "back":
+    case "dismiss_keyboard":
+    case "wait":
+    case "wait_for":
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Assemble the text block sent to the LLM at the start of each round.
+ *
+ * Lines, in order: the step number and text; the runner's postcondition
+ * (only when the step declares `expect`); the last three recorded
+ * actions (only when there are any); foreground app and activity; the
+ * node count; and the scrubbed node list as compact JSON.
+ *
+ * @param step    Step being attempted.
+ * @param snap    Snapshot to describe.
+ * @param app     App state to describe.
+ * @param history Actions recorded so far in this step.
+ * @returns The newline-joined prompt text.
+ */
+function buildUserText(
+  step: Step,
+  snap: Snapshot,
+  app: AppState,
+  history: ExecutedAction[],
+): string {
+  const expectLines = step.expect
+    ? [
+        `Postcondition the runner will check (NOT for you to call directly): ${step.expect}`,
+      ]
+    : [];
+
+  const historyLines = history.length
+    ? [
+        `Recent actions you took: ${history
+          .slice(-3)
+          .map((h) => describeExecutedAction(h))
+          .join("; ")}`,
+      ]
+    : [];
+
+  const foreground = app.foregroundApp ?? "(unknown)";
+  const activity = app.activity ?? "(unknown)";
+
+  return [
+    `Current step: ${step.number}. ${step.text}`,
+    ...expectLines,
+    ...historyLines,
+    `appstate.foreground=${foreground} activity=${activity}`,
+    `snapshot.node_count=${snap.nodeCount}`,
+    "snapshot.nodes:",
+    JSON.stringify(snap.nodes.map(scrubNodeForPrompt), null, 0),
+  ].join("\n");
+}
+
+/**
+ * Project a snapshot node onto the fields shown to the LLM.
+ *
+ * Non-empty `text` is passed through `sanitizeText` and cut to
+ * TEXT_LENGTH_CAP characters; missing or empty text becomes `undefined`.
+ */
+function scrubNodeForPrompt(
+  n: Snapshot["nodes"][number],
+): Record<string, unknown> {
+  const { ref, kind, editable, enabled, scrollable } = n;
+
+  let text: string | undefined;
+  if (n.text) {
+    text = sanitizeText(n.text).slice(0, TEXT_LENGTH_CAP);
+  }
+
+  return { ref, kind, text, editable, enabled, scrollable };
+}
+
+/**
+ * Remove ASCII control characters below U+0020 from a string, keeping
+ * tab (U+0009) and line feed (U+000A). All other characters, including
+ * DEL and non-ASCII text, pass through unchanged.
+ */
+function sanitizeText(s: string): string {
+  /* U+0000–U+0008 and U+000B–U+001F: every C0 control except \t and \n. */
+  return s.replace(/[\u0000-\u0008\u000b-\u001f]/g, "");
+}
+
+/**
+ * Render a recorded action as a short string for the "Recent actions
+ * you took" line of the LLM prompt.
+ *
+ * Fill text is previewed at up to 30 characters. The ellipsis is added
+ * only when the text was actually cut, so the model is not told that a
+ * short value had more characters than it did.
+ *
+ * @param a The recorded action.
+ * @returns A one-line description; tools without arguments render as
+ *          just their name.
+ */
+function describeExecutedAction(a: ExecutedAction): string {
+  switch (a.tool) {
+    case "fill": {
+      const preview =
+        a.text.length > 30 ? `${a.text.slice(0, 30)}…` : a.text;
+      return `fill(${JSON.stringify(a.locator)}, "${preview}")`;
+    }
+    case "press":
+      return `press(${JSON.stringify(a.locator)})`;
+    case "wait_for":
+      return `wait_for(${a.predicate}, ${a.timeoutMs}ms)`;
+    case "wait":
+      return `wait(${a.ms}ms)`;
+    default:
+      return a.tool;
+  }
+}
+
+/* ---- LLM tool dispatch ------------------------------------------------ */
+
+/**
+ * Per-call context that runLLMStep hands to `dispatchTool`.
+ */
+type DispatchCtx = {
+  /** Snapshot current at dispatch time; refs are validated against it. */
+  snap: Snapshot;
+  /** App state current at dispatch time; used when evaluating `assert`. */
+  app: AppState;
+  /** Publishes a freshly taken snapshot back to the caller. */
+  onSnap: (s: Snapshot) => void;
+  /** Publishes a freshly read app state back to the caller. */
+  onApp: (a: AppState) => void;
+  /** Running list of recorded actions; dispatchTool appends to it. */
+  executed: ExecutedAction[];
+  /** Step number, used to name the screenshot artifact. */
+  stepNumber: number;
+  /** Called when a fill/press names a ref that is not in `snap`. */
+  onPhantom: () => void;
+  /** Called when a fill/press names a ref that is in `snap`. */
+  resetPhantom: () => void;
+};
+
+/**
+ * Outcome of one `dispatchTool` call.
+ */
+type DispatchResult = {
+  /** Text returned to the LLM as the tool result. */
+  content: string;
+  /** When true, the tool result is flagged as an error. */
+  isError?: boolean;
+  /** Set by `step_complete` / `step_failed` to end the step loop. */
+  terminal?: "complete" | "failed";
+  /** Failure reason accompanying `terminal: "failed"`. */
+  reason?: string;
+};
+
+/**
+ * Coerce an LLM-supplied numeric tool argument into `[min, max]`.
+ * A missing value or one that does not parse to a number (NaN) is
+ * replaced by `fallback` before clamping, so NaN can never reach
+ * `sleep`, `runWaitFor`, or the recorded action list.
+ */
+function clampToolNumber(
+  value: unknown,
+  fallback: number,
+  min: number,
+  max: number,
+): number {
+  const parsed = Number(value ?? fallback);
+  const usable = Number.isNaN(parsed) ? fallback : parsed;
+  return Math.min(max, Math.max(min, usable));
+}
+
+/**
+ * Execute one tool call requested by the LLM.
+ *
+ * Read-only tools (`snapshot`, `screenshot`, `find`, `assert`,
+ * `appstate`) only report state. Acting tools (`fill`, `press`, `wait`,
+ * `wait_for`, `back`, `dismiss_keyboard`) also append to `ctx.executed`
+ * so the step can be cached; `fill`/`press` are recorded only when the
+ * ref can be converted to a locator. `step_complete` / `step_failed`
+ * return a terminal result.
+ *
+ * @param name  Tool name from the LLM's tool_use block.
+ * @param input Tool arguments as sent by the LLM (unvalidated).
+ * @param ctx   Current snapshot/appstate plus callbacks to update them.
+ * @returns Tool result content, error flag, and optional terminal marker.
+ * @throws Re-throws any error raised by `adCli.fill` / `adCli.press`;
+ *         other CLI/platform errors propagate unchanged as well.
+ */
+async function dispatchTool(
+  name: string,
+  input: Record<string, unknown>,
+  ctx: DispatchCtx,
+): Promise<DispatchResult> {
+  switch (name) {
+    case "snapshot": {
+      const s = adCli.snapshot();
+      ctx.onSnap(s);
+      return {
+        content: JSON.stringify({
+          node_count: s.nodeCount,
+          nodes: s.nodes.map(scrubNodeForPrompt),
+        }),
+      };
+    }
+    case "screenshot": {
+      /* Only the file name and size are reported back, not the image. */
+      const file = `step-${ctx.stepNumber}-llm-shot.png`;
+      const data = takeScreenshot(file);
+      return {
+        content: `screenshot saved at ${file} (${data.length} bytes base64). Re-snapshot to keep working with refs.`,
+      };
+    }
+    case "find": {
+      const needle = String(input.needle ?? "");
+      const matches = adCli.findInSnapshot(ctx.snap, needle).map((n) => ({
+        ref: n.ref,
+        kind: n.kind,
+        text: n.text,
+        editable: n.editable,
+      }));
+      return { content: JSON.stringify({ matches, count: matches.length }) };
+    }
+    case "fill": {
+      const ref = String(input.ref ?? "");
+      const text = String(input.text ?? "");
+      /* Refuse refs that are not in the snapshot the model was shown. */
+      const node = ctx.snap.nodes.find((n) => n.ref === ref);
+      if (!node) {
+        ctx.onPhantom();
+        if (DEBUG_LLM) {
+          log(
+            `::debug::dispatch.fill phantom ref=${ref} text="${text.slice(0, 30)}…"`,
+          );
+        }
+        return {
+          content: `phantom ref ${ref} not in current snapshot`,
+          isError: true,
+        };
+      }
+      ctx.resetPhantom();
+      try {
+        adCli.fill(ref, text);
+      } catch (e) {
+        if (DEBUG_LLM) {
+          log(
+            `::debug::dispatch.fill THREW ref=${ref} text="${text.slice(0, 30)}…" err=${(e as Error).message.slice(0, 100)}`,
+          );
+        }
+        throw e;
+      }
+      /* Record by locator, resolved against the pre-action snapshot. */
+      const loc = refToLocator(ctx.snap, ref);
+      if (DEBUG_LLM) {
+        log(
+          `::debug::dispatch.fill ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} text="${text.slice(0, 30)}…" executed_len_after=${ctx.executed.length + (loc ? 1 : 0)}`,
+        );
+      }
+      if (loc) {
+        ctx.executed.push({ tool: "fill", locator: loc, text, ref });
+      }
+      return { content: `filled ${ref}` };
+    }
+    case "press": {
+      const ref = String(input.ref ?? "");
+      const node = ctx.snap.nodes.find((n) => n.ref === ref);
+      if (!node) {
+        ctx.onPhantom();
+        if (DEBUG_LLM) {
+          log(`::debug::dispatch.press phantom ref=${ref}`);
+        }
+        return {
+          content: `phantom ref ${ref} not in current snapshot`,
+          isError: true,
+        };
+      }
+      ctx.resetPhantom();
+      try {
+        adCli.press(ref);
+      } catch (e) {
+        if (DEBUG_LLM) {
+          log(
+            `::debug::dispatch.press THREW ref=${ref} err=${(e as Error).message.slice(0, 100)}`,
+          );
+        }
+        throw e;
+      }
+      const loc = refToLocator(ctx.snap, ref);
+      if (DEBUG_LLM) {
+        log(
+          `::debug::dispatch.press ok ref=${ref} kind=${node.kind} loc=${JSON.stringify(loc)} executed_len_after=${ctx.executed.length + (loc ? 1 : 0)}`,
+        );
+      }
+      if (loc) {
+        ctx.executed.push({ tool: "press", locator: loc, ref });
+      }
+      return { content: `pressed ${ref}` };
+    }
+    case "wait": {
+      /* Missing or unparseable `ms` falls back to 0, clamped up to 1. */
+      const ms = clampToolNumber(input.ms, 0, 1, 2_000);
+      log(`::warning::LLM used wait(${ms}) — prefer wait_for`);
+      await sleep(ms);
+      ctx.executed.push({ tool: "wait", ms });
+      return { content: `slept ${ms}ms` };
+    }
+    case "wait_for": {
+      const predicate = String(input.predicate ?? "");
+      /* Default 5 s; bounded to 250 ms – 10 s. */
+      const timeoutMs = clampToolNumber(input.timeout_ms, 5_000, 250, 10_000);
+      const r = await runWaitFor(predicate, timeoutMs);
+      /* Recorded whether or not the predicate was satisfied. */
+      ctx.executed.push({ tool: "wait_for", predicate, timeoutMs });
+      ctx.onSnap(adCli.snapshot());
+      ctx.onApp(adCli.appstate());
+      return {
+        content: r.ok
+          ? "predicate satisfied"
+          : `wait_for timed out: ${r.reason}`,
+        isError: !r.ok,
+      };
+    }
+    case "back":
+      platform.back();
+      ctx.executed.push({ tool: "back" });
+      /* NOTE(review): only the snapshot is refreshed here, not appstate. */
+      ctx.onSnap(adCli.snapshot());
+      return { content: "back pressed" };
+    case "dismiss_keyboard":
+      platform.dismissKeyboard();
+      ctx.executed.push({ tool: "dismiss_keyboard" });
+      ctx.onSnap(adCli.snapshot());
+      return { content: "keyboard dismissed" };
+    case "assert": {
+      /* Evaluated against the snapshot/appstate passed in, not a fresh one. */
+      const predicate = String(input.predicate ?? "");
+      const ev = evaluateExpect(predicate, ctx.snap, ctx.app);
+      return { content: JSON.stringify(ev), isError: !ev.ok };
+    }
+    case "appstate": {
+      const a = adCli.appstate();
+      ctx.onApp(a);
+      return { content: JSON.stringify(a) };
+    }
+    case "step_complete":
+      return { content: "step accepted by runner", terminal: "complete" };
+    case "step_failed":
+      return {
+        content: "step rejected by LLM",
+        terminal: "failed",
+        reason: String(input.reason ?? "no reason given"),
+      };
+    default:
+      return { content: `unknown tool: ${name}`, isError: true };
+  }
+}
+
+/**
+ * Capture a screenshot into ARTIFACTS_DIR and return it base64-encoded.
+ *
+ * @param filename File name (not a path) for the artifact.
+ * @returns Base64 encoding of the file read back from that location.
+ */
+function takeScreenshot(filename: string): string {
+  const target = path.join(ARTIFACTS_DIR, filename);
+  adCli.screenshotBase64(target);
+  const bytes = fs.readFileSync(target);
+  return bytes.toString("base64");
+}
+
+/* ---- bash fallback ---------------------------------------------------- */
+
+/*
+ * Mirrors Phase 0's bash logic for the SignIn flow. Used when:
+ *   - ANTHROPIC_API_KEY is missing
+ *   - The Anthropic API exhausts retries with HTTP errors
+ *   - The LLM gives up via step_failed (rare; mostly defensive)
+ *   - The LLM reports success but recorded no replayable actions
+ *     (executeStep falls back whenever the action list is empty)
+ *
+ * Only the SignIn-flow steps are covered. Adding a new test case
+ * without LLM access requires extending this map. That's intentional:
+ * the bash fallback is a safety net for known flows, not a generic
+ * drop-in for the LLM.
+ */
+
+/**
+ * Scripted recipes for the SignIn flow, selected by keywords in the
+ * lower-cased step text. Recipes are tried in this order:
+ *   1. "wait" + "signin"    → pass immediately with no actions.
+ *   2. "enter" + "email"    → fill the first double-quoted value from
+ *                             the step text into an editable field.
+ *   3. "press" + "continue" → press a button whose text contains
+ *                             "continue".
+ *   4. "magic"              → poll up to 60 s for text containing
+ *                             "magic code".
+ *
+ * @param step The step to run.
+ * @returns The recorded actions on success (empty when the target could
+ *          not be converted to a locator), otherwise a reason.
+ */
+async function runBashFallback(
+  step: Step,
+): Promise<
+  { ok: true; actions: ExecutedAction[] } | { ok: false; reason: string }
+> {
+  const lowered = step.text.toLowerCase();
+  const mentions = (...words: string[]): boolean =>
+    words.every((w) => lowered.includes(w));
+
+  /* Boot dance already gated on this; an instant pass is fine. */
+  if (mentions("wait", "signin")) {
+    return { ok: true, actions: [] };
+  }
+
+  if (mentions("enter", "email")) {
+    /* The value to type is the first "double-quoted" run in the step text. */
+    const quoted = /"([^"]+)"/.exec(step.text);
+    if (quoted === null) {
+      return {
+        ok: false,
+        reason: "bash fallback could not extract email from step text",
+      };
+    }
+    const tree = adCli.snapshot();
+    const target = tree.nodes.find((n) => {
+      if (!n.editable) {
+        return false;
+      }
+      return (
+        n.kind === "text-field" ||
+        (n.text?.toLowerCase().includes("phone") ?? false)
+      );
+    });
+    if (target === undefined) {
+      return {
+        ok: false,
+        reason: "bash fallback: no editable text-field for email entry",
+      };
+    }
+    adCli.fill(target.ref, quoted[1]);
+    const locator = refToLocator(tree, target.ref);
+    return {
+      ok: true,
+      actions: locator
+        ? [{ tool: "fill", locator: locator, text: quoted[1] }]
+        : [],
+    };
+  }
+
+  if (mentions("press", "continue")) {
+    const tree = adCli.snapshot();
+    const button = tree.nodes.find(
+      (n) => n.kind === "button" && n.text?.toLowerCase().includes("continue"),
+    );
+    if (button === undefined) {
+      return { ok: false, reason: "bash fallback: no Continue button found" };
+    }
+    adCli.press(button.ref);
+    const locator = refToLocator(tree, button.ref);
+    return {
+      ok: true,
+      actions: locator ? [{ tool: "press", locator: locator }] : [],
+    };
+  }
+
+  if (mentions("magic")) {
+    const pollingSince = Date.now();
+    while (Date.now() - pollingSince < 60_000) {
+      const tree = adCli.snapshot();
+      const onMagicCodeScreen = tree.nodes.some((n) =>
+        n.text?.toLowerCase().includes("magic code"),
+      );
+      if (onMagicCodeScreen) {
+        /* Recorded as a wait_for so a cached replay waits the same way. */
+        return {
+          ok: true,
+          actions: [
+            {
+              tool: "wait_for",
+              predicate: 'snapshot.contains_text("Magic code")',
+              timeoutMs: 60_000,
+            },
+          ],
+        };
+      }
+      await sleep(2_000);
+    }
+    return {
+      ok: false,
+      reason: "bash fallback: magic-code screen never appeared",
+    };
+  }
+
+  return {
+    ok: false,
+    reason: `bash fallback has no recipe for step text: ${step.text}`,
+  };
+}
+
+/* ---- cleanup ---------------------------------------------------------- */
+
+/* Guards the cleanup handler so it runs at most once per process. */
+let cleanedUp = false;
+
+/**
+ * Install process-level cleanup for normal exit, SIGINT and SIGTERM.
+ *
+ * The handler dumps device logs to `logcat.txt` in ARTIFACTS_DIR, closes
+ * the agent-device session, and sends SIGTERM to the process group of
+ * every entry in `backgroundPids`. Each stage is best-effort: a failure
+ * in one is logged and does not prevent the later stages, so a broken
+ * log dump cannot leave background processes running.
+ *
+ * SIGINT exits with code 130 and SIGTERM with 143 after cleanup.
+ */
+function registerCleanup(): void {
+  const bestEffort = (what: string, stage: () => void): void => {
+    try {
+      stage();
+    } catch (e) {
+      const detail = e instanceof Error ? e.message : String(e);
+      log(`cleanup: ${what} failed (${detail}); continuing`);
+    }
+  };
+
+  const handler = (): void => {
+    if (cleanedUp) {
+      return;
+    }
+    cleanedUp = true;
+    bestEffort("log dump", () => {
+      platform.dumpLogsToFile(path.join(ARTIFACTS_DIR, "logcat.txt"));
+    });
+    bestEffort("session close", () => {
+      adCli.closeSession();
+    });
+    for (const pid of backgroundPids) {
+      try {
+        /* Negative pid targets the whole process group. */
+        process.kill(-pid, "SIGTERM");
+      } catch {
+        /* already gone */
+      }
+    }
+  };
+  process.on("exit", handler);
+  process.on("SIGINT", () => {
+    handler();
+    process.exit(130);
+  });
+  process.on("SIGTERM", () => {
+    handler();
+    process.exit(143);
+  });
+}
+
+/* ---- helpers ---------------------------------------------------------- */
+
+/**
+ * Map a test-case file path to its cache file path.
+ *
+ * @param testCasePath Path to the test-case file.
+ * @returns `tests/smoke/cache/<name>.json`, where `<name>` is the test
+ *          case's base name with its extension removed.
+ */
+function deriveCachePath(testCasePath: string): string {
+  const extension = path.extname(testCasePath);
+  const stem = path.basename(testCasePath, extension);
+  const cacheFile = `${stem}.json`;
+  return path.join("tests", "smoke", "cache", cacheFile);
+}
+
+/** Write one message to stdout, followed by a newline. */
+function log(msg: string): void {
+  const line = msg + "\n";
+  process.stdout.write(line);
+}
+
+/**
+ * Promise-based delay.
+ *
+ * @param ms Delay in milliseconds, passed to `setTimeout`.
+ * @returns A promise that resolves once the timer fires.
+ */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
+
+/**
+ * Log `reason` with an `::error::` prefix and terminate the process
+ * with exit code 1. Never returns.
+ */
+function fail(reason: string): never {
+  const annotation = "::error::" + reason;
+  log(annotation);
+  return process.exit(1);
+}
+
+/*
+ * Entry point. Any rejection from main() ends the run through fail():
+ * Anthropic API failures get a dedicated message with the status and
+ * the first 200 characters of the body; everything else is reported as
+ * a runner crash. The rejection value is narrowed with `instanceof`
+ * before `.stack` is read, so a null/undefined or non-Error rejection
+ * is reported rather than throwing inside this handler.
+ */
+main().catch((e: unknown) => {
+  if (e instanceof AnthropicCallFailedError) {
+    fail(`anthropic API failed: ${e.status} ${e.body.slice(0, 200)}`);
+  }
+  const detail = e instanceof Error ? (e.stack ?? String(e)) : String(e);
+  fail(`runner crashed: ${detail}`);
+});
diff --git a/.github/scripts/agent-device-platform.ts b/.github/scripts/agent-device-platform.ts
new file mode 100644
index 000000000000..2bf816683c87
--- /dev/null
+++ b/.github/scripts/agent-device-platform.ts
@@ -0,0 +1,351 @@
+/*
+ * Platform abstraction for the LLM-driven smoke driver.
+ *
+ * The per-step LLM loop, cache replay, expect predicate evaluator,
+ * signature hashing, and Anthropic client are platform-agnostic.
+ * Boot dance, blocking-dialog recovery, and a small set of keyevent
+ * tools are NOT. This module lifts the latter behind a tiny
+ * `Platform` interface so a new platform (iOS) can be added without
+ * touching the driver core.
+ *
+ * The current file ships ONE implementation — `AndroidPlatform` —
+ * which is a verbatim move of today's inlined logic in
+ * agent-device-llm-driver.ts. PR A is a refactor with zero behavior
+ * change; the matching Android fork-test run must produce the same
+ * artifacts as before. PR B (a follow-up) adds `IOSPlatform`.
+ */
+
+import { execFileSync, spawn } from "child_process";
+import fs from "fs";
+import path from "path";
+
+import * as adCli from "./agent-device-cli";
+import type { Snapshot } from "./agent-device-cli";
+
+/* ---- shared types ---------------------------------------------------- */
+
+/**
+ * Platform identifiers a `Platform` implementation can report. Only
+ * "android" has an implementation in this file; `detectPlatform()`
+ * throws when "ios" is requested.
+ */
+export type PlatformName = "android" | "ios";
+
+/**
+ * Operations the driver delegates to a Platform impl. Everything not
+ * listed here is shared across platforms and stays in the driver.
+ */
+export interface Platform {
+  /** Identifies which platform this implementation drives. */
+  readonly name: PlatformName;
+
+  /** App bundle / package identifier passed to `agent-device open`. */
+  readonly appPackage: string;
+
+  /**
+   * Directory the runner searches for the installable bundle
+   * (APK on Android, .app on iOS). The first matching entry wins.
+   */
+  readonly appBundleDir: string;
+  /**
+   * File-name suffix an entry in `appBundleDir` must end with to be
+   * considered an installable bundle (see `locateBundle`).
+   */
+  readonly appBundleSuffix: string;
+
+  /**
+   * One-shot install of the located bundle. Throws on hard failure;
+   * the driver surfaces the error to the workflow log.
+   */
+  install(bundlePath: string): void;
+
+  /**
+   * Best-effort networking prep so Metro on the host is reachable
+   * from the device/sim. Android needs `adb reverse`; iOS Sim
+   * shares host loopback and this is a no-op.
+   *
+   * NOTE(review): `AndroidPlatform.setupNetworking` does not catch a
+   * failing `adb reverse`, so on Android a failure propagates to the
+   * caller — confirm whether "best-effort" is the intended contract.
+   */
+  setupNetworking(): void;
+
+  /**
+   * Best-effort pre-launch hardening — disable autofill, suppress
+   * system error dialogs, etc. Implementations should swallow
+   * failures (a missing setting on a fresh AVD is fine).
+   */
+  preBootHardening(): void;
+
+  /**
+   * Launch the app via `agent-device open --relaunch`. Handles the
+   * platform-specific `--platform`/`--serial`/`--device` flag set.
+   */
+  launch(): void;
+
+  /**
+   * Force a clean relaunch — used by blocking-dialog recovery.
+   * Android: `am force-stop` + relaunch. iOS: `xcrun simctl
+   * terminate` + relaunch.
+   */
+  forceRelaunch(): void;
+
+  /**
+   * Detect a system-modal "blocking" dialog over the app. Android's
+   * ANR dialog and iOS's permission alerts share the shape: a
+   * small handful of system buttons whose conservative choice
+   * lets the app continue. Returns true if dismissed.
+   *
+   * NOTE(review): the Android implementation returns true whenever
+   * the dialog signature matches — even if the button press threw —
+   * and also force-relaunches the app before returning.
+   */
+  tryDismissBlockingDialog(snap: Snapshot): boolean;
+
+  /**
+   * Map LLM-facing `back()` / `dismiss_keyboard()` tool calls to
+   * platform-specific keyevents.
+   */
+  back(): void;
+  dismissKeyboard(): void;
+
+  /**
+   * Dump device logs to the given file. Called by the driver's
+   * cleanup trap on exit. Best-effort — missing logs must not
+   * fail the run.
+   */
+  dumpLogsToFile(outPath: string): void;
+}
+
+/* ---- session constant shared by all platforms ------------------------ */
+
+/**
+ * Session name passed as `--session` to `agent-device open`.
+ * Taken from `AGENT_DEVICE_SESSION`, falling back to "ci" when unset.
+ */
+const SESSION = process.env.AGENT_DEVICE_SESSION ?? "ci";
+
+/* ---- Android implementation ----------------------------------------- */
+
+/**
+ * Android implementation of `Platform`, driven synchronously through
+ * `adb` and the `agent-device` CLI via `execFileSync`.
+ */
+class AndroidPlatform implements Platform {
+  readonly name = "android" as const;
+  /** Package to launch; `APP_PACKAGE` overrides the dev-build default. */
+  readonly appPackage = process.env.APP_PACKAGE ?? "com.expensify.chat.dev";
+  readonly appBundleDir = "android/app/build/outputs/apk/development/debug";
+  readonly appBundleSuffix = ".apk";
+
+  /**
+   * Installs the APK with `adb install -r -d -t`, streaming adb's
+   * output to this process's stdio.
+   *
+   * @throws if adb cannot be run or exits non-zero.
+   */
+  install(apkPath: string): void {
+    execFileSync("adb", ["install", "-r", "-d", "-t", apkPath], {
+      stdio: "inherit",
+    });
+  }
+
+  /**
+   * Reverse-forwards device port 8081 to host port 8081 with
+   * `adb reverse`.
+   *
+   * @throws if adb cannot be run or exits non-zero.
+   */
+  setupNetworking(): void {
+    execFileSync("adb", ["reverse", "tcp:8081", "tcp:8081"], {
+      stdio: "inherit",
+    });
+  }
+
+  /** Applies two device settings; each one is best-effort and never throws. */
+  preBootHardening(): void {
+    /*
+     * Suppress system ANR dialogs. Without this, the Pixel
+     * Launcher's "isn't responding" dialog covers our app on
+     * the 2-core ubuntu-latest runner during heavy boot load.
+     * The underlying ANR still happens but the foreground app
+     * keeps running uncovered.
+     */
+    try {
+      execFileSync(
+        "adb",
+        ["shell", "settings", "put", "global", "hide_error_dialogs", "1"],
+        { timeout: 5_000, stdio: "ignore" },
+      );
+    } catch {
+      /* best effort */
+    }
+
+    /*
+     * Disable Android Autofill globally. Without this, the
+     * framework silently populates editable fields when they
+     * gain focus and a credential is cached on the AVD —
+     * cache recording then misses the fill action and replay
+     * breaks on a different AVD snapshot.
+     */
+    try {
+      execFileSync(
+        "adb",
+        ["shell", "settings", "put", "secure", "autofill_service", "null"],
+        { timeout: 5_000, stdio: "ignore" },
+      );
+    } catch {
+      /* best effort */
+    }
+  }
+
+  /**
+   * Opens the app with `agent-device open ... --relaunch`, streaming
+   * output to this process's stdio. No timeout is applied here.
+   *
+   * @throws if the serial lookup or the CLI invocation fails.
+   */
+  launch(): void {
+    execFileSync("agent-device", this.openArgs(), { stdio: "inherit" });
+  }
+
+  /**
+   * Force-stops the app (5s bound) and reopens it (30s bound). Both
+   * stages are best-effort: a failure is written to stdout and never
+   * thrown.
+   */
+  forceRelaunch(): void {
+    try {
+      execFileSync("adb", ["shell", "am", "force-stop", this.appPackage], {
+        timeout: 5_000,
+        stdio: "ignore",
+      });
+    } catch (e: unknown) {
+      // Surface to caller via log line; not fatal.
+      process.stdout.write(
+        `platform.android: force-stop failed: ${AndroidPlatform.errorText(e)}\n`,
+      );
+    }
+    try {
+      // openArgs() runs inside the try so a failing serial lookup is
+      // reported as a relaunch failure rather than thrown.
+      execFileSync("agent-device", this.openArgs(), {
+        timeout: 30_000,
+        stdio: "ignore",
+      });
+    } catch (e: unknown) {
+      process.stdout.write(
+        `platform.android: relaunch failed: ${AndroidPlatform.errorText(e)}\n`,
+      );
+    }
+  }
+
+  /**
+   * Recognises the Android ANR dialog and recovers from it.
+   *
+   * The signature is: the snapshot contains exactly two `button`
+   * nodes whose lower-cased labels are "close app" and "wait". On a
+   * match the "Wait" button is pressed (failure logged, not thrown)
+   * and the app is force-relaunched.
+   *
+   * @returns true when the signature matched, false otherwise.
+   */
+  tryDismissBlockingDialog(snap: Snapshot): boolean {
+    const buttons = snap.nodes.filter((n) => n.kind === "button");
+    if (buttons.length !== 2) {
+      return false;
+    }
+    const labels = buttons.map((b) => b.text?.toLowerCase() ?? "").sort();
+    if (labels[0] !== "close app" || labels[1] !== "wait") {
+      return false;
+    }
+    try {
+      const waitBtn = buttons.find((b) => b.text?.toLowerCase() === "wait");
+      if (waitBtn) {
+        adCli.press(waitBtn.ref);
+      }
+    } catch (e: unknown) {
+      process.stdout.write(
+        `platform.android: dismiss press failed: ${AndroidPlatform.errorText(e)}\n`,
+      );
+    }
+    this.forceRelaunch();
+    return true;
+  }
+
+  /** Sends keyevent 4 via `adb shell input` (30s bound). */
+  back(): void {
+    execFileSync("adb", ["shell", "input", "keyevent", "4"], {
+      timeout: 30_000,
+      encoding: "utf8",
+    });
+  }
+
+  /** Sends keyevent 111 via `adb shell input` (30s bound). */
+  dismissKeyboard(): void {
+    execFileSync("adb", ["shell", "input", "keyevent", "111"], {
+      timeout: 30_000,
+      encoding: "utf8",
+    });
+  }
+
+  /**
+   * Writes `adb logcat -d` output (warnings and above, plus verbose
+   * ReactNativeJS / ReactNative tags) to `outPath`. Best-effort: any
+   * failure — including being unable to open `outPath` — is swallowed.
+   */
+  dumpLogsToFile(outPath: string): void {
+    let fd: number | undefined;
+    try {
+      fd = fs.openSync(outPath, "w");
+      execFileSync(
+        "adb",
+        [
+          "logcat",
+          "-d",
+          "-v",
+          "time",
+          "*:W",
+          "ReactNativeJS:V",
+          "ReactNative:V",
+        ],
+        {
+          stdio: ["ignore", fd, "ignore"],
+        },
+      );
+    } catch {
+      /* best effort */
+    } finally {
+      // Release the descriptor whether or not adb succeeded.
+      if (fd !== undefined) {
+        try {
+          fs.closeSync(fd);
+        } catch {
+          /* best effort */
+        }
+      }
+    }
+  }
+
+  /** Argument list shared by `launch()` and `forceRelaunch()`. */
+  private openArgs(): string[] {
+    return [
+      "open",
+      this.appPackage,
+      "--platform",
+      "android",
+      "--serial",
+      this.getSerial(),
+      "--session",
+      SESSION,
+      "--relaunch",
+    ];
+  }
+
+  /** Returns the trimmed output of `adb get-serialno`. */
+  private getSerial(): string {
+    return execFileSync("adb", ["get-serialno"], { encoding: "utf8" }).trim();
+  }
+
+  /** First 80 characters of an error's message, for one-line log output. */
+  private static errorText(e: unknown): string {
+    return (e instanceof Error ? e.message : String(e)).slice(0, 80);
+  }
+}
+
+/* ---- factory --------------------------------------------------------- */
+
+/**
+ * Chooses the Platform implementation from the `PLATFORM` env var
+ * (lower-cased, then trimmed). An unset or empty value means
+ * "android", for backwards compatibility with Phase 1.
+ *
+ * @throws Error when `PLATFORM` is "ios" (not implemented in this PR)
+ *   or any other unrecognised value.
+ */
+export function detectPlatform(): Platform {
+  const requested = (process.env.PLATFORM ?? "").toLowerCase().trim();
+  switch (requested) {
+    case "":
+    case "android":
+      return new AndroidPlatform();
+    case "ios":
+      throw new Error(
+        "PLATFORM=ios requested but IOSPlatform is not implemented in this PR (Phase 2 PR A). It lands in PR B.",
+      );
+    default:
+      throw new Error(
+        `unsupported PLATFORM='${requested}'; expected 'android' or 'ios'`,
+      );
+  }
+}
+
+/* ---- background process tracking ------------------------------------ */
+
+/**
+ * Tracks PIDs the driver spawns (e.g. Metro) so the cleanup trap can
+ * terminate them on exit. Exported so the driver and the cleanup
+ * handler share state without circular imports.
+ */
+export const backgroundPids: number[] = [];
+
+/**
+ * Starts Metro and tracks its PID. Identical across platforms — both
+ * Android and iOS dev builds fetch the JS bundle from
+ * `http://localhost:8081/...`.
+ *
+ * Runs `npm start` detached, with stdout and stderr appended to
+ * `metroLogPath`, and records the child's PID in `backgroundPids`
+ * when one was assigned.
+ *
+ * @throws if `metroLogPath` cannot be opened for appending.
+ */
+export function startMetro(metroLogPath: string): void {
+  const metroLog = fs.openSync(metroLogPath, "a");
+  try {
+    const metro = spawn("npm", ["start"], {
+      stdio: ["ignore", metroLog, metroLog],
+      detached: true,
+    });
+    metro.unref();
+    if (metro.pid) {
+      backgroundPids.push(metro.pid);
+    }
+  } finally {
+    // The child holds its own copy of the descriptor once spawn()
+    // returns; close the parent's copy so it is not leaked.
+    fs.closeSync(metroLog);
+  }
+}
+
+/**
+ * Finds the installable bundle inside `platform.appBundleDir`.
+ *
+ * Directory entries whose names end with `platform.appBundleSuffix`
+ * are sorted with the default string ordering and the first one is
+ * returned, joined onto the directory.
+ *
+ * @returns the bundle path, or null when the directory does not exist
+ *   or holds no matching entry. The caller decides how to report that.
+ */
+export function locateBundle(platform: Platform): string | null {
+  const { appBundleDir, appBundleSuffix } = platform;
+  if (!fs.existsSync(appBundleDir)) {
+    return null;
+  }
+  const candidates: string[] = [];
+  for (const entry of fs.readdirSync(appBundleDir)) {
+    if (entry.endsWith(appBundleSuffix)) {
+      candidates.push(entry);
+    }
+  }
+  candidates.sort();
+  return candidates.length > 0 ? path.join(appBundleDir, candidates[0]) : null;
+}
diff --git a/.github/scripts/agent-device-replay-cache.ts b/.github/scripts/agent-device-replay-cache.ts
new file mode 100644
index 000000000000..1f60599b3d53
--- /dev/null
+++ b/.github/scripts/agent-device-replay-cache.ts
@@ -0,0 +1,122 @@
+/*
+ * Replay cache for the LLM-driven smoke.
+ *
+ * Without this cache, every PR run pays the LLM round-trip cost on
+ * every step. Worse, every run is non-deterministic. With it, the
+ * happy path costs ~$0 and runs deterministically; only when the
+ * snapshot signature changes (real UI shape change) do we fall
+ * through to the LLM.
+ *
+ * The cache file lives at `tests/smoke/cache/<test-case>.json` and
+ * is committed. The diff in code review is the human-readable
+ * signal that "the SignIn UI shape changed" — the property
+ * reviewers want to see.
+ */
+
+import { createHash } from "crypto";
+import fs from "fs";
+import path from "path";
+import type { RoleLocator } from "./agent-device-snapshot-signature";
+
+/**
+ * One recorded tool call, discriminated by `tool`. `fill` and `press`
+ * address their target through a `RoleLocator` rather than a concrete
+ * ref. In the committed cache, a `wait_for` predicate is an expect-style
+ * expression such as `snapshot.contains_text("Magic code")`.
+ */
+export type CachedAction =
+  | { tool: "fill"; locator: RoleLocator; text: string }
+  | { tool: "press"; locator: RoleLocator }
+  | { tool: "back" }
+  | { tool: "dismiss_keyboard" }
+  | { tool: "wait"; ms: number }
+  | { tool: "wait_for"; predicate: string; timeoutMs: number };
+
+/** One recorded test-case step as stored in the cache file. */
+export type CachedStep = {
+  // Step position in the test case; matched by `lookup` and `diff`.
+  stepNumber: number;
+  // presumably `hashText` of the step's English text — TODO confirm against the driver
+  stepTextHash: string;
+  // Signature compared by `lookup`; presumably `snapshotSignature` taken before the actions ran — TODO confirm
+  preSignature: string;
+  // presumably `snapshotSignature` taken after the actions ran — TODO confirm
+  postSignature: string;
+  // Tool calls recorded for this step, in order.
+  actions: CachedAction[];
+  // Expect predicate text for the step, or null when there is none.
+  expect: string | null;
+  // Timestamp string; the committed cache stores ISO-8601 UTC values.
+  recordedAt: string;
+  // presumably the CI run id that produced the recording — TODO confirm
+  runId: string;
+};
+
+/**
+ * On-disk shape of a replay cache file, format version 1.
+ * `loadCache` rejects any other `version`.
+ */
+export type CacheV1 = {
+  version: 1;
+  // Model name stored alongside the recording.
+  model: string;
+  // Hash of the test case the steps were recorded against.
+  // NOTE(review): not compared by `loadCache` or `lookup` in this module — confirm the driver checks it.
+  testCaseHash: string;
+  steps: CachedStep[];
+};
+
+/**
+ * Returns the first 16 hex characters of the SHA-256 digest of `s`.
+ */
+export function hashText(s: string): string {
+  const hasher = createHash("sha256");
+  hasher.update(s);
+  const hex = hasher.digest("hex");
+  return hex.slice(0, 16);
+}
+
+/**
+ * Reads the replay cache at `filePath`.
+ *
+ * When the file does not exist, an empty version-1 cache carrying the
+ * given `model` and `testCaseHash` is returned. When it exists, its
+ * parsed content is returned as stored; `model` and `testCaseHash` are
+ * NOT compared against the file here.
+ * NOTE(review): confirm the caller checks `testCaseHash` before
+ * trusting the returned steps.
+ *
+ * @throws SyntaxError when the file is not valid JSON.
+ * @throws Error when the payload's `version` is not 1, or when `steps`
+ *   is not an array (reported here instead of failing later inside
+ *   `lookup`/`diff`).
+ */
+export function loadCache(
+  filePath: string,
+  model: string,
+  testCaseHash: string,
+): CacheV1 {
+  if (!fs.existsSync(filePath)) {
+    return { version: 1, model, testCaseHash, steps: [] };
+  }
+  const parsed: unknown = JSON.parse(fs.readFileSync(filePath, "utf8"));
+  // Narrow before touching properties: the payload may be null or a primitive.
+  const fields: { version?: unknown; steps?: unknown } =
+    typeof parsed === "object" && parsed !== null ? parsed : {};
+  if (fields.version !== 1) {
+    throw new Error(
+      `Cache version mismatch at ${filePath}: expected 1, got ${String(fields.version)}`,
+    );
+  }
+  if (!Array.isArray(fields.steps)) {
+    throw new Error(`Malformed cache at ${filePath}: "steps" must be an array`);
+  }
+  return parsed as CacheV1;
+}
+
+/**
+ * Returns the first cached step whose `stepNumber` and `preSignature`
+ * both equal the arguments, or null when none does.
+ *
+ * Design intent (from the original notes): a cache hit needs three
+ * things to line up —
+ *   1. test_case_hash — the test file itself hasn't been edited
+ *   2. step_number    — we're at the right step in the sequence
+ *   3. pre_signature  — we're staring at the same UI shape we recorded
+ * This function checks only (2) and (3); (1) is not compared here.
+ *
+ * On drift the runner is expected to fall through to the LLM and, on
+ * success, emit a cache-diff to artifacts so the PR check fails red
+ * until the contributor commits the updated cache.
+ */
+export function lookup(
+  cache: CacheV1,
+  stepNumber: number,
+  preSignature: string,
+): CachedStep | null {
+  for (const candidate of cache.steps) {
+    const sameStep = candidate.stepNumber === stepNumber;
+    if (sameStep && candidate.preSignature === preSignature) {
+      return candidate;
+    }
+  }
+  return null;
+}
+
+/**
+ * Summarises how `recorded` differs from `committed`, one line per
+ * finding, joined with newlines (empty string when nothing differs).
+ *
+ * Steps are paired by `stepNumber`, using the first committed step
+ * with that number. A recorded step with no committed counterpart is
+ * reported as NEW; otherwise changes to the pre-signature, the
+ * post-signature and the JSON-serialised actions are each reported.
+ * Only steps present in `recorded` are visited.
+ */
+export function diff(committed: CacheV1, recorded: CacheV1): string {
+  const report: string[] = [];
+  recorded.steps.forEach((fresh) => {
+    const baseline = committed.steps.find(
+      (c) => c.stepNumber === fresh.stepNumber,
+    );
+    if (!baseline) {
+      report.push(
+        `+ step ${fresh.stepNumber}: NEW (pre=${fresh.preSignature}, post=${fresh.postSignature})`,
+      );
+      return;
+    }
+    if (baseline.preSignature !== fresh.preSignature) {
+      report.push(
+        `~ step ${fresh.stepNumber}: pre_signature ${baseline.preSignature} → ${fresh.preSignature}`,
+      );
+    }
+    if (baseline.postSignature !== fresh.postSignature) {
+      report.push(
+        `~ step ${fresh.stepNumber}: post_signature ${baseline.postSignature} → ${fresh.postSignature}`,
+      );
+    }
+    const before = JSON.stringify(baseline.actions);
+    const after = JSON.stringify(fresh.actions);
+    if (before !== after) {
+      report.push(
+        `~ step ${fresh.stepNumber}: actions changed (${baseline.actions.length} → ${fresh.actions.length})`,
+      );
+    }
+  });
+  return report.join("\n");
+}
+
+/**
+ * Writes `cache` to `filePath` as 2-space-indented JSON followed by a
+ * newline, creating any missing parent directories first.
+ */
+export function writeCache(filePath: string, cache: CacheV1): void {
+  const parentDir = path.dirname(filePath);
+  fs.mkdirSync(parentDir, { recursive: true });
+  const serialized = JSON.stringify(cache, null, 2) + "\n";
+  fs.writeFileSync(filePath, serialized);
+}
diff --git a/.github/scripts/agent-device-snapshot-signature.ts b/.github/scripts/agent-device-snapshot-signature.ts
new file mode 100644
index 000000000000..d6681863e6e3
--- /dev/null
+++ b/.github/scripts/agent-device-snapshot-signature.ts
@@ -0,0 +1,122 @@
+/*
+ * Structural signature of a UI snapshot.
+ *
+ * The signature is the cache key for the replay system: cache hits replay
+ * recorded actions, cache misses fall back to the LLM. For that to work,
+ * the signature must be:
+ *
+ *   1. STABLE across cosmetic UI changes — locale rotation, A/B copy
+ *      tests, visible user data, dynamic timestamps. We exclude visible
+ *      `text` content for this reason. A label changing from
+ *      "Continue" to "Submit" must NOT bust the cache (the replay layer
+ *      finds the button by role + position, then the LLM recovery layer
+ *      handles a real shape change if any).
+ *
+ *   2. SENSITIVE to structural change — a new button appearing, an
+ *      input becoming non-editable, a screen transitioning to a
+ *      different layout. These are the events that invalidate a
+ *      recorded action sequence.
+ *
+ * Net effect: localization or copy churn doesn't trigger an LLM call,
+ * but real UI shape change does.
+ */
+
+import { createHash } from "crypto";
+import type { Snapshot, SnapshotNode } from "./agent-device-cli";
+
+/**
+ * Reduces a node to the `|`-separated fields that feed the signature:
+ * its kind, then one flag each for "has text" (T), editable (E),
+ * enabled (N) and scrollable (S), each suffixed 1 or 0. The text
+ * content itself is deliberately left out.
+ */
+function project(node: SnapshotNode): string {
+  const flag = (tag: string, on: unknown): string => tag + (on ? "1" : "0");
+  const fields = [
+    node.kind,
+    flag("T", node.text),
+    flag("E", node.editable),
+    flag("N", node.enabled),
+    flag("S", node.scrollable),
+  ];
+  return fields.join("|");
+}
+
+/**
+ * Identifies transient nodes that the signature has to skip.
+ *
+ * In dev mode React Native draws an inline "!, <warning>" bubble for
+ * runtime warnings (StrictMode, dev-only assertions, ...). Whether the
+ * bubble is present varies run to run with bundler timing and warning
+ * suppression, so the same screen can yield different node counts:
+ * runs 25659967543 and 25662443061 hashed an identical SignIn screen
+ * differently because one carried 3 extra dev-warning nodes, and
+ * cache replay never landed.
+ *
+ * The warnings are dev-only, never ship in release builds and mean
+ * nothing to a user, so they are treated as cosmetic.
+ *
+ * A node matches when it has text and is either a `group` whose text
+ * starts with "!, ", or a `text` node whose text is exactly "!" or
+ * starts with one of the known warning prefixes.
+ */
+function isTransientDevWarning(node: SnapshotNode): boolean {
+  const { kind, text } = node;
+  if (!text) {
+    return false;
+  }
+  if (kind === "group") {
+    return text.startsWith("!, ");
+  }
+  if (kind !== "text") {
+    return false;
+  }
+  const warningPrefixes = [
+    "Open debugger to view warnings",
+    "The result of getSnapshot",
+  ];
+  return text === "!" || warningPrefixes.some((p) => text.startsWith(p));
+}
+
+/**
+ * Computes the structural signature of a snapshot: every node that is
+ * not a transient dev warning is projected to its structural fields,
+ * the projections are joined with newlines in node order, and the
+ * first 16 hex characters of the SHA-256 digest are returned.
+ */
+export function snapshotSignature(snap: Snapshot): string {
+  const rows: string[] = [];
+  for (const node of snap.nodes) {
+    if (isTransientDevWarning(node)) {
+      continue;
+    }
+    rows.push(project(node));
+  }
+  const hasher = createHash("sha256");
+  hasher.update(rows.join("\n"));
+  return hasher.digest("hex").slice(0, 16);
+}
+
+/**
+ * Locator that survives across runs even though `@eN` refs do not.
+ * The runner re-resolves to a concrete `@ref` against the live
+ * snapshot at replay time.
+ *
+ * Example: `{kind: "text-field", index: 0, editable: true}` →
+ * "the first editable text-field in the current snapshot".
+ */
+export type RoleLocator = {
+  // Node kind to match exactly.
+  kind: string;
+  // Zero-based position within the nodes `locatorToRef` keeps after
+  // filtering by `kind` and, when set, by `editable`.
+  index: number;
+  // When defined, only nodes whose `editable` flag equals this value
+  // are candidates; when undefined, editability is not considered.
+  editable?: boolean;
+};
+
+/**
+ * Converts a concrete `@ref` from `snap` into a run-independent
+ * `RoleLocator`.
+ *
+ * The index is counted over exactly the candidate set `locatorToRef`
+ * will rebuild from the returned locator: nodes of the same kind, and
+ * — when the target is editable, so `editable` is set on the locator —
+ * only those with the same `editable` flag. Counting over every node
+ * of the kind would yield an index that points past (or at the wrong
+ * entry of) the editable-filtered list whenever a non-editable node of
+ * the same kind precedes the target.
+ *
+ * @returns the locator, or null when no node in `snap` has `ref`.
+ *   `editable` is `true` for an editable target and `undefined`
+ *   otherwise.
+ */
+export function refToLocator(snap: Snapshot, ref: string): RoleLocator | null {
+  // Resolve the target once instead of once per filtered node.
+  const target = snap.nodes.find((n) => n.ref === ref);
+  if (!target) {
+    return null;
+  }
+  const candidates = snap.nodes.filter(
+    (n) =>
+      n.kind === target.kind &&
+      (!target.editable || n.editable === target.editable),
+  );
+  return {
+    kind: target.kind,
+    index: candidates.indexOf(target),
+    editable: target.editable || undefined,
+  };
+}
+
+/**
+ * Resolves a `RoleLocator` against a live snapshot.
+ *
+ * Candidates are the nodes whose kind equals `loc.kind` and, when
+ * `loc.editable` is defined, whose `editable` flag equals it. The ref
+ * of the candidate at position `loc.index` is returned.
+ *
+ * @returns the matching ref, or null when there is no candidate at
+ *   that index.
+ */
+export function locatorToRef(snap: Snapshot, loc: RoleLocator): string | null {
+  const candidates: SnapshotNode[] = [];
+  for (const node of snap.nodes) {
+    const kindMatches = node.kind === loc.kind;
+    const editableMatches =
+      loc.editable === undefined || node.editable === loc.editable;
+    if (kindMatches && editableMatches) {
+      candidates.push(node);
+    }
+  }
+  const hit = candidates[loc.index];
+  return hit?.ref ?? null;
+}
diff --git a/.github/workflows/smokeAndroidLLM.yml b/.github/workflows/smokeAndroidLLM.yml
new file mode 100644
index 000000000000..4c32238f5dba
--- /dev/null
+++ b/.github/workflows/smokeAndroidLLM.yml
@@ -0,0 +1,156 @@
+name: Android Smoke (agent-device · Phase 1, LLM-driven)
+
+# Phase-1 build-health canary: same emulator + APK + boot dance as
+# Phase 0, but the test steps are now plain English and an LLM driver
+# (Claude Sonnet) figures out which agent-device calls to make. A
+# committed replay cache at tests/smoke/cache/<test>.json keeps the
+# happy path deterministic and ~$0 in API spend; cache misses fall
+# back to the LLM, and final-tier failures (API down, LLM gives up)
+# fall back to a deterministic Phase-0-style bash recipe so an
+# Anthropic outage doesn't fail the build.
+#
+# Initial rollout: continue-on-error: true so this is non-blocking
+# while we compare reliability against Phase 0 over a 2-week window.
+# Once flake rate <= Phase 0's, flip to required and retire Phase 0.
+
+on:
+  pull_request:
+    types: [opened, synchronize]
+    branches-ignore: [staging, production]
+    # Don't ignore tests/ or .github/ — Phase 1 fires on changes to
+    # the test cases, the runner scripts, and the workflow itself.
+    paths-ignore:
+      - docs/**
+      - help/**
+      - contributingGuides/**
+      - "**.md"
+  workflow_dispatch:
+
+concurrency:
+  group: smoke-android-llm-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    name: Android emulator smoke (LLM-driven)
+    if: ${{ github.actor != 'OSBotify' }}
+    # Non-blocking during the rollout window. The recommendation in the
+    # Phase 1 plan is to flip this to false (or remove the line) after
+    # 2 weeks if Phase 1's flake rate <= Phase 0's.
+    continue-on-error: true
+    runs-on: blacksmith-4vcpu-ubuntu-2404
+    timeout-minutes: 35
+    env:
+      AGENT_DEVICE_VERSION: "0.14.7"
+      # Hard kill-switch: total input+output tokens accumulated across
+      # the run. Bounds runaway spend if a prompt or tool design
+      # accidentally explodes context. ~$1 worst-case at sonnet 4.6
+      # rates without prompt cache; in practice with prompt cache the
+      # happy path uses 5-10x less.
+      LLM_TOKEN_BUDGET: "200000"
+      ANTHROPIC_MODEL: "claude-sonnet-4-6"
+
+    steps:
+      - name: Checkout
+        # v6
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          submodules: true
+          token: ${{ secrets.OS_BOTIFY_TOKEN }}
+
+      - name: Verify KVM / fix permissions if needed
+        run: |
+          if ! ls -la /dev/kvm 2>/dev/null; then
+            echo "::error::No /dev/kvm on this runner — emulator will fall back to TCG and the job will time out"
+            exit 1
+          fi
+          if [ ! -w /dev/kvm ]; then
+            echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
+              | sudo tee /etc/udev/rules.d/99-kvm4all.rules
+            sudo udevadm control --reload-rules
+            sudo udevadm trigger --name-match=kvm
+          fi
+
+      - name: Setup Java
+        uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12
+        with:
+          distribution: temurin
+          java-version: "17"
+
+      - name: Setup Node
+        uses: ./.github/actions/composite/setupNode
+        with:
+          IS_HYBRID_BUILD: "false"
+
+      - name: Install agent-device CLI
+        run: npm install -g "agent-device@${AGENT_DEVICE_VERSION}"
+
+      - name: Configure AWS credentials (Rock S3 cache)
+        uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
+
+      - name: Configure MapBox SDK
+        run: ./scripts/setup-mapbox-sdk.sh ${{ secrets.MAPBOX_SDK_DOWNLOAD_TOKEN }}
+
+      - name: Install Android CMake 3.30.5 (Hermes pins this exact version)
+        run: |
+          yes | "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --licenses > /dev/null 2>&1 || true
+          "$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager" --install "cmake;3.30.5" 2>&1 | tail -5
+
+      - name: Build / fetch developmentDebug APK via Rock
+        env:
+          STANDALONE_NEW_DOT: "true"
+        run: npx rock build:android --variant developmentDebug
+
+      - name: AVD cache
+        uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb
+        id: avd-cache
+        with:
+          path: |
+            ~/.android/avd/*
+            ~/.android/adb*
+            ~/.android/adbkey
+            ~/.android/adbkey.pub
+          key: avd-pixel8-api35-x86_64-v1-${{ hashFiles('.github/workflows/smokeAndroidLLM.yml') }}
+
+      # Runs only when the "AVD cache" step did not restore a cache.
+      # Boots the emulator once and waits for sys.boot_completed to be
+      # non-empty. Unlike the smoke step below, the emulator options here
+      # omit -no-snapshot-save — presumably so the emulator writes a
+      # snapshot for the AVD cache to pick up (TODO confirm).
+      - name: Prime AVD snapshot (cache miss only)
+        if: steps.avd-cache.outputs.cache-hit != 'true'
+        uses: reactivecircus/android-emulator-runner@v2
+        with:
+          api-level: 35
+          target: google_apis
+          arch: x86_64
+          profile: pixel_8
+          force-avd-creation: false
+          emulator-options: -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim
+          disable-animations: false
+          script: |
+            adb wait-for-device
+            until [ -n "$(adb shell getprop sys.boot_completed | tr -d '\r')" ]; do sleep 2; done
+            echo "AVD primed"
+
+      # Boots the same AVD profile as the priming step (API 35, x86_64,
+      # pixel_8) and runs the `smoke:android:llm` npm script, which
+      # package.json maps to the LLM driver. ANTHROPIC_API_KEY is exposed
+      # to this step only. -no-snapshot-save is passed here, and
+      # animations are disabled for the run.
+      - name: Run smoke (LLM-driven)
+        uses: reactivecircus/android-emulator-runner@v2
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        with:
+          api-level: 35
+          target: google_apis
+          arch: x86_64
+          profile: pixel_8
+          force-avd-creation: false
+          emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim
+          disable-animations: true
+          script: npm run smoke:android:llm
+
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        with:
+          name: smoke-android-llm-${{ github.run_id }}-${{ github.run_attempt }}
+          path: artifacts/
+          if-no-files-found: warn
+          retention-days: 14
diff --git a/package.json b/package.json
index 3681ee193d4e..c526ddc80729 100644
--- a/package.json
+++ b/package.json
@@ -35,6 +35,7 @@
     "createDocsRoutes": "ts-node .github/scripts/createDocsRoutes.ts",
     "generateAllowedUrls": "ts-node .github/scripts/generateAllowedUrls.ts",
     "detectRedirectCycle": "ts-node .github/scripts/detectRedirectCycle.ts",
+    "smoke:android:llm": "ts-node .github/scripts/agent-device-llm-driver.ts",
     "ios-build": "bundle exec fastlane ios build_unsigned",
     "ios-hybrid-build": "bundle exec fastlane ios build_unsigned_hybrid",
     "android-build": "bundle exec fastlane android build_local",
diff --git a/tests/smoke/android-signin.testcase.txt b/tests/smoke/android-signin.testcase.txt
new file mode 100644
index 000000000000..b0b9add4c232
--- /dev/null
+++ b/tests/smoke/android-signin.testcase.txt
@@ -0,0 +1,24 @@
+# Phase-1 LLM-driven Android smoke — SignIn flow.
+#
+# Each step is plain English the LLM reads to decide what UI actions to
+# take. The optional `expect:` line is a machine-checked postcondition
+# evaluated by the runner (NOT the LLM) after the step's tool calls
+# complete; it is what gives the canary a hard pass/fail signal
+# independent of the LLM's self-assessment.
+#
+# Expect predicates supported (see .github/scripts/agent-device-expect.ts):
+#   snapshot.contains_text("...")
+#   snapshot.field_with_text("...").exists
+#   appstate.foreground == "..."
+
+1. Wait for the app to fully load and the SignIn screen to appear.
+   expect: snapshot.contains_text("Phone or email")
+
+2. Enter "rustam.zeinalov@callstack.com" into the email/phone field.
+   expect: snapshot.field_with_text("rustam.zeinalov@callstack.com").exists
+
+3. Press the Continue button.
+   expect: appstate.foreground == "com.expensify.chat.dev"
+
+4. Wait for the magic-code screen to appear.
+   expect: snapshot.contains_text("Magic code")
diff --git a/tests/smoke/cache/android-signin.testcase.json b/tests/smoke/cache/android-signin.testcase.json
new file mode 100644
index 000000000000..4dffe68f81f7
--- /dev/null
+++ b/tests/smoke/cache/android-signin.testcase.json
@@ -0,0 +1,74 @@
+{
+  "version": 1,
+  "model": "claude-sonnet-4-6",
+  "testCaseHash": "377c89ecd3182b95",
+  "steps": [
+    {
+      "stepNumber": 1,
+      "stepTextHash": "eefe04289ed44849",
+      "preSignature": "bada22fec79afdc7",
+      "postSignature": "bada22fec79afdc7",
+      "actions": [],
+      "expect": "snapshot.contains_text(\"Phone or email\")",
+      "recordedAt": "2026-05-11T10:29:23.672Z",
+      "runId": "25659967543"
+    },
+    {
+      "stepNumber": 2,
+      "stepTextHash": "988a4a6e077e6dc6",
+      "preSignature": "bada22fec79afdc7",
+      "postSignature": "04ba1966c1ae1c5f",
+      "actions": [
+        {
+          "tool": "fill",
+          "locator": {
+            "kind": "text-field",
+            "index": 0,
+            "editable": true
+          },
+          "text": "rustam.zeinalov@callstack.com"
+        }
+      ],
+      "expect": "snapshot.field_with_text(\"rustam.zeinalov@callstack.com\").exists",
+      "recordedAt": "2026-05-11T10:29:23.673Z",
+      "runId": "25659967543"
+    },
+    {
+      "stepNumber": 3,
+      "stepTextHash": "a071b334a6b8c0f4",
+      "preSignature": "04ba1966c1ae1c5f",
+      "postSignature": "33d1e5d0787b275a",
+      "actions": [
+        {
+          "tool": "press",
+          "locator": {
+            "kind": "button",
+            "index": 0
+          }
+        },
+        {
+          "tool": "dismiss_keyboard"
+        }
+      ],
+      "expect": "appstate.foreground == \"com.expensify.chat.dev\"",
+      "recordedAt": "2026-05-11T10:29:23.673Z",
+      "runId": "25659967543"
+    },
+    {
+      "stepNumber": 4,
+      "stepTextHash": "a1059919be2f42c9",
+      "preSignature": "33d1e5d0787b275a",
+      "postSignature": "33d1e5d0787b275a",
+      "actions": [
+        {
+          "tool": "wait_for",
+          "predicate": "snapshot.contains_text(\"Magic code\")",
+          "timeoutMs": 60000
+        }
+      ],
+      "expect": "snapshot.contains_text(\"Magic code\")",
+      "recordedAt": "2026-05-11T10:29:23.673Z",
+      "runId": "25659967543"
+    }
+  ]
+}