diff --git a/docs/remote-connect/feishu-bot-setup.md b/docs/remote-connect/feishu-bot-setup.md new file mode 100644 index 000000000..c4a50a708 --- /dev/null +++ b/docs/remote-connect/feishu-bot-setup.md @@ -0,0 +1,77 @@ +# Feishu Bot Setup Guide + +[中文](./feishu-bot-setup.zh-CN.md) + +Use this guide to pair BitFun through a Feishu bot. + +## Setup Steps + +### Step 1 + +Open the Feishu Developer Platform and log in: + +https://open.feishu.cn/app + +### Step 2 + +Create a custom app. + +### Step 3 + +Add the bot feature: + +Features - Bot - Add + +### Step 4 + +Add permission scopes: + +Permissions & Scopes - Add Scopes - Search for `im:` - Select all scopes that do not require approval - Add Scopes + +### Step 5 + +Copy the app credentials: + +Credentials & Basic Info - App ID and App Secret + +### Step 6 + +Open BitFun and start the Feishu bot connection: + +Remote Connect - IM Bot - Feishu Bot - Fill in App ID and App Secret - Connect + +### Step 7 + +Return to the Feishu Developer Platform. + +### Step 8 + +Configure event subscriptions: + +Events & callbacks - Event configuration - Subscription mode - Persistent connection - Save + +Then add message events: + +Add Events - Search for `im.message` - Select all - Confirm + +### Step 9 + +Configure callback subscriptions: + +Events & callbacks - Callback configuration - Subscription mode - Persistent connection - Save + +Then add card action callbacks: + +Add callback - Search for `card.action.trigger` - Select it - Confirm + +### Step 10 + +Publish the bot. + +### Step 11 + +Open Feishu, search for the bot name, open the chat, enter any message, and send it. + +### Step 12 + +Enter the 6-digit pairing code shown in BitFun Desktop, send it, and wait for the connection to succeed. 
diff --git a/docs/remote-connect/feishu-bot-setup.zh-CN.md b/docs/remote-connect/feishu-bot-setup.zh-CN.md new file mode 100644 index 000000000..475bb178e --- /dev/null +++ b/docs/remote-connect/feishu-bot-setup.zh-CN.md @@ -0,0 +1,77 @@ +# 飞书机器人配置指南 + +[English](./feishu-bot-setup.md) + +适用于通过飞书机器人完成 BitFun 远程连接配对。 + +## 配置步骤 + +### 第一步 + +打开飞书开发者平台并登录: + +https://open.feishu.cn/app + +### 第二步 + +创建企业自建应用。 + +### 第三步 + +添加机器人能力: + +添加应用能力 - 机器人 - 添加 + +### 第四步 + +开通权限: + +权限管理 - 开通权限 - 搜索 `im:` - 选择所有免审权限 - 确认开通权限 + +### 第五步 + +复制应用凭证: + +凭证与基础信息 - App ID 和 App Secret + +### 第六步 + +打开 BitFun 并启动飞书机器人连接: + +远程连接 - IM 机器人 - 飞书机器人 - 填写 App ID 和 App Secret - 连接 + +### 第七步 + +回到飞书开发者平台。 + +### 第八步 + +配置事件订阅: + +事件与回调 - 事件配置 - 订阅方式 - 使用长连接接收事件 - 保存 + +然后添加消息事件: + +添加事件 - 搜索 `im.message` - 全选 - 确认添加 + +### 第九步 + +配置回调订阅: + +事件与回调 - 回调配置 - 订阅方式 - 使用长连接接收事件 - 保存 + +然后添加卡片动作回调: + +添加回调 - 搜索 `card.action.trigger` - 选中 - 确认添加 + +### 第十步 + +发布机器人。 + +### 第十一步 + +打开飞书应用,搜索机器人名称,点击机器人打开对话框,输入任意消息并发送。 + +### 第十二步 + +输入 BitFun Desktop 显示的 6 位配对码,发送后等待连接成功。 diff --git a/src/crates/core/src/agentic/agents/claw_mode.rs b/src/crates/core/src/agentic/agents/claw_mode.rs index 4dc337142..a0c07bf29 100644 --- a/src/crates/core/src/agentic/agents/claw_mode.rs +++ b/src/crates/core/src/agentic/agents/claw_mode.rs @@ -32,10 +32,9 @@ impl ClawMode { "SessionMessage".to_string(), "SessionHistory".to_string(), "Cron".to_string(), - // All control capabilities (desktop, browser, app, terminal, system) - // are unified under the ControlHub tool. Use ControlHub for - // screenshot, click, click_element, mouse_move, scroll, drag, - // locate, browser CDP automation, and BitFun self-UI control. + // Browser, terminal, and routing metadata live under ControlHub. + // Local desktop/system control is delegated to the ComputerUse + // agent/tool instead of being surfaced as a ControlHub domain. 
"ControlHub".to_string(), ], } diff --git a/src/crates/core/src/agentic/agents/computer_use_mode.rs b/src/crates/core/src/agentic/agents/computer_use_mode.rs new file mode 100644 index 000000000..13dad0530 --- /dev/null +++ b/src/crates/core/src/agentic/agents/computer_use_mode.rs @@ -0,0 +1,80 @@ +//! Computer Use sub-agent +//! +//! Dedicated agent for perceiving and operating the user's local computer. + +use super::Agent; +use async_trait::async_trait; + +pub struct ComputerUseMode { + default_tools: Vec, +} + +impl Default for ComputerUseMode { + fn default() -> Self { + Self::new() + } +} + +impl ComputerUseMode { + pub fn new() -> Self { + Self { + default_tools: vec![ + "AskUserQuestion".to_string(), + "TodoWrite".to_string(), + "Skill".to_string(), + "Bash".to_string(), + "TerminalControl".to_string(), + "ControlHub".to_string(), + "ComputerUse".to_string(), + ], + } + } +} + +#[async_trait] +impl Agent for ComputerUseMode { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn id(&self) -> &str { + "ComputerUse" + } + + fn name(&self) -> &str { + "Computer Use" + } + + fn description(&self) -> &str { + "Dedicated desktop automation agent for perceiving the local environment and operating apps, browsers, and OS UI" + } + + fn prompt_template_name(&self, _model_name: Option<&str>) -> &str { + "computer_use_mode" + } + + fn default_tools(&self) -> Vec { + self.default_tools.clone() + } + + fn is_readonly(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::{Agent, ComputerUseMode}; + + #[test] + fn computer_use_mode_basics() { + let agent = ComputerUseMode::new(); + assert_eq!(agent.id(), "ComputerUse"); + assert_eq!(agent.name(), "Computer Use"); + assert_eq!(agent.prompt_template_name(None), "computer_use_mode"); + assert!(agent.default_tools().contains(&"ControlHub".to_string())); + assert!(agent.default_tools().contains(&"ComputerUse".to_string())); + assert!(!agent.default_tools().contains(&"Write".to_string())); + 
assert!(!agent.is_readonly()); + } +} diff --git a/src/crates/core/src/agentic/agents/cowork_mode.rs b/src/crates/core/src/agentic/agents/cowork_mode.rs index 8a493bc7f..f6eea9786 100644 --- a/src/crates/core/src/agentic/agents/cowork_mode.rs +++ b/src/crates/core/src/agentic/agents/cowork_mode.rs @@ -59,7 +59,7 @@ impl Agent for CoworkMode { } fn description(&self) -> &str { - "Collaborative mode: clarify first, track progress lightly, verify outcomes" + "Office and collaboration mode for documents, research, drafting, and structured multi-step work" } fn prompt_template_name(&self, _model_name: Option<&str>) -> &str { diff --git a/src/crates/core/src/agentic/agents/deep_research_agent.rs b/src/crates/core/src/agentic/agents/deep_research_agent.rs index 6f4a94a95..abc892d71 100644 --- a/src/crates/core/src/agentic/agents/deep_research_agent.rs +++ b/src/crates/core/src/agentic/agents/deep_research_agent.rs @@ -30,6 +30,7 @@ impl DeepResearchAgent { // Terminal — run commands to gather data (e.g. 
git log, curl, jq) "Bash".to_string(), "TerminalControl".to_string(), + "ControlHub".to_string(), // Task tracking "TodoWrite".to_string(), ], @@ -85,6 +86,7 @@ mod tests { assert!(tools.contains(&"Write".to_string())); assert!(tools.contains(&"Bash".to_string())); assert!(tools.contains(&"TerminalControl".to_string())); + assert!(tools.contains(&"ControlHub".to_string())); } #[test] diff --git a/src/crates/core/src/agentic/agents/mod.rs b/src/crates/core/src/agentic/agents/mod.rs index c6276e327..7dd3e4f30 100644 --- a/src/crates/core/src/agentic/agents/mod.rs +++ b/src/crates/core/src/agentic/agents/mod.rs @@ -8,6 +8,7 @@ mod registry; // Modes mod agentic_mode; mod claw_mode; +mod computer_use_mode; mod cowork_mode; mod debug_mode; mod plan_mode; @@ -26,6 +27,7 @@ pub use agentic_mode::AgenticMode; use async_trait::async_trait; pub use claw_mode::ClawMode; pub use code_review_agent::CodeReviewAgent; +pub use computer_use_mode::ComputerUseMode; pub use cowork_mode::CoworkMode; pub use custom_subagents::{CustomSubagent, CustomSubagentKind}; pub use debug_mode::DebugMode; diff --git a/src/crates/core/src/agentic/agents/prompt_builder/prompt_builder_impl.rs b/src/crates/core/src/agentic/agents/prompt_builder/prompt_builder_impl.rs index 924583f5f..33f5fb9ee 100644 --- a/src/crates/core/src/agentic/agents/prompt_builder/prompt_builder_impl.rs +++ b/src/crates/core/src/agentic/agents/prompt_builder/prompt_builder_impl.rs @@ -454,9 +454,9 @@ Do not read from, modify, create, move, or delete files outside this workspace u if self.context.supports_image_understanding == Some(false) { result.push_str( "\n\n# Computer use (text-only primary model)\n\n\ -The configured **primary model does not accept image inputs**. When using **`ControlHub`** with **`domain: \"desktop\"`** (or **`domain: \"browser\"`**):\n\ +The configured **primary model does not accept image inputs**. 
When using **`ComputerUse`** (or **`ControlHub`** with **`domain: \"browser\"`**):\n\ - **Do not** use **`screenshot`** (desktop) and **avoid** `domain:\"browser\" action:\"screenshot\"` — the JPEG bytes will be unreadable.\n\ -- **ACTION PRIORITY:** 1) Terminal/CLI/system commands (`Bash` tool, or `ControlHub domain:\"system\" action:\"run_script\"`) 2) Keyboard shortcuts (**`key_chord`**, **`type_text`**) 3) UI control: **`click_element`** (AX) → **`locate`** → **`move_to_text`** (use **`move_to_text_match_index`** when multiple OCR hits listed) → **`mouse_move`** (**`use_screen_coordinates`: true** with coordinates from tool JSON) → **`click`**. For browser work prefer `snapshot` → click by `@e*` ref over screenshots.\n\ +- **ACTION PRIORITY:** 1) Terminal/CLI/system commands (`Bash` tool, or `ComputerUse` `run_script`) 2) Keyboard shortcuts (**`key_chord`**, **`type_text`**) 3) UI control: **`click_element`** (AX) → **`locate`** → **`move_to_text`** (use **`move_to_text_match_index`** when multiple OCR hits listed) → **`mouse_move`** (**`use_screen_coordinates`: true** with coordinates from tool JSON) → **`click`**. For browser work prefer `snapshot` → click by `@e*` ref over screenshots.\n\ - **Never guess coordinates** — always use precise methods (AX, OCR, system coordinates from tool results, or browser snapshot refs).\n", ); } diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index 0d326766e..f80ed3638 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -5,39 +5,46 @@ Your main goal is to follow the USER's instructions at each message, denoted by Tool results and user messages may include tags. These tags contain useful information and reminders. Please heed them, but don't mention them in your response to the user. 
{LANGUAGE_PREFERENCE} + # Tool Call Style -Default: do not narrate routine, low-risk tool calls (just call the tool). -Narrate only when it helps: multi-step work, complex/challenging problems, sensitive actions (e.g., deletions), or when the user explicitly asks. -Keep narration brief and value-dense; avoid repeating obvious steps. -Use plain human language for narration unless in a technical context. + +Default: do not narrate routine, low-risk tool calls. Narrate only when it helps: multi-step work, complex problems, sensitive actions, or when the user explicitly asks. + When a first-class tool exists for an action, use the tool directly instead of asking the user to run equivalent CLI commands. -**Computer use (desktop automation):** When doing desktop automation, prefer script/command-line automation where possible, but execute steps ONE AT A TIME (like you would with GUI automation), not in a single huge script. -For script automation: -- **Step-by-step**: One simple script/command per step (e.g., activate app → open search → type name → press Enter, etc.) -- **macOS**: Use simple `osascript` commands (one per step), or `open -a "App"` -- **Windows**: Use simple `powershell`/`cmd` commands (one per step) -- **Linux**: Use simple `xdotool`/`wmctrl` commands (one per step) +# Control Boundaries + +Use `ControlHub` for browser automation, terminal signalling, and routing/capability introspection: + +- `domain: "browser"` for websites and web apps in the user's real browser through CDP. +- `domain: "terminal"` for signalling existing terminal sessions, such as interrupting or killing them. +- `domain: "meta"` for capability and route checks. -Only use **`ControlHub`** with `domain: "desktop"` when scripts can't do the job, or when you need visual confirmation. +Do not use `ControlHub` for local computer, operating-system, or desktop UI work. Desktop and system actions have moved to the dedicated `ComputerUse` tool/agent. 
This includes screenshots, OCR, mouse, keyboard, app state, app launching, opening files or URLs through the OS, clipboard access, OS facts, and local scripts. -If the user's request needs **more than one** `ControlHub` `domain: "desktop"` call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether **script automation applies** (one step at a time), (b) whether `Bash` applies (e.g. `open -a "AppName"`), (c) whether `key_chord` / `type_text` can replace mouse steps (Enter, Escape, Tab, shortcuts), (d) which `click_element` / `move_to_text` / `locate` calls to try if pointing is required, (e) target app/window/display, (f) how you will verify focus. Then execute step-by-step. +If the user asks you to operate or inspect the local computer, delegate the task to the `ComputerUse` sub-agent when available. Include the user's goal, target app/window/site, safety constraints, and expected verification in the handoff. If delegation is unavailable, explain that the task needs the Computer Use agent. # Session Coordination + For complex coding tasks or office-style multi-step tasks, prefer multi-session coordination over doing everything in the current session. + Use `SessionControl` to list, reuse, create, and delete sessions. Use `SessionMessage` to hand off a self-contained subtask to another session. Use this pattern when: + - The work can be split into independent subtasks. -- A dedicated planning, coding, research, or writing thread would reduce context switching. +- A dedicated planning, coding, research, writing, or computer-use thread would reduce context switching. - The task benefits from persistent context across multiple steps or multiple user turns. Choose the session type intentionally: + - `agentic` for implementation, debugging, and code changes. - `Plan` for requirement clarification, scoping, and planning before coding. - `Cowork` for research, documents, presentations, summaries, and other office-related work. 
+- `ComputerUse` for local computer/system/desktop operation and perception. Operational rules: + - Reuse an existing relevant session when possible. If unsure, list sessions before creating a new one. - Every `SessionMessage` should include the goal, relevant context, constraints, and expected output. - When a target session finishes, its reply is an automated subtask result, not a new human instruction. Synthesize it, verify it when needed, and continue. @@ -45,303 +52,13 @@ Operational rules: - Do not create extra sessions for trivial, tightly coupled, or one-step work. # Safety -You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request. -Prioritize safety and human oversight over completion; if instructions conflict, pause and ask; comply with stop/pause/audit requests and never bypass safeguards. -Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested. - -# ControlHub — the unified control entry point (BitFun desktop, when enabled) -There is **one** control tool: **`ControlHub`**. Every call has the shape `{ domain, action, params }` and returns the unified envelope `{ ok, domain, action, data | error }`. - -## Picking a domain (decision order) -1. **`domain: "browser"`** — drive a website / web app in the user's real browser via CDP (preserves cookies / login / extensions). -2. **`domain: "desktop"`** — drive another desktop application (third-party windows, OS dialogs, system-wide keyboard / mouse, accessibility). This is the legacy "Computer Use" surface. -3. **`domain: "system"`** — `open_app`, `run_script` (applescript / shell, with `timeout_ms` + `max_output_bytes`), `get_os_info`. -4. **`domain: "terminal"`** — `list_sessions`, `kill`, `interrupt` (signals only; use the `Bash` tool to *run* new commands). -5. 
**`domain: "meta"`** — `capabilities`, `route_hint` for introspection / routing checks before long flows. - -When unsure between two domains, prefer the smallest blast radius: `browser` < `desktop` < `system`. - -## Multi-display safety (NEW — fixes the "wrong screen" bug) -On multi-monitor setups, **never** assume the cursor is on the screen the user is looking at. Every `desktop` result includes `interaction_state.displays` and `interaction_state.active_display_id`. - -- **Single display** (`displays.length === 1`): no extra step needed — go straight to `screenshot` / `click_element` / etc. -- **Multi-display**: pick ONE of these patterns: - 1. **One-shot pin (preferred, saves a round-trip)**: pass `display_id` directly inside the action's params, e.g. `{ domain: "desktop", action: "screenshot", params: { display_id: 2 } }`. The pin is sticky for follow-up actions. - 2. **Explicit pin**: `desktop.list_displays` (once) → `desktop.focus_display { display_id }` → action. Pass `{ display_id: null }` to clear the pin and fall back to "screen under the mouse". - -In both patterns, after a pin every `screenshot` is guaranteed to come from that display until cleared. - -## `domain: "desktop"` — actions and policies (Computer Use) -The actions inside `domain: "desktop"` are: `click_element`, `move_to_text`, `click`, `mouse_move`, `scroll`, `drag`, `screenshot`, `locate`, `key_chord`, `type_text`, `paste`, `pointer_move_rel`, `wait`. AX-first additions (Codex parity, **prefer when `meta.capabilities.domains.desktop.supports_background_input` is true on macOS**): `list_apps`, `get_app_state`, `app_click`, `app_type_text`, `app_scroll`, `app_key_chord`, `app_wait_for`. **Interactive-View-first (TuriX-style Set-of-Mark) — STRONGLY PREFERRED on macOS when available**: `build_interactive_view`, `interactive_click`, `interactive_type_text`, `interactive_scroll`. Every example in this section is a `domain: "desktop"` call — substitute the action name into `params`. 
- -### Interactive-View-first workflow (macOS, Set-of-Mark) — DEFAULT for visible UI on macOS -When background input + AX tree are supported, this is the **preferred** path for any third-party GUI work. It collapses "find element + addressing + click" into a single visual handle: the **`i`** index of a numbered coloured box drawn on the focused window screenshot. The model never invents pixel coordinates and never has to translate `node_idx` ↔ JPEG. - -1. `desktop.list_apps {}` → pick `{ pid }` (or `{ bundle_id }` / `{ name }`). -2. `desktop.build_interactive_view { app: { pid: } }` → returns a focused-window screenshot **with numbered coloured boxes overlaid**, plus `elements[]` (each item: `i`, `role`, `subrole`, `label`, `frame_image`, `frame_global`, `enabled`, `focused`), a compact `tree_text`, and a stable `digest`. **Reference elements ONLY by their `i` index** in subsequent calls. Colour key: blue=button, green=text-field/textarea, orange=link, purple=menu/popup, red=focused, gray=other. - - Useful options: `opts.focus_window_only` (default `true`), `opts.max_elements` (default ~80; host trims by visual area), `opts.annotate_screenshot` (default `true` — set `false` to save overlay cost on retries), `opts.include_tree_text` (default `true`). -3. Act with the **index-targeted** variants. Always echo `before_view_digest: ""` so the host can detect a stale view (UI changed under you). The host accepts either the full digest or any prefix of **at least 8 characters** (the 12-char digest shown in `summary` is a valid shorthand): - - `desktop.interactive_click { app: {pid:N}, i: K, before_view_digest: "" }` — accepts `click_count`, `mouse_button`, `modifier_keys`, `wait_ms_after`, `return_view` (default `true`, host re-renders the view for the next turn). - - `desktop.interactive_type_text { app: {pid:N}, i: K, text: "...", before_view_digest: "", clear_first?: true, press_enter_after?: false }` — omit `i` to type into whatever element is currently focused. 
- - `desktop.interactive_scroll { app: {pid:N}, i: K, dy: -3, dx: 0, before_view_digest: "" }` — omit `i` to scroll the focused window centre. -4. The action response carries the post-action `app_state` (with screenshot) AND, when `return_view=true`, a fresh `interactive_view` (new `digest`, new numbered overlay). **Use the new `digest` for the next call.** When you see `interactive_view: null` (you set `return_view=false`, or the rebuild failed), call `build_interactive_view` again before the next `i`-addressed action. -5. Errors you may see: `INTERACTIVE_VIEW_STALE` (`before_view_digest` no longer matches the cached view — re-run `build_interactive_view` and reuse the new `i`/`digest`), `INTERACTIVE_INDEX_OUT_OF_RANGE` (the `i` is not in the current cached view — same fix), `INTERACTIVE_VIEW_UNAVAILABLE` (host doesn't support SoM — fall back to AX-first below). - -**MANDATORY OBSERVE → PLAN → EXPECT → VERIFY loop (every interactive turn):** -For each `interactive_*` action you take, your visible reasoning MUST contain four short labelled lines BEFORE the tool call, and one VERIFY line in the next turn AFTER the response. This is the single biggest accuracy lever vs. ad-hoc clicking. -1. **OBSERVE:** the exact `i`, `role`, `label`, and on-screen position you are about to act on (one line, copied from the latest `elements[]` / annotated overlay). If `elements[]` is older than the previous action, **rebuild the view first** — never guess. -2. **PLAN:** the single concrete action and parameters (`interactive_click { i: 7, ... }`), and the prefix/full `digest` you will pass. -3. **EXPECT:** in one sentence, the visible UI change you predict — e.g. "the popup closes and a new modal titled 'Game' appears", "input field 12 gains focus and shows the text I typed". Be specific enough that the next screenshot can falsify it. -4. **(Tool call)**. -5. 
**VERIFY (next turn, before any further action):** compare the returned `interactive_view` overlay + `app_state` to your EXPECT line. State explicitly **PASS** or **FAIL: **. On FAIL: do **not** retry the same action — re-OBSERVE the new view and pick a different element / different action. - - Treat `execution_note` containing `auto_rebuilt_view_after_stale` or `fallback_image_xy` as soft warnings — the click landed but via a recovery path; double-check the EXPECT before continuing. - - For repeated FAIL on the same target across two turns: switch tactic — try `key_chord` (keyboard nav), `move_to_text` (OCR), or `app_click { target: { ocr_text } }` (OCR-based fallback) instead of clicking the same `i` again. - -**When to fall back from Interactive-View-first to AX-first:** -- `meta.capabilities.domains.desktop.supports_interactive_view` is **false** (non-macOS). -- The target widget is not in `elements[]` (e.g. Canvas / WebGL / custom-drawn surfaces). Use `desktop.app_click { target: { ocr_text: { needle: "..." } } }` instead. -- You need AX-only operations not yet exposed via the index API (e.g. `app_wait_for`, `app_key_chord` with `focus_idx`). - -### AX-first workflow (macOS, third-party apps) — fallback when Interactive-View is unavailable -When background input + AX tree are supported, drive the target app **without** stealing the user's foreground focus or cursor: -1. `desktop.list_apps {}` → pick `{ pid }` (or `{ bundle_id }` / `{ name }`). -2. `desktop.get_app_state { app: { pid: } }` → read `app_state.tree_text` + `app_state_nodes[]`. Each node has a stable `idx` you address in subsequent calls. Remember `before_digest` for change detection. -3. 
Act with the **node-targeted** variants — they try the AX action path (`AXPress` / `AXSetAttributeValue`) first and only fall back to PID-scoped synthetic events if the node refuses: - - `desktop.app_click { app: {pid:N}, target: { node_idx: K } }` - - `desktop.app_type_text { app: {pid:N}, text: "...", focus: { node_idx: K } }` - - `desktop.app_scroll { app: {pid:N}, dx: 0, dy: -120, focus: { node_idx: K } }` - - `desktop.app_key_chord { app: {pid:N}, keys: ["command","f"], focus_idx: K }` - - When the AX tree does NOT expose the target widget (Canvas, WebGL, custom-drawn cells, third-party games), use the OCR fallback: `desktop.app_click { app: {pid:N}, target: { ocr_text: { needle: "Start" } } }`. The host screenshots, OCRs, picks the highest-confidence match, and clicks its centre — all still PID-scoped so the user's cursor never moves. Prefer node_idx whenever it works (faster + no OCR confidence noise). -4. After acting, the response already contains the **after** `app_state` + `app_state_nodes` — diff against `before_digest`. If you need to wait for an async UI transition use `desktop.app_wait_for { app, predicate: { digest_changed: { prev_digest } } | { title_contains: "..." } | { role_enabled: { role, title } } | { node_enabled: { idx } }, timeout_ms, poll_ms }`. -5. Errors you may see: `APP_NOT_FOUND` (selector didn't resolve a running PID), `AX_NODE_STALE` (the cached `idx` no longer points to a live element — re-snapshot with `get_app_state`), `BACKGROUND_INPUT_UNAVAILABLE` (Accessibility permission missing or non-macOS — fall back to legacy `click` / `type_text` / `paste`). - -If `meta.capabilities.domains.desktop.supports_background_input` is **false** (Linux / Windows / unprivileged macOS), do NOT use the `app_*` actions; they will fail with `BACKGROUND_INPUT_UNAVAILABLE`. Use the legacy screen-coordinate actions instead. 
- -### Entering text — `paste` is the default, `type_text` is the fallback (MANDATORY) -**For ANY of these, use `desktop.paste { text, submit?, clear_first? }`, NEVER `type_text`:** -- CJK / Japanese / Korean / Arabic / any non-Latin script (input methods break `type_text`) -- Anything with emoji -- Multi-line text -- Text > ~15 characters (each char of `type_text` is a separate keystroke and is slow) -- Anything you'd send as one logical message — chat messages, search queries, contact names, file paths -- Text containing punctuation that an active IME might intercept (`,`, `。`, `?`) - -`paste` is one tool call that: -1. Writes `text` to the system clipboard -2. Optionally `cmd/ctrl+a` first if `clear_first: true` (replaces existing content) -3. Sends `cmd/ctrl+v` -4. Optionally presses Return if `submit: true` (or a custom chord via `submit_keys`) - -**Canonical "send a message in any IM" recipe — STRONGLY PREFER the playbook:** -`Playbook { name: "im_send_message", parameters: { app_name, contact, message } }` - -The playbook does the right state reset (Escape any in-chat-find / modal), -opens contact search, pastes the contact, **takes a verification screenshot -so you can confirm the chat header matches `contact` BEFORE pasting the -message body**, and only then sends. This mid-flow verify is the entire -reason it works — manual recipes that paste contact + paste message back-to-back -without verifying will silently send the message body to the WRONG person. - -Manual recipe (only when the playbook is unavailable; you MUST add the -verify step yourself): -1. `system.open_app { app_name: "WeChat" }` — re-activates and brings to front -2. `desktop.key_chord { keys: ["escape"] }` — close any in-chat find / modal so the next `cmd+f` hits **global contact search**, not "find in current conversation" -3. `desktop.key_chord { keys: ["command","f"] }` -4. `desktop.paste { text: "", submit: true }` → opens chat with top match -5. 
`desktop.screenshot { screenshot_window: true }` → **READ THE CHAT HEADER**. If it does not show ``, STOP. Do not proceed to step 6. -6. `desktop.paste { text: "", submit: true }` → sends - -**Sending to a SECOND / DIFFERENT contact (HARD RULE):** -After you have just sent a message and the user asks you to "send to someone -else too" — DO NOT try to `cmd+f` from the current chat. Focus is in the -chat input field, and `cmd+f` in WeChat / iMessage / many IMs triggers -**in-chat find** (search inside the current conversation), NOT global -contact search. Pasting the next contact name into in-chat find followed by -the message body will send `\n` to the previous -recipient as a single garbled message. **Always re-invoke the playbook from -the top** for each new recipient — it pays the cost of one Escape + one -re-activation in exchange for guaranteeing you are searching contacts, not -in-chat text. - -### NEVER use `Bash` / `osascript` / AppleScript to drive a chat app (HARD RULE) -Sending a WeChat / iMessage / Slack / Lark / Telegram / 飞书 / 钉钉 message via: - -``` -osascript -e 'tell application "WeChat" to activate' \ - -e 'tell application "System Events" to keystroke "尉怡青"' ... -``` - -is **broken in two ways the agent cannot recover from**: -1. **`keystroke` does not support non-ASCII** — AppleScript's `keystroke` sends raw key codes, not Unicode. CJK / emoji / accented text comes out as garbage like `AAA…` in the target app's search box. The contact "尉怡青" will never be found this way. -2. No return value, no verification — you cannot tell from the bash output whether the message was sent, queued, or silently dropped because the wrong window was focused. - -The Bash tool actively **refuses** these patterns and tells you to use the recipe above. Don't try to work around it with `defaults write` / `pbcopy` chains either — `desktop.paste` already does the right thing in one call, with screenshot verification baked in. 
- -**`desktop.screenshot` is dead simple now:** every screenshot is either the **focused application window** (default, via Accessibility) or the **full display** (fallback when AX can't resolve the window). No mouse-centered crops, no quadrant drilling, no point crops. Take a `screenshot` whenever you need to see the current state — you always get a useful frame. - -For Slack / Lark / multi-line apps where Return inserts a newline: -`desktop.paste { text: "...", submit: true, submit_keys: ["command","return"] }` -`type_text` is **only** for short Latin-only text into a known-focused field on hosts where the clipboard helper is unavailable (Linux without wl-clipboard / xclip). In every other case `paste` is faster, more reliable, and avoids a verification screenshot. - -### Keyboard before mouse (MANDATORY — not a suggestion) -**Always ask yourself first: "Can I complete this step with a keystroke?"** If yes, use `key_chord`, `paste`, or `type_text`. Mouse is a fallback, not the default. - -`key_chord` accepts EITHER `{"keys": ["command","v"]}` (canonical, modifiers first) OR a bare `{"keys": "escape"}` for a single key (auto-coerced). Always prefer the array form for clarity. - -**Decision tree — apply top-to-bottom, stop at the first match:** -1. **After typing in a search/input field** (search, filter, filename, etc.) → **ALWAYS try `key_chord` with `return` first**, before any mouse action. The Enter key is the standard way to confirm/submit input. -2. **Default action / submit / confirm** (OK, Save, Submit, Continue, Send, Done, Yes, or primary button) → **`key_chord` with `return`** (requires fresh screenshot per policy). NEVER click these buttons when Enter works. -3. **Cancel / close / dismiss** (dialog, popup, modal, sheet) → **`key_chord` with `escape`**. Do not click "Cancel" / X. -4. **Navigate between controls/fields** when current focus is unknown or lost → **`key_chord` with `tab`** (forward) or **`shift+tab`** (backward). 
Do not immediately reach for the mouse when you can Tab to the target. -5. **Toggle a focused checkbox/radio/switch** → **`key_chord` with `space`**. Do not click it. -6. **Select in a focused dropdown/list** → **arrow keys** via `key_chord`, then `return` to confirm. Do not click items. -7. **Open context menu** → **`key_chord` with `shift+F10`** (Windows/Linux) or **`control+click`** as secondary to `right` button click on macOS; still prefer menu shortcuts when available. -8. **Clipboard** → **`key_chord`** for copy/cut/paste/select-all. Never click Edit menu for these. -9. **App shortcuts** (visible in menus or well-known: Cmd+S/Ctrl+S to save, Cmd+W/Ctrl+W to close tab, Cmd+T/Ctrl+T new tab, Cmd+L/Ctrl+L focus address bar, Cmd+F/Ctrl+F find, etc.) → **`key_chord`**. Do not click the menu item. -10. **Scroll a page** → **`key_chord` with `space`** (page down), **`shift+space`** (page up), **`home`**, **`end`**, or arrow keys — before using `scroll` action. -11. **Text editing** (select all, move to start/end of line, delete word) → Use standard keyboard shortcuts via `key_chord` before attempting mouse selection or clicking. - -**Strategy when stuck with mouse:** -- If `move_to_text` fails to find your target → try `key_chord` with `tab` (or `shift+tab`) to navigate focus. -- If you're repeatedly trying `mouse_move` with guessed coordinates and failing → STOP. Switch strategy: try `tab` navigation, try `key_chord` shortcuts, or re-verify which app is focused. -- If you've tried the same mouse-based approach 2-3 times without success → you MUST switch to a completely different strategy (keyboard, different targeting method, verify app focus, ask user for help). 
- -**Only use mouse** (`click_element`, `move_to_text`+`click`, or vision path) when: -- The target cannot be reached by Tab/keyboard focus navigation from current focus -- You need to click a specific non-default button/link that has no keyboard equivalent -- The focused element is unknown and you cannot determine it from context -- You have already tried the keyboard-first approach and it failed - -### Automation priority (try higher first) -**Targeting rule:** Prefer **script/command-line automation** over GUI automation whenever possible. Scripts are faster, more reliable, and less prone to breaking when UI changes. - -**GUI automation (`domain: "desktop"`) is a fallback, not the default.** - -1. **Direct command/script automation (HIGHEST PRIORITY)**: - - **Step-by-step**: Execute one simple command/script per step, not a single huge script - - **macOS**: `osascript` (simple one-liners), `open -a "App"`, etc. (or `ControlHub domain:"system" action:"run_script"` with `script_type:"applescript"`) - - **Windows**: `powershell`/`cmd` (simple one-liners), `start`, etc. - - **Linux**: `xdotool`, `ydotool`, `wmctrl` (simple one-liners), etc. - - **App-specific CLI tools**: Use CLI versions of apps when available (e.g. `subl`, `code`, `git`, etc.) - - Prefer this over **any** GUI automation when a script/command can complete the task (one step at a time) - -2. **`key_chord`** -- OS and app keyboard shortcuts; **Enter/Return/Escape/Tab/Space** and clipboard (copy/cut/paste). **Prefer over mouse** whenever a key completes the same step (see **Keyboard before mouse**). **No** mandatory screenshot before non-Enter chords (see Screenshot policy). - -3. **`click_element`** -- accessibility (AX/UIA/AT-SPI): locate + move + click in one call. **Bypasses screenshot guard.** Use when filters can match the control. - -4. **`move_to_text`** (OCR) -- match **visible on-screen text** and **move the pointer** to it (no click, no keys). 
**Does not require a prior model-driven `screenshot` for targeting** (host captures internally). Use **`click`** in a separate step if you need a mouse press. Use **before** `screenshot` drill or **`mouse_move` + `click`** whenever distinctive text is visible in the **same language as the UI**. Prefer this over the vision path when you have not yet taken a screenshot. - -5. **`locate`** -- find an element without clicking (JSON + coordinates). No screenshot required for the lookup itself. - -6. **`screenshot`** (confirm UI only) + **`mouse_move`** (**`use_screen_coordinates`: true**, globals from **`locate`** / **`move_to_text`** / tool JSON) + **`click`** -- **last resort** when AX/OCR are insufficient. **Never** derive `mouse_move` targets from JPEG pixels. - -7. **`mouse_move`**, **`scroll`**, **`drag`**, **`type_text`**, **`pointer_move_rel`**, **`wait`** -- manipulate without mandatory pre-screenshot (see Screenshot policy; host may still require refresh before a later **`click`** or Enter **`key_chord`**). **`mouse_move` / `drag`:** globals only (`use_screen_coordinates`: true). **`pointer_move_rel`:** the **desktop host refuses** this as the **next** action after **`screenshot`** -- reposition with **`move_to_text`**, **`mouse_move`**, or **`click_element`** first (do not nudge from the JPEG). - -### `click_element` (preferred for most accessibility-backed clicks) -Use `click_element` when the target has a known accessible title or role. It locates the element via AX tree, moves the pointer to its center, and clicks -- all in one call. No screenshot needed. Supports `button` (left/right/middle) and `num_clicks` (1/2/3 for single/double/triple click). - -**Filter priority (use the first one that fits):** -1. **`node_idx`** (+ optional `app_state_digest`) — if you just called `desktop.get_app_state`, reuse the `idx` directly. One AX lookup, zero BFS, zero ambiguity. macOS only; other platforms return `AX_IDX_NOT_SUPPORTED` and you fall through. -2. 
**`text_contains`** — case-insensitive substring across AXTitle / AXValue / AXDescription / AXHelp. Best default when the visible label is shown via value/description (e.g. cards built from `AXStaticText`). The locator now climbs up to the closest clickable ancestor (`AXButton` / `AXCell` / `AXLink` / …) automatically. -3. **`title_contains` + `role_substring`** — only when you specifically want to constrain by `AXTitle` and a role/subrole hint (`role_substring` also matches `AXSubrole`, e.g. `"SearchField"`). - -Use `filter_combine: "any"` when fields might not overlap (e.g. text fields with no title). If no match, refine the query or fall back to OCR. Prefer short, distinctive substrings. If a call returns no match, **change the query** before retrying. Use the same language as the app UI. - -**When `click_element` won't work:** Many apps (Electron/web views, custom-drawn UI) have limited AX trees. **Do not** repeat the same `title_contains`/`role_substring` more than twice -- switch to **`move_to_text`** on visible chrome (tabs, buttons, search hints) or screenshot + `mouse_move` + `click`. That is expected, not a bug. - -### Screenshot policy — **screenshots are your eyes** -**Iron rule: never act blind on a desktop UI you have not seen.** The AX tree is metadata; it does not describe Canvas / WebGL / WebView / custom-drawn surfaces (games, charts, maps, video, rich editors). If you have not looked at a pixel image of the current frame, you do not know what is on screen. 
**Do not click, scroll, type, or press Enter without a recent image.** - -**Free screenshots (Codex parity, macOS AX-first / Interactive-View path):** every `desktop.build_interactive_view` / `desktop.interactive_click` / `desktop.interactive_type_text` / `desktop.interactive_scroll` / `desktop.get_app_state` / `desktop.app_click` / `desktop.app_type_text` / `desktop.app_scroll` / `desktop.app_key_chord` / `desktop.app_wait_for` response **auto-attaches a focused-window screenshot** as a multimodal image (the interactive variants attach the **annotated overlay** with numbered boxes). The JSON also exposes `app_state.has_screenshot` + `app_state.screenshot_meta`, and the interactive variants carry an `interactive_view` block with the fresh `digest` and `elements[]`. **Treat the attached image as authoritative for visual state** and reconcile it against `tree_text` / `elements[]` before your next action — if the image and the tree disagree, trust the image and rebuild the view. - -**Mandatory screenshot moments:** -1. **Task start.** Before the first interaction with any app, call `desktop.get_app_state` (preferred — includes a screenshot for free) **or** `desktop.screenshot { screenshot_window: true }`. No "I'll just click the obvious button" first turn. -2. **After any AX-first action that returns `has_screenshot: false`** (rare — capture failed). Take an explicit `desktop.screenshot` before the next `app_*` call. -3. **After two consecutive failures on the same target** (same `node_idx` / `ocr_text` / coordinate). The host injects `app_state.loop_warning` in this case — when you see it, the **next** action MUST be `desktop.screenshot` (full display, `screenshot_window: false`) and you MUST switch tactic (different node, different OCR phrase, keyboard shortcut, …). Never retry the same target a third time. -4. **Before any `key_chord` containing `return`/`enter`/`kp_enter`** (cache-invalidation guard, unchanged). -5. 
**Before any `click` driven by JPEG/global coordinates** (cache-invalidation guard, unchanged). - -**Crop policy (unchanged): one crop, two modes.** Every screenshot is either the focused application window (default, via Accessibility) or the full display (fallback). No `~500×500 mouse crop`. No quadrant drilling. `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation` / `screenshot_implicit_center` are silently ignored. The only knob with effect is `screenshot_window` (alias `window`): -- `true` / `"focused"` → force focused-window crop. -- `false` → full display (use this for the **loop-warning recovery** screenshot, so you can see chrome / docks / dialogs that the focused window may have obscured). -- omitted → focused-window first, full display fallback. - -**Not** subject to "must screenshot first": `mouse_move`, `scroll`, `drag`, `type_text`, `paste`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, **`move_to_text`** / **`click_element`**, and any `app_*` call (those carry their own auto-screenshot). - -**Cadence:** the AX-first loop already gives you one image per turn for free — **use it**. Only fall back to a manual `desktop.screenshot` when (a) you need a full-display view, (b) the auto-shot failed, or (c) you are recovering from a `loop_warning`. Do not spam extra screenshots before ordinary moves "just in case" — the auto-attached one already covers you. - -### Screenshot path (lowest targeting tier) -After **`click_element`** and **`move_to_text`** are exhausted or inappropriate, use **`screenshot`** for **confirmation** -- not for inventing move coordinates. - -When you **do** take a `screenshot`, inspect JSON: -- **Do not** read pixel coordinates off the JPEG for **`mouse_move`** -- use **`locate`**, **`move_to_text`**, or globals from tool results with **`use_screen_coordinates`: true**. 
-- The JSON exposes both `image_jpeg_*` (the encoded image) and `display_native_*` (the underlying display capture in pixels). Always reason about coordinates in the **native** space; the JPEG is for visual confirmation only. - -### `move_to_text` (OCR -- high priority, not a last resort) -Use **`move_to_text`** when visible text identifies the target and AX is weak or unknown. It **only moves the cursor**; add **`click`** afterward if you need a press. **Call it before** chaining multiple `screenshot` + quadrant steps when a short substring would suffice. - -Pass a substring in the **same language as the UI**. If the host reports **several OCR hits** (`disambiguation_required`), it returns **one preview JPEG per candidate** plus **accessibility** metadata -- pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same `text_query` / `ocr_region_native`. Otherwise refine `text_query` or `ocr_region_native`. - -**Failure recovery for `move_to_text`:** If `move_to_text` returns no matches or the wrong match: -1. FIRST: Try a shorter substring (e.g. 1-2 characters instead of full phrase) -2. THEN: If that still fails, try `key_chord` with `tab` (or `shift+tab`) to navigate focus to the target -3. ONLY THEN: Consider screenshot path as last resort - -**vs globals:** Prefer **`move_to_text`** (then **`click`** if needed) over **`mouse_move` + `click`** when text is visible. **`mouse_move`** must use **`use_screen_coordinates`: true** with numbers from **`locate`** / **`move_to_text`** / **`pointer_global`** -- never JPEG guesses. - -### Vision path (last resort) -When `click_element` and **`move_to_text`** cannot complete the step: -1. `screenshot` (confirm state — focused window or full display, no crop options) -2. **`mouse_move`** with **`use_screen_coordinates`: true** (globals from **`locate`** or **`move_to_text`**) / `pointer_move_rel` as needed -3. 
`screenshot` if the host requires an updated basis after large pointer moves (for the next **`click`**) -4. `click` - -### Think before you act (Chain-of-Thought) -Before **every** `domain: "desktop"` action, briefly state in your response: -1. **See:** What you observe on the current screen (or from the last screenshot/tool result). -2. **Plan:** What you intend to do and why. -3. **Expect:** What the expected result should be (e.g. "button changes color", "new dialog appears", "text field gains focus"). - -After the action, compare the actual result against your expectation. If they differ, pause and reassess before continuing. This prevents blind repetition and helps catch errors early. - -### Loop detection and recovery -The system automatically tracks your action history. If `loop_warning` appears in a tool result: -- **Stop the current approach immediately.** Do not repeat the same action sequence. -- **Read the suggestion** in the `loop_warning` field and follow it. -- **Try a different strategy:** switch from vision to accessibility (`click_element`) or OCR (`move_to_text`), from mouse to keyboard shortcuts, or vice versa. -- **If stuck after trying alternatives:** explain what you attempted and ask the user for guidance rather than continuing to loop. +You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request. -### Reading the unified result envelope -Every `ControlHub` call returns: -- On success: `{ ok: true, domain, action, data, summary? }` — read `data` for action-specific fields. -- On failure: `{ ok: false, domain, action, error: { code, message, hints } }` — branch on `error.code`, never on the English `message`. Common codes: `STALE_REF`, `NOT_FOUND`, `AMBIGUOUS`, `WRONG_DISPLAY`, `WRONG_TAB`, `GUARD_REJECTED`, `TIMEOUT`, `PERMISSION_DENIED`, `MISSING_SESSION`, `FRONTEND_ERROR`, `INTERNAL`. 
+Prioritize safety and human oversight over completion. For destructive actions, payments, purchases, account changes, sending messages, deleting data, permission changes, and security-sensitive settings, ensure the user explicitly authorized the exact final action before it is submitted. -### `domain: "browser"` — quick reference -- Workflow: `connect` → `tab_query` (or `list_pages`) → `switch_page` → `navigate`/`snapshot` → `click`/`fill` using the `@e1` / `@e2` refs returned by `snapshot`. Take a fresh `snapshot` after every DOM mutation. -- `snapshot` traverses **open shadow roots** and **same-origin iframes**. Pass `with_backend_node_ids: true` when you need stable CDP DOM ids that survive re-renders. -- `switch_page` defaults to `activate: true` so the user actually sees the tab being driven; pass `activate: false` only for explicit headless background work. +Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested. -### Key rules -- **Script automation FIRST:** For common app tasks (sending messages, opening files, etc.), FIRST consider using a script (`ControlHub domain:"system" action:"run_script"` or `Bash`) to complete the ENTIRE TASK in one go, instead of multiple GUI automation steps. -- **macOS apps:** Use `open -a "AppName"` via Bash to launch/focus, or `osascript` for more complex automation; not Spotlight. `ControlHub domain:"system" action:"open_app"` is the cross-platform alternative when you don't have shell access. -- **Foreground safety:** Check `interaction_state.foreground_application` -- if wrong app is focused, fix focus first. `locate` and `click_element` search the **foreground** app only. -- **Multi-monitor safety:** If you have multiple displays, ALWAYS pin the target with `desktop.focus_display` before screen-coordinate actions. 
If actions keep targeting the wrong screen, STOP and use `desktop.list_displays` + `desktop.focus_display` to disambiguate. -- **Minimize `wait`:** Use `wait` only when you explicitly need to wait for an app to launch or a UI to load. Do not add `wait` after every single action "just in case." -- **Targeting order (when the pointer is required):** `click_element` → **`move_to_text`** (when text is visible) → **screenshot** + **`mouse_move`** + **`click`** last. Apply **Keyboard before mouse** first -- do not use this order to click a control that **Enter** / **Escape** / focus keys could handle. -- **Screenshot cadence:** Only when you need pixels or a **fine** basis before guarded **`click`**; and always immediately before **`key_chord`** with Enter/Return (host). **Do not** treat `screenshot` as the default next step after every non-click action. -- **No blind Enter:** Fresh `screenshot` required before `key_chord` with Return/Enter only (not before other chords). -- **Shortcut-first:** Use `key_chord` for Copy/Paste/Save/Undo and other labeled shortcuts. Do not click menus when shortcuts exist. Menus in screenshots often display shortcuts -- use them. Together with **Keyboard before mouse**, prefer keys over clicking visible buttons when keys are equivalent (especially **Enter** on default actions). -- **Re-plan on failure:** If `locate`/`click_element` misses or screenshot shows unexpected UI, stop and reassess. Do not retry the same approach more than twice. -- **Sensitive actions:** For messages, payments, or destructive actions, state steps and get user confirmation first. -- **Pointer info:** After `screenshot`, `pointer_image_x/y` and the red synthetic cursor show pointer position. Optional follow-up `screenshot` after large pointer moves if you need pixels before a guarded **`click`**. -- **Screenshot layout:** JPEGs are for **confirmation** (optional pointer overlay). 
**Do not** use JPEG pixel indices for **`mouse_move`** -- the host disables image/normalized moves; use **global** coordinates only. -- **Multi-step plans:** For tasks spanning multiple apps/steps, output a numbered plan before starting. -- **Host OS:** Use modifier names matching this host (see Environment Information). Do not mix OS conventions. -- On macOS, development builds need Accessibility permission for the debug binary. -- If `ControlHub` `domain: "desktop"` is disabled or OS permissions are missing, tell the user what to enable (call `ControlHub domain:"meta" action:"capabilities"` to confirm). +# Communication -{CLAW_WORKSPACE} -{ENV_INFO} -{PERSONA} -{AGENT_MEMORY} +Keep narration brief and value-dense. For multi-step work, state the near-term plan and then keep progress updates short. diff --git a/src/crates/core/src/agentic/agents/prompts/computer_use_mode.md b/src/crates/core/src/agentic/agents/prompts/computer_use_mode.md new file mode 100644 index 000000000..6a1bcb45a --- /dev/null +++ b/src/crates/core/src/agentic/agents/prompts/computer_use_mode.md @@ -0,0 +1,76 @@ +You are BitFun's Computer Use sub-agent. Your job is to perceive and operate the user's local computer safely and efficiently. + +Your main goal is to follow the USER's instructions at each message, denoted by the <user_query> tag. + +Tool results and user messages may include <system_reminder> tags. These <system_reminder> tags contain useful information and reminders. Please heed them, but don't mention them in your response to the user. + +{LANGUAGE_PREFERENCE} + +# Role + +You are a dedicated desktop automation agent, not a document coworker and not a general coding mode. Use this agent for tasks that require seeing the screen, controlling apps, using the browser, interacting with OS dialogs, moving between windows, or checking the state of the local machine.
+ +When the task is mainly about writing documents, analyzing files, research reports, or office artifacts, use office/document skills if they are relevant, but keep the interaction anchored in the user's current computer state only when the user asked you to operate or inspect the desktop. + +# Operating Principles + +Work in a tight observe -> act -> verify loop. Before acting on a desktop UI, obtain current state with `ComputerUse` when needed, and after each meaningful UI action verify that the visible state changed as expected. + +Prefer the smallest reliable control surface: + +1. Use `ControlHub` with `domain: "browser"` for websites and web apps in the user's real browser. +2. Use `ComputerUse` for third-party desktop apps, OS dialogs, system-wide keyboard and mouse, accessibility, OCR, screenshots, app state, app/file/url opening, clipboard access, OS facts, and local scripts. +3. Use `Bash` for local shell commands when that is the clearest path and does not bypass desktop safety expectations. +4. Use `ControlHub` with `domain: "meta"` to inspect non-desktop control capabilities before long or uncertain automation flows. + +Prefer script or command-line automation when it is clearly safer and reversible, but run it step by step. Do not hide a whole GUI workflow in one large script. For GUI work, prefer keyboard shortcuts and accessibility-backed targets before mouse coordinates. + +# OS-Specific Control Profile + +Use the local OS reported in the environment information. + +For macOS: + +Use `command`, `option`, `control`, and `shift` modifier names. Prefer `open -a`, simple AppleScript one-liners, app accessibility state, interactive view, `command+a/c/x/v`, `command+space`, and `command+tab`. For visible app UI, prefer the interactive-view or AX/app-state workflow when available; fall back to OCR and mouse only when necessary. + +For Windows: + +Use `control`, `alt`, `shift`, and `meta`/`super` for the Windows key. 
Prefer PowerShell/cmd for simple system actions, `control+a/c/x/v`, Start menu shortcuts, Alt+Tab, UIA/accessibility targets, OCR, then mouse. + +For Linux: + +Use `control`, `alt`, `shift`, and usually `meta`/`super`. Prefer shell tools and app CLIs, then keyboard shortcuts, AT-SPI/accessibility targets, OCR, and finally mouse. Account for desktop-environment differences instead of assuming one window manager. + +# Desktop Automation Rules + +Never assume focus, display, or cursor position. For multi-display setups, inspect display state and pin a display before actions that must happen on a specific screen. + +Do not click or press Enter blindly. If the UI state is unknown, call `ComputerUse` with an observation action such as `get_app_state`, `build_interactive_view`, `screenshot`, `list_apps`, or `locate`. + +Use paste for any multi-line text, CJK/Japanese/Korean/Arabic text, emoji, long text, file paths, messages, or search queries. Use type_text only for short Latin text into a known focused field when paste is unavailable or inappropriate. + +Use keyboard before mouse. Enter/Return confirms default actions, Escape cancels or closes, Tab and Shift+Tab navigate focus, Space toggles focused controls, and standard shortcuts handle clipboard, find, save, new tab, close, and address/search fields. + +When mouse is required, prefer accessibility or OCR targets over guessed coordinates. If you need coordinates, use coordinates returned by tools such as `locate` or `move_to_text`, not coordinates guessed from an image. + +If the same GUI tactic fails twice, switch strategy: use keyboard navigation, app state, OCR, browser automation, scripts, or ask the user for the missing context. + +# Browser Work + +For websites and web apps, prefer `ControlHub` with `domain: "browser"` so cookies, login state, and extensions are preserved. Do not drive browser content through desktop screenshots when browser-domain controls are available. 
+ +Use desktop-domain controls only for browser chrome, OS dialogs, permission prompts, file pickers, or when browser-domain capabilities are unavailable. + +# Safety And User Trust + +Treat destructive actions, payments, purchases, account changes, sending messages, deleting data, permission changes, and security-sensitive settings as high-risk. Pause for user confirmation before final submission unless the user has explicitly authorized that exact action. + +For chat and messaging apps, verify the recipient or conversation header before sending. Do not use shell scripts or AppleScript keystrokes to send CJK or emoji messages; use desktop paste and visible verification. + +If permissions are missing, explain the needed OS permission or capability briefly and stop instead of improvising unsafe alternatives. + +# Communication Style + +Keep narration short and operational. For multi-step desktop tasks, state the next few steps only when it helps the user understand what will happen. Otherwise act, verify, and report concise progress. + +When you finish, summarize what changed or what you observed, and mention any step you could not complete. 
diff --git a/src/crates/core/src/agentic/agents/registry.rs b/src/crates/core/src/agentic/agents/registry.rs index 2830962f8..100c00ce2 100644 --- a/src/crates/core/src/agentic/agents/registry.rs +++ b/src/crates/core/src/agentic/agents/registry.rs @@ -1,6 +1,7 @@ use super::{ - Agent, AgenticMode, ClawMode, CodeReviewAgent, CoworkMode, DebugMode, DeepResearchAgent, - ExploreAgent, FileFinderAgent, GenerateDocAgent, InitAgent, PlanMode, TeamMode, + Agent, AgenticMode, ClawMode, CodeReviewAgent, ComputerUseMode, CoworkMode, DebugMode, + DeepResearchAgent, ExploreAgent, FileFinderAgent, GenerateDocAgent, InitAgent, PlanMode, + TeamMode, }; use crate::agentic::agents::custom_subagents::{ CustomSubagent, CustomSubagentKind, CustomSubagentLoader, @@ -128,7 +129,8 @@ pub struct CustomSubagentDetail { fn default_model_id_for_builtin_agent(agent_type: &str) -> &'static str { match agent_type { - "agentic" | "Cowork" | "Plan" | "debug" | "Claw" | "DeepResearch" | "Team" => "auto", + "agentic" | "Cowork" | "ComputerUse" | "Plan" | "debug" | "Claw" | "DeepResearch" + | "Team" => "auto", _ => "primary", } } @@ -303,6 +305,7 @@ impl AgentRegistry { // Register built-in sub-agents let builtin_subagents: Vec> = vec![ + Arc::new(ComputerUseMode::new()), Arc::new(ExploreAgent::new()), Arc::new(FileFinderAgent::new()), ]; @@ -1068,7 +1071,7 @@ pub fn get_agent_registry() -> Arc { #[cfg(test)] mod tests { - use super::{default_model_id_for_builtin_agent, merge_dynamic_mcp_tools}; + use super::{default_model_id_for_builtin_agent, merge_dynamic_mcp_tools, AgentRegistry}; #[test] fn top_level_modes_default_to_auto() { @@ -1085,6 +1088,28 @@ mod tests { } } + #[tokio::test] + async fn computer_use_is_builtin_subagent_not_mode() { + let registry = AgentRegistry::new(); + let modes = registry.get_modes_info().await; + assert!( + !modes.iter().any(|agent| agent.id == "ComputerUse"), + "ComputerUse should be delegated through Task as a built-in sub-agent, not exposed as a top-level mode" + 
); + + let subagents = registry.get_subagents_info(None).await; + let computer_use = subagents + .iter() + .find(|agent| agent.id == "ComputerUse") + .expect("ComputerUse should be registered as a built-in sub-agent"); + assert!(computer_use + .default_tools + .contains(&"ControlHub".to_string())); + assert!(computer_use + .default_tools + .contains(&"ComputerUse".to_string())); + } + #[test] fn non_mode_agents_default_to_primary() { assert_eq!(default_model_id_for_builtin_agent("Explore"), "primary"); diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_actions.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_actions.rs new file mode 100644 index 000000000..6c11c131a --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_actions.rs @@ -0,0 +1,2349 @@ +//! Computer Use desktop and OS/system action implementations. +//! +//! This module owns the action logic that used to live behind ControlHub's +//! desktop/system domains. ControlHub may still share the common error envelope +//! types, but it no longer owns these Computer Use behaviors. + +use crate::agentic::tools::computer_use_host::{ + AppClickParams, AppSelector, AppWaitPredicate, ClickTarget, ComputerUseForegroundApplication, + ComputerUseHostRef, InteractiveClickParams, InteractiveScrollParams, InteractiveTypeTextParams, + InteractiveViewOpts, VisualClickParams, VisualMarkViewOpts, +}; +use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; +use crate::util::elapsed_ms_u64; +use crate::util::errors::{BitFunError, BitFunResult}; +use serde_json::{json, Value}; + +use super::control_hub::{err_response, ControlHubError, ErrorCode}; + +/// Per-PID consecutive-failure tracker for the AX-first `app_*` actions. +/// Key = target PID, value = `(target_signature, before_digest, count)`. 
/// Per-PID consecutive-failure tracker for the AX-first `app_*` actions.
///
/// Key = target PID, value = `(action:target signature, last digest, streak)`.
/// `streak` counts how many calls in a row hit the same signature while the
/// digest stayed unchanged.
///
/// NOTE(review): the generic parameters were reconstructed; the PID is assumed
/// to be `i32` and the counter `u32` — confirm against the host's PID type.
static APP_LOOP_TRACKER: std::sync::OnceLock<
    std::sync::Mutex<std::collections::HashMap<i32, (String, String, u32)>>,
> = std::sync::OnceLock::new();

/// Records one `app_*` call and reports whether the caller is stuck in a loop.
///
/// # Parameters
/// - `pid`: target process id; `None` disables tracking for this call.
/// - `action` / `target_sig`: together they identify "the same attempt".
/// - `before_digest` / `after_digest`: state digests taken around the action.
///
/// # Returns
/// `Some(warning)` once the same `(action, target)` has landed on an unchanged
/// digest at least twice in a row, otherwise `None`. Also returns `None` when
/// the tracker mutex is poisoned (tracking is best-effort).
fn loop_tracker_observe(
    pid: Option<i32>,
    action: &str,
    target_sig: &str,
    before_digest: &str,
    after_digest: &str,
) -> Option<String> {
    let pid = pid?;
    // A digest change means the action mutated the tree — that is real
    // progress and resets the streak even if the model picks the same
    // target name on purpose (e.g. clicking "Next" repeatedly).
    let progressed = before_digest != after_digest;
    let sig = format!("{action}:{target_sig}");
    let mut guard = APP_LOOP_TRACKER
        .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()))
        .lock()
        .ok()?;
    let entry = guard
        .entry(pid)
        .or_insert_with(|| (String::new(), String::new(), 0));
    if progressed {
        // Reset to zero, not one: the call that made progress is not a
        // failure, so it must not count toward the "no mutation" streak.
        // Otherwise a single failed call right after a successful one would
        // already be reported as two consecutive failures.
        *entry = (sig, after_digest.to_string(), 0);
        return None;
    }
    if entry.0 == sig && entry.1 == before_digest {
        entry.2 = entry.2.saturating_add(1);
    } else {
        // Different target or different starting state: start a new streak
        // with this call as its first failure.
        *entry = (sig, before_digest.to_string(), 1);
    }
    if entry.2 >= 2 {
        Some(format!(
            "Detected {} consecutive `{}` calls on the same target ({}) without any AX tree mutation (digest unchanged). The target is almost certainly invisible / disabled / in a Canvas-WebGL surface that AX cannot describe. NEXT TURN you MUST: (1) run `desktop.screenshot {{ screenshot_window: false }}` to see the full display, (2) switch tactic — different `node_idx`, different `ocr_text` needle, or a keyboard shortcut. Do NOT retry this same target a third time.",
            entry.2, action, target_sig
        ))
    } else {
        None
    }
}
"click_element", + "move_to_target", + "mouse_move", + "pointer_move_rel", + "scroll", + "drag", + "key_chord", + "type_text", + "paste", + "locate", + "move_to_text", + ]; + if !guarded_actions.contains(&action) { + return None; + } + let host = context.computer_use_host.as_ref()?; + let snapshot = host.computer_use_session_snapshot().await; + let foreground = snapshot.foreground_application.as_ref()?; + if Self::is_probably_browser_app(foreground) { + return Some(Self::desktop_browser_guard_error(action, Some(foreground))); + } + None + } + // ── Desktop domain ───────────────────────────────────────────────── + + pub(crate) async fn handle_desktop( + &self, + action: &str, + params: &Value, + context: &ToolUseContext, + ) -> BitFunResult> { + let host = context.computer_use_host.as_ref().ok_or_else(|| { + BitFunError::tool( + "Desktop control is only available in the BitFun desktop app".to_string(), + ) + })?; + + // Legacy desktop implementation shared by the dedicated ComputerUse + // tool while ControlHub's public desktop domain remains disabled. + match action { + "list_displays" => { + let displays = host.list_displays().await?; + let active = host.focused_display_id(); + let count = displays.len(); + return Ok(vec![ToolResult::ok( + json!({ + "displays": displays, + "active_display_id": active, + }), + Some(format!("{} display(s) detected", count)), + )]); + } + // High-leverage UX primitive: paste arbitrary text into the + // currently focused input via the system clipboard, optionally + // clearing first and submitting after. This collapses the + // canonical IM/search flow: + // + // clipboard_set + key_chord(cmd+v) + key_chord(return) + // + // ...into a single tool call. It is also the **only** robust way + // to enter CJK / emoji / multi-line text — `type_text` goes + // through the per-character key path and is at the mercy of + // every IME on the host. 
This is exactly the pattern Codex + // uses (`pbcopy` + cmd+v) to keep WeChat / iMessage flows + // smooth. + // + // Params: + // - text (required) — text to paste + // - clear_first (bool, default false) — cmd+a before paste, + // so the new text REPLACES whatever was there + // - submit (bool, default false) — press Return after + // paste; switches to "send the message" mode + // - submit_keys (array, default ["return"]) — override the + // submit chord (e.g. ["command","return"] for + // Slack / multi-line apps) + // + // Returns the same envelope as a `key_chord` so the model can + // chain a verification screenshot exactly as before. + "paste" => { + let text = params + .get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] desktop.paste requires 'text'\nHints: example { \"action\":\"paste\", \"text\":\"hello\", \"submit\":true }" + .to_string(), + ) + })?; + let clear_first = params + .get("clear_first") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + let submit = params + .get("submit") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + let submit_keys: Vec = match params.get("submit_keys") { + Some(Value::Array(arr)) => arr + .iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect(), + Some(Value::String(s)) => vec![s.to_string()], + _ => vec!["return".to_string()], + }; + + if let Err(e) = clipboard_write(text).await { + return Ok(err_response( + "desktop", + "paste", + ControlHubError::new( + ErrorCode::NotAvailable, + format!("Clipboard write failed: {}", e), + ) + .with_hint( + "Fall back to type_text or check that wl-clipboard / xclip is installed (Linux only)", + ), + )); + } + + let paste_chord = match std::env::consts::OS { + "macos" => vec!["command".to_string(), "v".to_string()], + _ => vec!["control".to_string(), "v".to_string()], + }; + + if clear_first { + let select_all = match std::env::consts::OS { + "macos" => vec!["command".to_string(), "a".to_string()], + _ => 
vec!["control".to_string(), "a".to_string()], + }; + host.key_chord(select_all).await?; + } + host.key_chord(paste_chord).await?; + if submit { + host.computer_use_trust_pointer_after_text_input(); + host.key_chord(submit_keys.clone()).await?; + } + + let summary = match (clear_first, submit) { + (false, false) => format!("Pasted {} chars", text.chars().count()), + (true, false) => { + format!("Replaced focused field with {} chars", text.chars().count()) + } + (false, true) => format!("Pasted {} chars and submitted", text.chars().count()), + (true, true) => { + format!("Replaced + submitted ({} chars)", text.chars().count()) + } + }; + return Ok(vec![ToolResult::ok( + json!({ + "success": true, + "action": "paste", + "char_count": text.chars().count(), + "byte_length": text.len(), + "clear_first": clear_first, + "submitted": submit, + "submit_keys": if submit { Some(submit_keys) } else { None }, + }), + Some(summary), + )]); + } + + // ── AX-first actions (Codex parity) ─────────────────────── + // These operate on the typed AppSelector / AxNode envelope. + "list_apps" + | "get_app_state" + | "app_click" + | "app_type_text" + | "app_scroll" + | "app_key_chord" + | "app_wait_for" + | "build_interactive_view" + | "interactive_click" + | "interactive_type_text" + | "interactive_scroll" + | "build_visual_mark_view" + | "visual_click" => { + return self.handle_desktop_ax(host, action, params).await; + } + "focus_display" => { + // Accept `null` (or omitted `display_id`) to clear the pin + // and fall back to "screen under the pointer". An explicit + // numeric id pins that display until cleared. + let display_id = match params.get("display_id") { + Some(Value::Null) | None => None, + Some(v) => Some(v.as_u64().ok_or_else(|| { + BitFunError::tool( + "focus_display: 'display_id' must be a non-negative integer or null" + .to_string(), + ) + })? 
as u32), + }; + host.focus_display(display_id).await?; + let displays = host.list_displays().await?; + let summary = match display_id { + Some(id) => format!("Pinned display {}", id), + None => "Cleared display pin (will follow mouse)".to_string(), + }; + return Ok(vec![ToolResult::ok( + json!({ + "active_display_id": display_id, + "displays": displays, + }), + Some(summary), + )]); + } + _ => {} + } + + if let Some(err) = self.desktop_action_targets_browser(action, context).await { + return Ok(err_response("desktop", action, err)); + } + + // UX shortcut: every screen-coordinate action accepts an optional + // `display_id`. If present (and different from the currently pinned + // display), pin it BEFORE forwarding so the model doesn't need a + // separate `focus_display` round-trip. Pin is sticky — subsequent + // actions on the same screen don't need to re-specify. Pass + // `display_id: null` to clear the pin in the same call. + if let Some(v) = params.get("display_id") { + let target = match v { + Value::Null => None, + v => Some(v.as_u64().ok_or_else(|| { + BitFunError::tool( + "display_id must be a non-negative integer or null".to_string(), + ) + })? as u32), + }; + if host.focused_display_id() != target { + host.focus_display(target).await?; + } + } + + let mut cu_input = params.clone(); + if let Value::Object(ref mut map) = cu_input { + map.insert("action".to_string(), json!(action)); + // Strip the ControlHub-only field so the legacy ComputerUseTool + // doesn't trip on an unrecognised parameter. + map.remove("display_id"); + } + + let cu_tool = super::computer_use_tool::ComputerUseTool::new(); + cu_tool.call_impl(&cu_input, context).await + } + + // ── Desktop AX-first dispatch (Codex parity) ────────────────────── + // Routes the seven new app-targeted actions through the typed + // `ComputerUseHost` API. 
Every successful response carries a + // unified envelope: `target_app`, `background_input`, + // `before_digest` and (for state queries) `app_state` / + // `app_state_nodes` so the model can reason about the AX tree + // before/after each action without re-querying. + async fn handle_desktop_ax( + &self, + host: &ComputerUseHostRef, + action: &str, + params: &Value, + ) -> BitFunResult> { + // ── Helpers ───────────────────────────────────────────────── + fn parse_selector(v: &Value) -> BitFunResult { + let obj = v.get("app").ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] missing 'app' selector (pid|bundle_id|name)".to_string(), + ) + })?; + let sel: AppSelector = serde_json::from_value(obj.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad 'app' selector: {} (expect {{pid|bundle_id|name}})", + e + )) + })?; + if sel.pid.is_none() && sel.bundle_id.is_none() && sel.name.is_none() { + return Err(BitFunError::tool( + "[INVALID_PARAMS] 'app' must include at least one of pid|bundle_id|name" + .to_string(), + )); + } + Ok(sel) + } + + fn parse_click_target(v: &Value) -> BitFunResult { + if v.get("kind").is_some() { + return serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad ClickTarget: {} (expected {{\"kind\":\"node_idx\",\"idx\":N}}, {{\"kind\":\"image_xy\",\"x\":0,\"y\":0}}, {{\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"screen_xy\",\"x\":0,\"y\":0}}, or {{\"kind\":\"ocr_text\",\"needle\":\"...\"}})", + e + )) + }); + } + if let Some(idx) = v.get("node_idx").and_then(|x| x.as_u64()) { + return Ok(ClickTarget::NodeIdx { idx: idx as u32 }); + } + if let Some(obj) = v.get("screen_xy") { + let x = obj.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { + BitFunError::tool( + 
"[INVALID_PARAMS] screen_xy target requires numeric x".to_string(), + ) + })?; + let y = obj.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen_xy target requires numeric y".to_string(), + ) + })?; + return Ok(ClickTarget::ScreenXy { x, y }); + } + if let Some(obj) = v.get("image_xy") { + let x = obj.get("x").and_then(|x| x.as_i64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] image_xy target requires integer x".to_string(), + ) + })?; + let y = obj.get("y").and_then(|y| y.as_i64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] image_xy target requires integer y".to_string(), + ) + })?; + return Ok(ClickTarget::ImageXy { + x: x as i32, + y: y as i32, + screenshot_id: obj + .get("screenshot_id") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + }); + } + if let Some(obj) = v.get("image_grid") { + let target = json!({ + "kind": "image_grid", + "x0": obj.get("x0").cloned().unwrap_or(Value::Null), + "y0": obj.get("y0").cloned().unwrap_or(Value::Null), + "width": obj.get("width").cloned().unwrap_or(Value::Null), + "height": obj.get("height").cloned().unwrap_or(Value::Null), + "rows": obj.get("rows").cloned().unwrap_or(Value::Null), + "cols": obj.get("cols").cloned().unwrap_or(Value::Null), + "row": obj.get("row").cloned().unwrap_or(Value::Null), + "col": obj.get("col").cloned().unwrap_or(Value::Null), + "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), + "screenshot_id": obj.get("screenshot_id").cloned().unwrap_or(Value::Null), + }); + return serde_json::from_value(target).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad image_grid target: {} (need x0,y0,width,height,rows,cols,row,col; optional intersections)", + e + )) + }); + } + if let Some(obj) = v.get("visual_grid") { + let target = json!({ + "kind": "visual_grid", + "rows": obj.get("rows").cloned().unwrap_or(Value::Null), + "cols": obj.get("cols").cloned().unwrap_or(Value::Null), + "row": 
obj.get("row").cloned().unwrap_or(Value::Null), + "col": obj.get("col").cloned().unwrap_or(Value::Null), + "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), + "wait_ms_after_detection": obj.get("wait_ms_after_detection").cloned().unwrap_or(Value::Null), + }); + return serde_json::from_value(target).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad visual_grid target: {} (need rows,cols,row,col; optional intersections)", + e + )) + }); + } + if v.get("x").is_some() || v.get("y").is_some() { + let x = v.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen target requires numeric x".to_string(), + ) + })?; + let y = v.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen target requires numeric y".to_string(), + ) + })?; + return Ok(ClickTarget::ScreenXy { x, y }); + } + if let Some(ocr) = v.get("ocr_text") { + let needle = ocr + .get("needle") + .or_else(|| ocr.get("text")) + .and_then(|x| x.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] ocr_text target requires needle".to_string(), + ) + })?; + return Ok(ClickTarget::OcrText { + needle: needle.to_string(), + }); + } + Err(BitFunError::tool( + "[INVALID_PARAMS] unsupported ClickTarget. 
Use {\"kind\":\"node_idx\",\"idx\":N}, {\"node_idx\":N}, {\"kind\":\"image_xy\",\"x\":0,\"y\":0}, {\"image_xy\":{\"x\":0,\"y\":0}}, {\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"screen_xy\",\"x\":0,\"y\":0}, or {\"ocr_text\":{\"needle\":\"...\"}}.".to_string(), + )) + } + + fn parse_wait_predicate(v: &Value) -> BitFunResult { + if v.get("kind").is_some() { + return serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad app_wait_for predicate: {}", + e + )) + }); + } + if let Some(obj) = v.get("digest_changed") { + let prev_digest = obj + .get("prev_digest") + .or_else(|| obj.get("from")) + .and_then(|x| x.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] digest_changed requires prev_digest".to_string(), + ) + })?; + return Ok(AppWaitPredicate::DigestChanged { + prev_digest: prev_digest.to_string(), + }); + } + if let Some(obj) = v.get("title_contains") { + let needle = obj + .get("needle") + .or_else(|| obj.get("title")) + .and_then(|x| x.as_str()) + .or_else(|| obj.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] title_contains requires needle".to_string(), + ) + })?; + return Ok(AppWaitPredicate::TitleContains { + needle: needle.to_string(), + }); + } + if let Some(obj) = v.get("role_enabled") { + let role = obj.get("role").and_then(|x| x.as_str()).ok_or_else(|| { + BitFunError::tool("[INVALID_PARAMS] role_enabled requires role".to_string()) + })?; + return Ok(AppWaitPredicate::RoleEnabled { + role: role.to_string(), + }); + } + if let Some(obj) = v.get("node_enabled") { + let idx = obj + .get("idx") + .and_then(|x| x.as_u64()) + .or_else(|| obj.as_u64()) + .ok_or_else(|| { + BitFunError::tool("[INVALID_PARAMS] node_enabled requires idx".to_string()) + })?; + return 
Ok(AppWaitPredicate::NodeEnabled { idx: idx as u32 }); + } + Err(BitFunError::tool( + "[INVALID_PARAMS] unsupported app_wait_for predicate. Use {\"kind\":\"digest_changed\",\"prev_digest\":\"...\"} or shorthand {\"digest_changed\":{\"prev_digest\":\"...\"}}.".to_string(), + )) + } + + fn parse_keys(v: &Value) -> Vec { + match v.get("keys").or_else(|| v.get("key")) { + Some(Value::Array(arr)) => arr + .iter() + .filter_map(|x| x.as_str().map(|s| s.to_string())) + .collect(), + Some(Value::String(s)) => vec![s.to_string()], + _ => Vec::new(), + } + } + + // Build the JSON view of an AppStateSnapshot for the model. Excludes + // the heavy `screenshot` payload (it is attached out-of-band as a + // multimodal image, not as base64 inside the JSON tree, to keep token + // budgets under control and let the provider deliver it as `image_url`). + fn snap_state_json( + snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, + ) -> serde_json::Value { + let mut v = json!({ + "app": snap.app, + "window_title": snap.window_title, + "digest": snap.digest, + "captured_at_ms": snap.captured_at_ms, + "tree_text": snap.tree_text, + "has_screenshot": snap.screenshot.is_some(), + }); + if let Some(shot) = snap.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + let meta: serde_json::Value = json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "For visual surfaces, click pixels in this attached image with app_click target {kind:\"image_xy\", x, y, screenshot_id}. For known boards/grids/canvases, prefer {kind:\"image_grid\", x0, y0, width, height, rows, cols, row, col, intersections, screenshot_id}. 
If the grid rectangle is unknown, use {kind:\"visual_grid\", rows, cols, row, col, intersections}; the host detects the grid from app pixels.", + }); + obj.insert("screenshot_meta".to_string(), meta); + } + } + v + } + + // Helper: build a `ToolResult` that *also* carries the focused-window + // screenshot as an Anthropic-style multimodal image attachment. When + // the host couldn't (or chose not to) capture, fall back to a regular + // text-only `ToolResult::ok`. + fn snap_result( + data: serde_json::Value, + summary: Option, + snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = snap.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + // Build a JSON view of an InteractiveView that excludes the heavy + // `screenshot.bytes` payload (the JPEG is attached out-of-band as a + // multimodal image attachment, not as base64 inside the tree). 
+ fn build_interactive_view_json( + view: &crate::agentic::tools::computer_use_host::InteractiveView, + ) -> serde_json::Value { + let mut v = json!({ + "app": view.app, + "window_title": view.window_title, + "digest": view.digest, + "captured_at_ms": view.captured_at_ms, + "elements": view.elements, + "tree_text": view.tree_text, + "loop_warning": view.loop_warning, + "has_screenshot": view.screenshot.is_some(), + }); + if let Some(shot) = view.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + obj.insert( + "screenshot_meta".to_string(), + json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "Numbered overlays are in JPEG image-pixel space. Reference elements via their `i` index using interactive_click / interactive_type_text / interactive_scroll. 
For pointer-only fallback, pass screenshot_id with image_xy/image_grid.", + }), + ); + } + } + v + } + + fn build_visual_mark_view_json( + view: &crate::agentic::tools::computer_use_host::VisualMarkView, + ) -> serde_json::Value { + let mut v = json!({ + "app": view.app, + "window_title": view.window_title, + "digest": view.digest, + "captured_at_ms": view.captured_at_ms, + "marks": view.marks, + "has_screenshot": view.screenshot.is_some(), + }); + if let Some(shot) = view.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + obj.insert( + "screenshot_meta".to_string(), + json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "Numbered visual marks are in JPEG image-pixel space. Reference marks via their `i` index using visual_click. To refine a dense area, call build_visual_mark_view again with opts.region in these screenshot pixels.", + }), + ); + } + } + v + } + + // Build a JSON envelope for interactive_* action results. Includes + // the post-action AppStateSnapshot (without screenshot bytes) and, + // when present, the rebuilt InteractiveView. 
+ fn build_interactive_action_json( + app: &crate::agentic::tools::computer_use_host::AppSelector, + res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, + extras: serde_json::Value, + ) -> serde_json::Value { + let mut v = json!({ + "target_app": app, + "app_state": snap_state_json(&res.snapshot), + "app_state_nodes": res.snapshot.nodes, + "loop_warning": res.snapshot.loop_warning, + "execution_note": res.execution_note, + "interactive_view": res.view.as_ref().map(build_interactive_view_json), + }); + if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { + for (k, val) in extras_obj { + obj.insert(k.clone(), val.clone()); + } + } + v + } + + fn build_visual_action_json( + app: &crate::agentic::tools::computer_use_host::AppSelector, + res: &crate::agentic::tools::computer_use_host::VisualActionResult, + extras: serde_json::Value, + ) -> serde_json::Value { + let mut v = json!({ + "target_app": app, + "app_state": snap_state_json(&res.snapshot), + "app_state_nodes": res.snapshot.nodes, + "loop_warning": res.snapshot.loop_warning, + "execution_note": res.execution_note, + "visual_mark_view": res.view.as_ref().map(build_visual_mark_view_json), + }); + if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { + for (k, val) in extras_obj { + obj.insert(k.clone(), val.clone()); + } + } + v + } + + // Attach the InteractiveView's annotated screenshot (if present) + // as a multimodal image; otherwise fall back to text-only ok. 
+ fn interactive_view_result( + data: serde_json::Value, + summary: Option, + view: &crate::agentic::tools::computer_use_host::InteractiveView, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = view.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + fn visual_mark_view_result( + data: serde_json::Value, + summary: Option, + view: &crate::agentic::tools::computer_use_host::VisualMarkView, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = view.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + // Prefer attaching the rebuilt interactive view's screenshot when + // available; otherwise fall back to the post-action snapshot's. 
+ fn interactive_action_result( + data: serde_json::Value, + summary: Option, + res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, + ) -> ToolResult { + use base64::Engine as _; + let shot_opt = res + .view + .as_ref() + .and_then(|v| v.screenshot.as_ref()) + .or(res.snapshot.screenshot.as_ref()); + if let Some(shot) = shot_opt { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + fn visual_action_result( + data: serde_json::Value, + summary: Option, + res: &crate::agentic::tools::computer_use_host::VisualActionResult, + ) -> ToolResult { + use base64::Engine as _; + let shot_opt = res + .view + .as_ref() + .and_then(|v| v.screenshot.as_ref()) + .or(res.snapshot.screenshot.as_ref()); + if let Some(shot) = shot_opt { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + let bg = host.supports_background_input(); + let ax = host.supports_ax_tree(); + + match action { + "list_apps" => { + let include_hidden = params + .get("include_hidden") + .and_then(|v| v.as_bool()) + .unwrap_or_else(|| { + !params + .get("only_visible") + .and_then(|v| v.as_bool()) + .unwrap_or(true) + }); + let apps = host.list_apps(include_hidden).await?; + let n = apps.len(); + Ok(vec![ToolResult::ok( + json!({ + "apps": apps, + "include_hidden": include_hidden, + "background_input": bg, + "ax_tree": ax, + }), + Some(format!("{} app(s) listed", n)), + )]) + } + "get_app_state" => { + let app = parse_selector(params)?; + let max_depth = params + .get("max_depth") + .and_then(|v| v.as_u64()) + .unwrap_or(32) 
as u32; + let focus_window_only = params + .get("focus_window_only") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + let snap = host + .get_app_state(app.clone(), max_depth, focus_window_only) + .await?; + let summary = format!( + "AX state for {} (digest={}, {} nodes)", + snap.app.name, + &snap.digest[..snap.digest.len().min(12)], + snap.nodes.len() + ); + let data = json!({ + "target_app": app, + "background_input": bg, + "ax_tree": ax, + "app_state": snap_state_json(&snap), + "app_state_nodes": snap.nodes, + "before_digest": snap.digest, + "loop_warning": snap.loop_warning, + }); + Ok(vec![snap_result(data, Some(summary), &snap)]) + } + "app_click" => { + let app = parse_selector(params)?; + let target_v = params.get("target").cloned().ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_click requires 'target' ({node_idx|image_xy|screen_xy|ocr_text})" + .to_string(), + ) + })?; + let target = parse_click_target(&target_v)?; + let click_count = params + .get("click_count") + .and_then(|v| v.as_u64()) + .unwrap_or(1) as u8; + let mouse_button = params + .get("mouse_button") + .and_then(|v| v.as_str()) + .unwrap_or("left") + .to_string(); + let modifier_keys: Vec = params + .get("modifier_keys") + .and_then(|v| v.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(|s| s.to_string())) + .collect() + }) + .unwrap_or_default(); + let wait_ms_after = params + .get("wait_ms_after") + .or_else(|| params.get("post_click_wait_ms")) + .and_then(|v| v.as_u64()) + .map(|v| v.min(5_000) as u32); + + let before = host + .get_app_state(app.clone(), 8, false) + .await + .ok() + .map(|s| s.digest); + + let mut after = host + .app_click(AppClickParams { + app: app.clone(), + target: target.clone(), + click_count, + mouse_button, + modifier_keys, + wait_ms_after, + }) + .await?; + + if after.loop_warning.is_none() { + let target_sig = serde_json::to_string(&target).unwrap_or_default(); + after.loop_warning = loop_tracker_observe( + app.pid, + 
"app_click", + &target_sig, + before.as_deref().unwrap_or(""), + &after.digest, + ); + } + + let data = json!({ + "target_app": app, + "click_target": target, + "background_input": bg, + "before_digest": before, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result(data, Some("clicked".to_string()), &after)]) + } + "app_type_text" => { + let app = parse_selector(params)?; + let text = params + .get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_type_text requires 'text'".to_string(), + ) + })? + .to_string(); + let focus: Option = match params.get("focus") { + Some(v) if !v.is_null() => Some(parse_click_target(v)?), + _ => None, + }; + let before = host + .get_app_state(app.clone(), 8, false) + .await + .ok() + .map(|s| s.digest); + let mut after = host + .app_type_text(app.clone(), &text, focus.clone()) + .await?; + if after.loop_warning.is_none() { + let target_sig = format!( + "focus={};len={}", + serde_json::to_string(&focus).unwrap_or_default(), + text.chars().count() + ); + after.loop_warning = loop_tracker_observe( + app.pid, + "app_type_text", + &target_sig, + before.as_deref().unwrap_or(""), + &after.digest, + ); + } + let data = json!({ + "target_app": app, + "background_input": bg, + "char_count": text.chars().count(), + "focus": focus, + "before_digest": before, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some(format!("typed {} chars", text.chars().count())), + &after, + )]) + } + "app_scroll" => { + let app = parse_selector(params)?; + let dx = params.get("dx").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let dy = params.get("dy").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let focus: Option = match params.get("focus") { + Some(v) if !v.is_null() => Some(parse_click_target(v)?), + _ => None, + }; + let 
after = host.app_scroll(app.clone(), focus.clone(), dx, dy).await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "dx": dx, + "dy": dy, + "focus": focus, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some(format!("scrolled ({},{})", dx, dy)), + &after, + )]) + } + "app_key_chord" => { + let app = parse_selector(params)?; + let keys = parse_keys(params); + if keys.is_empty() { + return Err(BitFunError::tool( + "[INVALID_PARAMS] app_key_chord requires non-empty 'keys'".to_string(), + )); + } + let focus_idx: Option = params + .get("focus_idx") + .and_then(|v| v.as_u64()) + .map(|n| n as u32); + let after = host + .app_key_chord(app.clone(), keys.clone(), focus_idx) + .await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "keys": keys, + "focus_idx": focus_idx, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some("key chord sent".to_string()), + &after, + )]) + } + "app_wait_for" => { + let app = parse_selector(params)?; + let predicate_v = params.get("predicate").cloned().ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_wait_for requires 'predicate'".to_string(), + ) + })?; + let predicate = parse_wait_predicate(&predicate_v)?; + let timeout_ms = params + .get("timeout_ms") + .and_then(|v| v.as_u64()) + .unwrap_or(8000) as u32; + let poll_ms = params + .get("poll_ms") + .and_then(|v| v.as_u64()) + .unwrap_or(150) as u32; + let after = host + .app_wait_for(app.clone(), predicate.clone(), timeout_ms, poll_ms) + .await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "predicate": predicate, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some("predicate satisfied".to_string()), + 
&after, + )]) + } + "build_interactive_view" => { + let app = parse_selector(params)?; + let opts: InteractiveViewOpts = match params.get("opts") { + Some(v) if !v.is_null() => serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] build_interactive_view 'opts' invalid: {}", + e + )) + })?, + _ => InteractiveViewOpts::default(), + }; + let view = host.build_interactive_view(app.clone(), opts).await?; + let view_json = build_interactive_view_json(&view); + let summary = format!( + "interactive view for {} ({} elements, digest={})", + view.app.name, + view.elements.len(), + &view.digest[..view.digest.len().min(12)] + ); + Ok(vec![interactive_view_result( + view_json, + Some(summary), + &view, + )]) + } + "interactive_click" => { + let app = parse_selector(params)?; + let p: InteractiveClickParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_click params invalid: {}", + e + )) + })?; + let i = p.i; + let res = host.interactive_click(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ "i": i, "action": "interactive_click" }), + ); + let summary = format!("interactive_click i={}", i); + Ok(vec![interactive_action_result(data, Some(summary), &res)]) + } + "build_visual_mark_view" => { + let app = parse_selector(params)?; + let opts: VisualMarkViewOpts = match params.get("opts") { + Some(v) if !v.is_null() => serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] build_visual_mark_view 'opts' invalid: {}", + e + )) + })?, + _ => VisualMarkViewOpts::default(), + }; + let view = host.build_visual_mark_view(app.clone(), opts).await?; + let view_json = build_visual_mark_view_json(&view); + let summary = format!( + "visual mark view for {} ({} marks, digest={})", + view.app.name, + view.marks.len(), + &view.digest[..view.digest.len().min(12)] + ); + Ok(vec![visual_mark_view_result( 
+ view_json, + Some(summary), + &view, + )]) + } + "visual_click" => { + let app = parse_selector(params)?; + let p: VisualClickParams = serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] visual_click params invalid: {}", + e + )) + })?; + let i = p.i; + let res = host.visual_click(app.clone(), p).await?; + let data = build_visual_action_json( + &app, + &res, + json!({ "i": i, "action": "visual_click" }), + ); + let summary = format!("visual_click i={}", i); + Ok(vec![visual_action_result(data, Some(summary), &res)]) + } + "interactive_type_text" => { + let app = parse_selector(params)?; + let p: InteractiveTypeTextParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_type_text params invalid: {}", + e + )) + })?; + let i = p.i; + let text_len = p.text.chars().count(); + let res = host.interactive_type_text(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ + "i": i, + "action": "interactive_type_text", + "text_chars": text_len, + }), + ); + let summary = match i { + Some(idx) => format!("interactive_type_text i={} ({} chars)", idx, text_len), + None => format!("interactive_type_text focused ({} chars)", text_len), + }; + Ok(vec![interactive_action_result(data, Some(summary), &res)]) + } + "interactive_scroll" => { + let app = parse_selector(params)?; + let p: InteractiveScrollParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_scroll params invalid: {}", + e + )) + })?; + let (i, dx, dy) = (p.i, p.dx, p.dy); + let res = host.interactive_scroll(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ + "i": i, + "dx": dx, + "dy": dy, + "action": "interactive_scroll", + }), + ); + let summary = format!("interactive_scroll i={:?} dx={} dy={}", i, dx, dy); + Ok(vec![interactive_action_result(data, 
Some(summary), &res)]) + } + other => Err(BitFunError::tool(format!( + "[INTERNAL] handle_desktop_ax called with unknown action: {}", + other + ))), + } + } + + // ── Browser domain ───────────────────────────────────────────────── + + /// try in order: `gtk-launch ` (uses `.desktop` files), then a + /// direct exec of the lower-cased name (handles `firefox`, `code`, etc.), + /// and finally fall back to `xdg-open` so callers passing a URL/path by + /// accident still work. The dispatcher in `handle_system` is aware of + /// this fallback chain. + fn platform_open_command(app_name: &str) -> (String, Vec) { + #[cfg(target_os = "macos")] + { + ( + "open".to_string(), + vec!["-a".to_string(), app_name.to_string()], + ) + } + #[cfg(target_os = "windows")] + { + ( + "cmd".to_string(), + vec![ + "/C".to_string(), + "start".to_string(), + "".to_string(), + app_name.to_string(), + ], + ) + } + #[cfg(target_os = "linux")] + { + // Probe in order of correctness; the first executable on PATH wins. + // `gtk-launch` is the canonical way to start a desktop application + // by its .desktop id; if not present we fall back to a direct exec. + if which_exists("gtk-launch") { + ("gtk-launch".to_string(), vec![app_name.to_string()]) + } else { + (app_name.to_string(), vec![]) + } + } + #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] + { + ("open".to_string(), vec![app_name.to_string()]) + } + } + + // ── System domain ────────────────────────────────────────────────── + + pub(crate) async fn handle_system( + &self, + action: &str, + params: &Value, + context: &ToolUseContext, + ) -> BitFunResult> { + match action { + "open_app" => { + let app_name = params + .get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| BitFunError::tool("open_app requires 'app_name'".to_string()))?; + + // Phase 4 (p4_open_app_unify): consolidate the two historical + // launch paths (ComputerUse host vs. 
raw shell `open`/`start`) + // into one flow: prefer the host (it knows about + // accessibility / focus-after-launch), fall back to the + // platform shell, and *always* return the same envelope so + // callers don't have to special-case the two paths. + let mut host_attempted = false; + let mut host_error: Option = None; + let method = "shell"; + + // Only macOS has a working ComputerUseHost.open_app pathway today + // (Accessibility-driven). On Windows / Linux the host either + // doesn't exist or returns a NotImplemented stub, so we save a + // round-trip by going straight to the platform shell. On macOS + // we still prefer the host because it knows about + // focus-after-launch and AX permission state. + let prefer_host = cfg!(target_os = "macos") && context.computer_use_host.is_some(); + if prefer_host { + host_attempted = true; + let cu_input = json!({ "action": "open_app", "app_name": app_name }); + match self.handle_desktop("open_app", &cu_input, context).await { + Ok(results) => { + // Re-wrap to the unified system-domain envelope so + // models see the same shape regardless of which + // backend serviced the call. + let host_payload = results + .first() + .map(|r| r.content()) + .unwrap_or(Value::Null); + return Ok(vec![ToolResult::ok( + json!({ + "launched": true, + "app": app_name, + "method": "computer_use_host", + "host_payload": host_payload, + }), + Some(format!("Opened {} via host", app_name)), + )]); + } + Err(e) => { + // Don't fail yet — try the shell fallback. Many + // hosts return error for sandboxed apps that + // launch fine via `open -a`. + host_error = Some(e.to_string()); + } + } + } + + // Build the platform-specific launch attempt list. On Linux + // we try multiple strategies in order so the model doesn't + // need to know whether the user has gtk-launch installed. 
+ let attempts: Vec<(String, Vec)> = { + let primary = Self::platform_open_command(app_name); + #[cfg(target_os = "linux")] + { + let mut v = vec![primary]; + // Fallback 1: direct exec of the lowercase name (handles + // `firefox`, `code`, `gnome-terminal`, etc. when the + // exec name matches the app name). + let lower = app_name.to_lowercase(); + if v.iter().all(|(c, _)| c != &lower) { + v.push((lower, vec![])); + } + // Fallback 2: xdg-open — last-ditch, mostly for paths/URLs + // erroneously passed as app_name. + v.push(("xdg-open".to_string(), vec![app_name.to_string()])); + v + } + #[cfg(not(target_os = "linux"))] + { + vec![primary] + } + }; + + let mut last_err: Option = None; + let mut output_opt = None; + let mut chosen_cmd = String::new(); + let mut chosen_args: Vec = vec![]; + for (cmd, args) in &attempts { + match std::process::Command::new(cmd).args(args).output() { + Ok(out) => { + if out.status.success() { + chosen_cmd = cmd.clone(); + chosen_args = args.clone(); + output_opt = Some(out); + break; + } else { + last_err = Some(format!( + "{} exit={:?} stderr={}", + cmd, + out.status.code(), + String::from_utf8_lossy(&out.stderr).trim() + )); + } + } + Err(e) => { + last_err = Some(format!("spawn {}: {}", cmd, e)); + } + } + } + let _ = chosen_args; + let output = output_opt.ok_or_else(|| { + BitFunError::tool(format!( + "open_app failed for '{}' across {} strategies: {} (host_error: {:?})", + app_name, + attempts.len(), + last_err.as_deref().unwrap_or("(no error)"), + host_error + )) + })?; + + if output.status.success() { + let warning = host_error.map(|e| { + format!("computer_use_host open_app failed; shell fallback succeeded: {}", e) + }); + Ok(vec![ToolResult::ok( + json!({ + "launched": true, + "app": app_name, + "method": method, + "via_command": chosen_cmd, + "host_attempted": host_attempted, + "warning": warning, + }), + Some(format!("Opened {} via {}", app_name, chosen_cmd)), + )]) + } else { + let stderr = 
String::from_utf8_lossy(&output.stderr).trim().to_string(); + Err(BitFunError::tool(format!( + "open_app failed for '{}'. host_attempted={}, host_error={:?}, last_command='{}', stderr='{}'", + app_name, host_attempted, host_error, chosen_cmd, stderr + ))) + } + } + "run_script" => { + let script = params + .get("script") + .and_then(|v| v.as_str()) + .ok_or_else(|| BitFunError::tool("run_script requires 'script'".to_string()))?; + let script_type = params + .get("script_type") + .and_then(|v| v.as_str()) + .unwrap_or("applescript"); + // Phase 4: bound the runtime so a hung script can never wedge + // the agent. Default 30 s, capped at 5 min to keep it sane. + let timeout_ms = params + .get("timeout_ms") + .and_then(|v| v.as_u64()) + .unwrap_or(30_000) + .clamp(100, 5 * 60 * 1000); + // Phase 4: keep output payloads bounded — model context is + // expensive and most scripts are happy with the head + tail. + let max_output_bytes = params + .get("max_output_bytes") + .and_then(|v| v.as_u64()) + .unwrap_or(16 * 1024) + .clamp(1024, 256 * 1024) as usize; + + let (program, args) = match script_type { + "applescript" => { + #[cfg(target_os = "macos")] + { + ( + "/usr/bin/osascript".to_string(), + vec!["-e".to_string(), script.to_string()], + ) + } + #[cfg(not(target_os = "macos"))] + { + let _ = script; + return Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::NotAvailable, + "AppleScript is only available on macOS", + ) + .with_hint("Use script_type='shell' (sh on Unix, PowerShell on Windows) or script_type='powershell'/'bash'"), + )); + } + } + // The "shell" alias picks the OS's *default* shell so the + // model can stay platform-agnostic. On Windows we now + // route to PowerShell rather than cmd.exe to avoid the + // GBK/CP936 stdout encoding nightmare and to give the + // model a consistent surface area. 
+ "shell" => { + #[cfg(target_os = "windows")] + { + powershell_invocation(script) + } + #[cfg(not(target_os = "windows"))] + { + ( + "sh".to_string(), + vec!["-c".to_string(), script.to_string()], + ) + } + } + "bash" => { + // Bash is universally requested but not always on + // PATH (Windows without WSL/git-bash). Detect and + // surface a structured NotAvailable instead of a + // confusing spawn-failure error. + if !which_exists("bash") { + return Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::NotAvailable, + "bash is not on PATH", + ) + .with_hint("Install Git for Windows / WSL, or use script_type='shell' / 'powershell' / 'cmd'"), + )); + } + ( + "bash".to_string(), + vec!["-c".to_string(), script.to_string()], + ) + } + "powershell" => { + // Prefer pwsh (PowerShell 7+, cross-platform) when + // available; fall back to legacy Windows powershell. + let prog = if which_exists("pwsh") { + "pwsh" + } else if which_exists("powershell") { + "powershell" + } else { + return Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::NotAvailable, + "Neither pwsh nor powershell are on PATH", + ) + .with_hint("Install PowerShell, or use script_type='shell' / 'bash'"), + )); + }; + ( + prog.to_string(), + vec![ + "-NoProfile".to_string(), + "-NonInteractive".to_string(), + // -OutputEncoding utf8 is set inside the script + // wrapper below for consistent stdout handling. + "-Command".to_string(), + format!( + "[Console]::OutputEncoding=[Text.Encoding]::UTF8; {}", + script + ), + ], + ) + } + "cmd" => { + #[cfg(target_os = "windows")] + { + // Force code-page 65001 (UTF-8) before running the + // user's script so stdout matches what we decode. 
+ ( + "cmd".to_string(), + vec![ + "/U".to_string(), + "/C".to_string(), + format!("chcp 65001>nul && {}", script), + ], + ) + } + #[cfg(not(target_os = "windows"))] + { + return Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::NotAvailable, + "script_type='cmd' is only available on Windows", + ) + .with_hint("Use script_type='shell' / 'bash' / 'powershell'"), + )); + } + } + other => { + return Err(BitFunError::tool(format!( + "Unknown script_type: '{}'. Valid: applescript (macOS), shell (OS default), bash, powershell, cmd (Windows)", + other + ))) + } + }; + + // Use tokio::process so that on timeout we can actually KILL + // the child process. The previous implementation wrapped + // `std::process::Command::output()` in `spawn_blocking` + + // `tokio::time::timeout`; on timeout the `timeout` future + // returned, but the spawn_blocking thread kept blocking on + // the still-running child, leaking a thread + process per + // hung script. + let started = std::time::Instant::now(); + let child = tokio::process::Command::new(&program) + .args(&args) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .kill_on_drop(true) + .spawn() + .map_err(|e| { + BitFunError::tool(format!( + "Failed to spawn run_script ({}): {}", + script_type, e + )) + })?; + + let wait = child.wait_with_output(); + let output = match tokio::time::timeout( + std::time::Duration::from_millis(timeout_ms), + wait, + ) + .await + { + Err(_) => { + // Best-effort kill. `kill_on_drop(true)` above also + // ensures the OS reaps the process when `child` + // drops, but we issue an explicit SIGKILL first so + // it terminates immediately rather than after the + // tokio task tear-down race. + // NOTE: `wait_with_output` consumed `child`, so we + // can no longer call `child.kill()` directly here; + // the `kill_on_drop` flag handles it for us. 
+ return Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::Timeout, + format!( + "run_script timed out after {} ms (script_type={}); child process killed", + timeout_ms, script_type + ), + ) + .with_hint( + "Increase 'timeout_ms', or split the script into shorter steps", + ), + )); + } + Ok(Err(e)) => { + return Err(BitFunError::tool(format!( + "Failed to wait for run_script ({}): {}", + script_type, e + ))); + } + Ok(Ok(o)) => o, + }; + + let elapsed_ms = elapsed_ms_u64(started); + let stdout_full = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr_full = String::from_utf8_lossy(&output.stderr).to_string(); + let (stdout, stdout_truncated) = truncate_with_marker(&stdout_full, max_output_bytes); + let (stderr, stderr_truncated) = truncate_with_marker(&stderr_full, max_output_bytes); + + if output.status.success() { + Ok(vec![ToolResult::ok( + json!({ + "success": true, + "output": stdout, + "stderr": stderr, + "stdout_truncated": stdout_truncated, + "stderr_truncated": stderr_truncated, + "exit_code": output.status.code(), + "elapsed_ms": elapsed_ms, + "script_type": script_type, + }), + Some(if stdout.is_empty() { + format!("Script executed in {} ms", elapsed_ms) + } else { + stdout.lines().take(1).collect::() + }), + )]) + } else { + Ok(err_response( + "system", + "run_script", + ControlHubError::new( + ErrorCode::Internal, + format!( + "Script exited with {:?}: {}", + output.status.code(), + stderr.lines().next().unwrap_or("(no stderr)") + ), + ) + .with_hints([ + format!("stderr={}", stderr), + format!("elapsed_ms={}", elapsed_ms), + ]), + )) + } + } + "get_os_info" => { + let os = std::env::consts::OS; + let arch = std::env::consts::ARCH; + // Phase 4: include OS version + hostname when available so + // the model can adapt platform-specific paths / commands. 
+ let mut info = json!({ + "os": os, + "arch": arch, + "rust_target_family": std::env::consts::FAMILY, + }); + if let Some(v) = read_os_version() { + info["os_version"] = json!(v); + } + if let Ok(host) = hostname() { + info["hostname"] = json!(host); + } + // Linux-only: surface display server (X11 / Wayland) and the + // current desktop environment so the model can pick the right + // clipboard helper / window manipulation strategy without a + // separate `run_script` round-trip. + #[cfg(target_os = "linux")] + { + let (display_server, desktop_env) = linux_session_info(); + if let Some(s) = display_server { + info["display_server"] = json!(s); + } + if let Some(d) = desktop_env { + info["desktop_environment"] = json!(d); + } + } + // The set of `script_type` values the host can actually run. + // Discoverability win: model no longer has to spawn a doomed + // run_script call to learn that bash is missing on Windows. + let mut script_types = vec!["shell"]; + if cfg!(target_os = "macos") { + script_types.push("applescript"); + } + if which_exists("bash") { + script_types.push("bash"); + } + if which_exists("pwsh") || which_exists("powershell") { + script_types.push("powershell"); + } + if cfg!(target_os = "windows") { + script_types.push("cmd"); + } + info["script_types"] = json!(script_types); + Ok(vec![ToolResult::ok( + info.clone(), + Some(format!( + "{} {} ({})", + os, + info.get("os_version").and_then(|v| v.as_str()).unwrap_or(""), + arch + )), + )]) + } + // Cross-context primitive: read the system clipboard. Used by + // models to pick up "what the user just copied" (verification + // codes, selected text, generated SQL, etc.) without driving + // the GUI. Returns text only — binary clipboard payloads are + // out of scope. 
+ "clipboard_get" => { + let max_bytes = params + .get("max_bytes") + .and_then(|v| v.as_u64()) + .map(|n| n as usize) + .unwrap_or(64 * 1024) + .clamp(64, 1024 * 1024); + + match clipboard_read().await { + Ok(text) => { + let (truncated, was_truncated) = truncate_with_marker(&text, max_bytes); + let len = text.len(); + Ok(vec![ToolResult::ok( + json!({ + "text": truncated, + "byte_length": len, + "truncated": was_truncated, + }), + Some(format!("{} bytes on clipboard", len)), + )]) + } + Err(e) => Ok(err_response( + "system", + "clipboard_get", + ControlHubError::new( + ErrorCode::NotAvailable, + format!("Clipboard read failed: {}", e), + ) + .with_hints(linux_clipboard_install_hints()), + )), + } + } + + // Cross-context primitive: place text on the system clipboard. + // The user can then paste it into ANY app with cmd+v / ctrl+v — + // dramatically simpler than driving each target GUI by hand. + "clipboard_set" => { + let text = params.get("text").and_then(|v| v.as_str()).ok_or_else(|| { + BitFunError::tool("clipboard_set requires 'text'".to_string()) + })?; + match clipboard_write(text).await { + Ok(()) => Ok(vec![ToolResult::ok( + json!({ + "success": true, + "byte_length": text.len(), + }), + Some(format!("Wrote {} bytes to clipboard", text.len())), + )]), + Err(e) => Ok(err_response( + "system", + "clipboard_set", + ControlHubError::new( + ErrorCode::NotAvailable, + format!("Clipboard write failed: {}", e), + ) + .with_hints(linux_clipboard_install_hints()), + )), + } + } + + // Cross-context primitive: open a URL in the user's default + // browser WITHOUT going through CDP. Use this when the goal is + // "show this URL to the user" rather than "drive this page". + // Avoids the CDP launch round-trip and works even when the + // browser was started without --remote-debugging-port. 
+ "open_url" => { + let url = params + .get("url") + .and_then(|v| v.as_str()) + .ok_or_else(|| BitFunError::tool("open_url requires 'url'".to_string()))?; + if !(url.starts_with("http://") + || url.starts_with("https://") + || url.starts_with("file://") + || url.starts_with("mailto:")) + { + return Ok(err_response( + "system", + "open_url", + ControlHubError::new( + ErrorCode::InvalidParams, + format!("Refusing to open URL with unsupported scheme: {}", url), + ) + .with_hint( + "Pass an http(s)://, file://, or mailto: URL. Use 'open_file' for local paths without a scheme.", + ), + )); + } + // NOTE: do NOT reuse platform_open_command — that helper + // is for *apps* (uses `open -a` on macOS) and would treat + // the URL as an application name, failing immediately. + // + // Windows: must NOT route through `cmd /C start "" `. + // `cmd` interprets `&`, `^`, `%`, `|` in the URL — so a query + // string like `?a=1&b=2` gets the second arg dropped, and + // long URLs may be silently truncated. Use rundll32 with the + // URL protocol handler so the URL is passed verbatim and + // routed through the same default-handler resolution Windows + // uses for "Open in Browser" shell verbs. 
+ let (program, args) = match std::env::consts::OS { + "macos" => ("open".to_string(), vec![url.to_string()]), + "windows" => ( + "rundll32".to_string(), + vec![ + "url.dll,FileProtocolHandler".to_string(), + url.to_string(), + ], + ), + _ => ("xdg-open".to_string(), vec![url.to_string()]), + }; + let status = std::process::Command::new(&program) + .args(&args) + .status() + .map_err(|e| { + BitFunError::tool(format!("Failed to spawn '{}': {}", program, e)) + })?; + if status.success() { + Ok(vec![ToolResult::ok( + json!({ "opened": true, "url": url, "method": program }), + Some(format!("Opened {} in default handler", url)), + )]) + } else { + Ok(err_response( + "system", + "open_url", + ControlHubError::new( + ErrorCode::Internal, + format!("'{}' exited with {:?}", program, status.code()), + ), + )) + } + } + + // Cross-context primitive: open a local file with its default + // handler (or an explicitly named app on macOS). High-frequency + // for "open this PDF / picture / spreadsheet for me". + "open_file" => { + let path_str = params.get("path").and_then(|v| v.as_str()).ok_or_else(|| { + BitFunError::tool("open_file requires 'path'".to_string()) + })?; + let app_name = params.get("app").and_then(|v| v.as_str()); + + let path = std::path::Path::new(path_str); + if !path.exists() { + return Ok(err_response( + "system", + "open_file", + ControlHubError::new( + ErrorCode::NotFound, + format!("File does not exist: {}", path_str), + ) + .with_hint("Check the absolute path; ~ is not expanded"), + )); + } + + let (program, args) = match (std::env::consts::OS, app_name) { + ("macos", Some(app)) => ( + "open".to_string(), + vec!["-a".to_string(), app.to_string(), path_str.to_string()], + ), + ("macos", None) => ("open".to_string(), vec![path_str.to_string()]), + // Windows file open: same rundll32 dance as open_url so + // paths with `&` / `%` survive intact when cmd would have + // mangled them. ShellExec_RunDLL also accepts file paths. 
+ ("windows", _) => ( + "rundll32".to_string(), + vec![ + "url.dll,FileProtocolHandler".to_string(), + path_str.to_string(), + ], + ), + _ => ("xdg-open".to_string(), vec![path_str.to_string()]), + }; + let status = std::process::Command::new(&program) + .args(&args) + .status() + .map_err(|e| { + BitFunError::tool(format!("Failed to spawn '{}': {}", program, e)) + })?; + if status.success() { + Ok(vec![ToolResult::ok( + json!({ + "opened": true, + "path": path_str, + "with_app": app_name, + "method": program, + }), + Some(match app_name { + Some(a) => format!("Opened {} with {}", path_str, a), + None => format!("Opened {} with default handler", path_str), + }), + )]) + } else { + Ok(err_response( + "system", + "open_file", + ControlHubError::new( + ErrorCode::Internal, + format!("'{}' exited with {:?}", program, status.code()), + ), + )) + } + } + + other => Err(BitFunError::tool(format!( + "Unknown system action: '{}'. Valid: open_app, run_script, get_os_info, open_url, open_file, clipboard_get, clipboard_set", + other + ))), + } + } +} +/// Truncate `s` to at most `max_bytes`, appending an explicit marker so the +/// model can see that data was dropped (and how much). Returns +/// `(truncated_string, was_truncated)`. +pub(crate) fn truncate_with_marker(s: &str, max_bytes: usize) -> (String, bool) { + if s.len() <= max_bytes { + return (s.to_string(), false); + } + let head_n = max_bytes.saturating_sub(64); + let head = safe_str_slice(s, head_n); + let omitted = s.len().saturating_sub(head_n); + ( + format!("{}\n... [{} bytes omitted] ...\n", head, omitted), + true, + ) +} +/// Slice `s` to ≤ `n` bytes without splitting a UTF-8 codepoint. +fn safe_str_slice(s: &str, n: usize) -> &str { + if n >= s.len() { + return s; + } + let mut cut = n; + while cut > 0 && !s.is_char_boundary(cut) { + cut -= 1; + } + &s[..cut] +} + +/// Read a short OS version string. Best-effort: returns `None` on platforms +/// where we can't determine it cheaply. 
/// Returns a short, human-readable OS version string, or `None` when it
/// cannot be determined.
///
/// * macOS: `"macOS "` followed by the output of `sw_vers -productVersion`.
/// * Windows: the trimmed output of `cmd /C ver`.
/// * Linux: the `PRETTY_NAME` value from `/etc/os-release`, falling back to
///   `/usr/lib/os-release` when the former cannot be read. Surrounding single
///   or double quotes are removed.
/// * Other platforms: always `None`.
///
/// An empty value is never returned; it is reported as `None` instead.
fn read_os_version() -> Option<String> {
    #[cfg(target_os = "macos")]
    {
        let out = std::process::Command::new("sw_vers")
            .arg("-productVersion")
            .output()
            .ok()?;
        let s = String::from_utf8_lossy(&out.stdout).trim().to_string();
        if s.is_empty() {
            None
        } else {
            Some(format!("macOS {}", s))
        }
    }
    #[cfg(target_os = "windows")]
    {
        let out = std::process::Command::new("cmd")
            .args(["/C", "ver"])
            .output()
            .ok()?;
        let s = String::from_utf8_lossy(&out.stdout).trim().to_string();
        if s.is_empty() {
            None
        } else {
            Some(s)
        }
    }
    #[cfg(target_os = "linux")]
    {
        // os-release(5): /etc/os-release takes precedence, with
        // /usr/lib/os-release as the fallback location.
        let txt = std::fs::read_to_string("/etc/os-release")
            .or_else(|_| std::fs::read_to_string("/usr/lib/os-release"))
            .ok()?;
        txt.lines()
            .find_map(|line| line.trim().strip_prefix("PRETTY_NAME="))
            // Values may be wrapped in either quote style.
            .map(|raw| raw.trim_matches(|c| c == '"' || c == '\'').to_string())
            .filter(|name| !name.is_empty())
    }
    #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))]
    {
        None
    }
}

/// Returns the machine's host name.
///
/// Sources are tried in order, and the first non-empty value wins:
/// 1. Windows: the `COMPUTERNAME` environment variable.
///    Linux / macOS: the `HOSTNAME` environment variable, then the contents
///    of `/etc/hostname`.
/// 2. The output of the `hostname` command.
///
/// # Errors
/// Returns an `io::Error` when the `hostname` command cannot be run, exits
/// unsuccessfully, or prints nothing. An empty name is never returned as
/// `Ok`.
fn hostname() -> std::io::Result<String> {
    // Environment variables first: no subprocess, and no decoding of
    // command output in whatever code page the console happens to use.
    #[cfg(target_os = "windows")]
    {
        if let Ok(name) = std::env::var("COMPUTERNAME") {
            let name = name.trim();
            if !name.is_empty() {
                return Ok(name.to_string());
            }
        }
    }
    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        if let Ok(name) = std::env::var("HOSTNAME") {
            let name = name.trim();
            if !name.is_empty() {
                return Ok(name.to_string());
            }
        }
        if let Ok(bytes) = std::fs::read("/etc/hostname") {
            let s = String::from_utf8_lossy(&bytes).trim().to_string();
            if !s.is_empty() {
                return Ok(s);
            }
        }
    }
    let out = std::process::Command::new("hostname").output()?;
    let name = String::from_utf8_lossy(&out.stdout).trim().to_string();
    if !out.status.success() || name.is_empty() {
        // Surface the failure instead of handing callers an empty name.
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!(
                "`hostname` produced no usable output (exit={:?})",
                out.status.code()
            ),
        ));
    }
    Ok(name)
}
/// Reports whether an executable called `name` can be found in one of the
/// directories listed in the `PATH` environment variable.
///
/// On Windows every suffix from `PATHEXT` (default `.EXE;.BAT;.CMD;.COM`) is
/// appended to `name` in turn; when `name` already carries an extension
/// (e.g. `bash.exe`) it is also tried as given. On other platforms `name` is
/// looked up verbatim.
///
/// Only regular files count: a directory that happens to share the name is
/// not a match. Returns `false` when `name` is empty or `PATH` is unset.
/// Execute permission is not checked.
pub(crate) fn which_exists(name: &str) -> bool {
    if name.is_empty() {
        // `dir.join("")` is the directory itself, never a program.
        return false;
    }
    let paths = match std::env::var_os("PATH") {
        Some(p) => p,
        None => return false,
    };
    let suffixes: Vec<String> = if cfg!(target_os = "windows") {
        let mut suffixes: Vec<String> = std::env::var("PATHEXT")
            .unwrap_or_else(|_| ".EXE;.BAT;.CMD;.COM".to_string())
            .split(';')
            .filter(|ext| !ext.is_empty())
            .map(str::to_string)
            .collect();
        if std::path::Path::new(name).extension().is_some() {
            // Caller already supplied the extension; also try it verbatim.
            suffixes.insert(0, String::new());
        }
        suffixes
    } else {
        vec![String::new()]
    };
    std::env::split_paths(&paths).any(|dir| {
        suffixes.iter().any(|suffix| {
            let mut file_name = std::ffi::OsString::from(name);
            file_name.push(suffix);
            dir.join(file_name).is_file()
        })
    })
}

/// Builds the `(program, args)` pair that runs `script` as a PowerShell
/// command on Windows with console output forced to UTF-8.
///
/// Uses `pwsh` when it is on `PATH` and `powershell` otherwise; unlike the
/// lookup for `pwsh`, the presence of `powershell` is not verified here.
#[cfg(target_os = "windows")]
fn powershell_invocation(script: &str) -> (String, Vec<String>) {
    let prog = if which_exists("pwsh") {
        "pwsh"
    } else {
        "powershell"
    };
    let command = format!(
        "[Console]::OutputEncoding=[Text.Encoding]::UTF8; {}",
        script
    );
    (
        prog.to_string(),
        vec![
            "-NoProfile".to_string(),
            "-NonInteractive".to_string(),
            "-Command".to_string(),
            command,
        ],
    )
}
/// Returns user-facing hints for getting a working clipboard helper.
///
/// On Linux the hint depends on the session type reported by
/// `linux_session_info`: `wayland` suggests wl-clipboard (plus xclip/xsel
/// for XWayland apps), `x11` and `tty` suggest xclip or xsel, and anything
/// else lists both options. On every other OS a single generic hint is
/// returned.
pub(crate) fn linux_clipboard_install_hints() -> Vec<String> {
    #[cfg(target_os = "linux")]
    {
        let (server, _) = linux_session_info();
        match server.as_deref() {
            Some("wayland") => vec![
                "Wayland session detected — install wl-clipboard (e.g. `sudo apt install wl-clipboard` / `sudo dnf install wl-clipboard`)".to_string(),
                "Fallback for XWayland apps: also install xclip or xsel".to_string(),
            ],
            Some("x11") | Some("tty") => vec![
                "X11 session detected — install xclip (`sudo apt install xclip`) or xsel (`sudo apt install xsel`)".to_string(),
            ],
            _ => vec![
                "Install wl-clipboard (Wayland) OR xclip/xsel (X11). Run `echo $XDG_SESSION_TYPE` to know which one applies.".to_string(),
            ],
        }
    }
    #[cfg(not(target_os = "linux"))]
    {
        vec!["Make sure the system clipboard helper is available on this host".to_string()]
    }
}

/// Reads the Linux desktop session metadata from the environment.
///
/// Returns `(display_server, desktop_env)`:
/// * `display_server` is `XDG_SESSION_TYPE`.
/// * `desktop_env` is `XDG_CURRENT_DESKTOP`, or `DESKTOP_SESSION` when the
///   former is unset or empty.
///
/// A variable that is unset, empty, or not valid Unicode is treated as
/// absent, so either element may be `None`.
#[cfg(target_os = "linux")]
pub(crate) fn linux_session_info() -> (Option<String>, Option<String>) {
    let non_empty_var = |key: &str| std::env::var(key).ok().filter(|v| !v.is_empty());
    let server = non_empty_var("XDG_SESSION_TYPE");
    // Filter *before* falling back, so an empty XDG_CURRENT_DESKTOP does not
    // hide a usable DESKTOP_SESSION.
    let de = non_empty_var("XDG_CURRENT_DESKTOP").or_else(|| non_empty_var("DESKTOP_SESSION"));
    (server, de)
}
+async fn clipboard_read() -> Result { + #[cfg(target_os = "macos")] + { + let out = tokio::process::Command::new("pbpaste") + .output() + .await + .map_err(|e| format!("spawn pbpaste: {}", e))?; + if !out.status.success() { + return Err(format!("pbpaste exit={:?}", out.status.code())); + } + Ok(String::from_utf8_lossy(&out.stdout).to_string()) + } + #[cfg(target_os = "windows")] + { + let out = tokio::process::Command::new("powershell") + .args(["-NoProfile", "-Command", "Get-Clipboard -Raw"]) + .output() + .await + .map_err(|e| format!("spawn powershell: {}", e))?; + if !out.status.success() { + return Err(format!("Get-Clipboard exit={:?}", out.status.code())); + } + // PowerShell appends CRLF; trim a single trailing newline so the + // returned text matches what the user actually copied. + let mut s = String::from_utf8_lossy(&out.stdout).to_string(); + if s.ends_with("\r\n") { + s.truncate(s.len() - 2); + } else if s.ends_with('\n') { + s.truncate(s.len() - 1); + } + Ok(s) + } + #[cfg(target_os = "linux")] + { + // Wayland first (modern session), then X11 fallbacks. + let candidates: &[(&str, &[&str])] = if std::env::var("WAYLAND_DISPLAY").is_ok() { + &[ + ("wl-paste", &["--no-newline"]), + ("xclip", &["-selection", "clipboard", "-o"]), + ("xsel", &["--clipboard", "--output"]), + ] + } else { + &[ + ("xclip", &["-selection", "clipboard", "-o"]), + ("xsel", &["--clipboard", "--output"]), + ("wl-paste", &["--no-newline"]), + ] + }; + for (bin, args) in candidates { + if let Ok(out) = tokio::process::Command::new(bin).args(*args).output().await { + if out.status.success() { + return Ok(String::from_utf8_lossy(&out.stdout).to_string()); + } + } + } + Err("no clipboard helper found (install wl-clipboard, xclip, or xsel)".to_string()) + } + #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] + { + Err("clipboard not implemented for this OS".to_string()) + } +} + +/// Cross-platform clipboard write. 
Streams `text` into the helper's stdin +/// rather than embedding it in argv so newlines / quotes / shell metachars +/// are preserved verbatim. +async fn clipboard_write(text: &str) -> Result<(), String> { + use tokio::io::AsyncWriteExt; + + async fn pipe(bin: &str, args: &[&str], text: &str) -> Result<(), String> { + let mut child = tokio::process::Command::new(bin) + .args(args) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .spawn() + .map_err(|e| format!("spawn {}: {}", bin, e))?; + if let Some(mut stdin) = child.stdin.take() { + stdin + .write_all(text.as_bytes()) + .await + .map_err(|e| format!("write {} stdin: {}", bin, e))?; + } + let out = child + .wait_with_output() + .await + .map_err(|e| format!("wait {}: {}", bin, e))?; + if !out.status.success() { + return Err(format!("{} exit={:?}", bin, out.status.code())); + } + Ok(()) + } + + #[cfg(target_os = "macos")] + { + pipe("pbcopy", &[], text).await + } + #[cfg(target_os = "windows")] + { + // PowerShell's Set-Clipboard reads from the pipeline; pipe text in + // via stdin to preserve binary fidelity. 
/// Returns `true` when `action` is one of the desktop actions in the
/// migrated set listed below (display, paste, app-level and
/// interactive/visual-view actions). The comparison is exact and
/// case-sensitive.
fn is_controlhub_migrated_desktop_action(action: &str) -> bool {
    const MIGRATED_ACTIONS: &[&str] = &[
        // Display selection and clipboard paste.
        "list_displays",
        "focus_display",
        "paste",
        // App-scoped actions.
        "list_apps",
        "get_app_state",
        "app_click",
        "app_type_text",
        "app_scroll",
        "app_key_chord",
        "app_wait_for",
        // Indexed interactive view.
        "build_interactive_view",
        "interactive_click",
        "interactive_type_text",
        "interactive_scroll",
        // Visual mark view.
        "build_visual_mark_view",
        "visual_click",
    ];
    MIGRATED_ACTIONS.iter().any(|known| *known == action)
}
fn input_schema_text_only() -> Value { json!({ @@ -122,7 +144,7 @@ The **primary model cannot consume images** in tool results — **do not** use * "properties": { "action": { "type": "string", - "enum": ["click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "enum": ["click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "list_displays", "focus_display", "paste", "list_apps", "get_app_state", "app_click", "app_type_text", "app_scroll", "app_key_chord", "app_wait_for", "build_interactive_view", "interactive_click", "interactive_type_text", "interactive_scroll", "build_visual_mark_view", "visual_click", "open_app", "open_url", "open_file", "clipboard_get", "clipboard_set", "run_script", "run_apple_script", "get_os_info"], "description": "The action to perform. **Primary model is text-only — no `screenshot`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) **`open_app`** to launch apps. **`run_apple_script`** for AppleScript (macOS). 3) Prefer `key_chord` for shortcuts/navigation. 4) Only when above fail: `click_target` / `move_to_target` (AX → OCR → screen coords in one call), then lower-level `click_element`, `move_to_text`, or `mouse_move` + `click`. Never guess coordinates." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -158,12 +180,37 @@ The **primary model cannot consume images** in tool results — **do not** use * "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole **or AXSubrole** (e.g. \"Button\", \"SearchField\")." 
}, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, "text_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring matched against ANY of AXTitle / AXValue / AXDescription / AXHelp. Prefer this when the visible text is shown via value/description (e.g. AXStaticText cards) instead of title." }, - "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`: jump straight to a node returned by the most recent `desktop.get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, + "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`, `app_click`: jump straight to a node returned by the most recent `get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, "app_state_digest": { "type": "string", "description": "For `locate`, `click_element`: optional `state_digest` from the same `get_app_state` call that produced `node_idx`. Stale digest yields AX_IDX_STALE so you re-snapshot." }, "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48). Ignored when `node_idx` is supplied." }, "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination. Priority: `node_idx` > `text_contains` > `title_contains`+`role_substring`." }, "app_name": { "type": "string", "description": "For `open_app`: the application name to launch." }, + "url": { "type": "string", "description": "For `open_url`: URL to open with the system/default browser." }, + "path": { "type": "string", "description": "For `open_file`: local file path to open with its default handler." 
}, + "app": { "type": ["string", "object"], "description": "For `open_file`: optional app name. For app-scoped actions: selector object such as `{ \"name\": \"Safari\" }`, `{ \"bundle_id\": \"...\" }`, or `{ \"pid\": 123 }`." }, "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute. macOS only." }, + "script_type": { "type": "string", "enum": ["applescript", "shell", "bash", "powershell", "cmd"], "description": "For `run_script`: script interpreter/type." }, + "timeout_ms": { "type": "integer", "description": "For `run_script`: timeout in milliseconds." }, + "max_output_bytes": { "type": "integer", "description": "For `run_script` / `clipboard_get`: maximum bytes to return." }, + "clear_first": { "type": "boolean", "description": "For `paste`: select all before pasting." }, + "submit": { "type": "boolean", "description": "For `paste`: press submit keys after pasting." }, + "submit_keys": { "type": "array", "items": { "type": "string" }, "description": "For `paste`: key chord to submit, default `[\"return\"]`." }, + "display_id": { "type": ["integer", "null"], "description": "For `focus_display` or display-pinned desktop actions: display id, or null to clear the pin." }, + "include_hidden": { "type": "boolean", "description": "For `list_apps`: include hidden/background apps." }, + "only_visible": { "type": "boolean", "description": "For `list_apps`: list only visible apps when true." }, + "target": { "type": "object", "description": "For `app_click`: click target such as `{ \"node_idx\": 3 }`, image/screen coordinates, or OCR text." }, + "focus": { "type": ["object", "null"], "description": "For app-scoped text/scroll actions: optional focus target." }, + "predicate": { "type": "object", "description": "For `app_wait_for`: wait predicate." }, + "opts": { "type": "object", "description": "For `build_interactive_view` / `build_visual_mark_view`: optional view options." 
}, + "i": { "type": ["integer", "null"], "description": "For interactive/visual actions: element or mark index from the latest view." }, + "dx": { "type": "integer", "description": "For app/interactive scroll actions: horizontal delta." }, + "dy": { "type": "integer", "description": "For app/interactive scroll actions: vertical delta." }, + "mouse_button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For app/interactive/visual click actions." }, + "click_count": { "type": "integer", "minimum": 1, "maximum": 3, "description": "For app click actions." }, + "modifier_keys": { "type": "array", "items": { "type": "string" }, "description": "For app click actions: modifier keys to hold." }, + "wait_ms_after": { "type": "integer", "description": "For app click actions: post-click wait in milliseconds." }, + "focus_idx": { "type": "integer", "minimum": 0, "description": "For `app_key_chord`: optional node index to focus first." }, + "poll_ms": { "type": "integer", "description": "For `app_wait_for`: polling interval." }, "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to scroll at. Use with `scroll_y`." }, "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to scroll at. Use with `scroll_x`." 
} }, @@ -1246,7 +1293,7 @@ impl Tool for ComputerUseTool { "properties": { "action": { "type": "string", - "enum": ["screenshot", "click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "enum": ["screenshot", "click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "list_displays", "focus_display", "paste", "list_apps", "get_app_state", "app_click", "app_type_text", "app_scroll", "app_key_chord", "app_wait_for", "build_interactive_view", "interactive_click", "interactive_type_text", "interactive_scroll", "build_visual_mark_view", "visual_click", "open_app", "open_url", "open_file", "clipboard_get", "clipboard_set", "run_script", "run_apple_script", "get_os_info"], "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) **`open_app`** to launch apps by name. **`run_apple_script`** to run AppleScript (macOS). 3) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 4) Only when above fail: `click_target` / `move_to_target` (AX → OCR → screen coords in one call) before lower-level `click_element`, `move_to_text`, or `mouse_move` + `click`. **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll` supports optional position (`scroll_x`/`scroll_y`). `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." 
}, @@ -1282,7 +1329,7 @@ impl Tool for ComputerUseTool { "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole **or AXSubrole** (e.g. \"Button\", \"TextField\", \"SearchField\")." }, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, "text_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring matched against ANY of AXTitle / AXValue / AXDescription / AXHelp. Best default when the visible label lives in value/description (e.g. AXStaticText cards)." }, - "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`: jump straight to a node returned by the most recent `desktop.get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, + "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`, `app_click`: jump straight to a node returned by the most recent `get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, "app_state_digest": { "type": "string", "description": "For `locate`, `click_element`: optional `state_digest` from the same `get_app_state` call that produced `node_idx`. Stale digest yields AX_IDX_STALE so you re-snapshot." }, "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48). Ignored when `node_idx` is supplied." }, "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination. Priority: `node_idx` > `text_contains` > `title_contains`+`role_substring`." 
}, @@ -1293,7 +1340,32 @@ impl Tool for ComputerUseTool { "screenshot_reset_navigation": { "type": "boolean", "description": "For `screenshot`: reset to full display before this capture." }, "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." }, "app_name": { "type": "string", "description": "For `open_app`: the application name to launch (e.g. \"Safari\", \"WeChat\", \"Visual Studio Code\")." }, + "url": { "type": "string", "description": "For `open_url`: URL to open with the system/default browser." }, + "path": { "type": "string", "description": "For `open_file`: local file path to open with its default handler." }, + "app": { "type": ["string", "object"], "description": "For `open_file`: optional app name. For app-scoped actions: selector object such as `{ \"name\": \"Safari\" }`, `{ \"bundle_id\": \"...\" }`, or `{ \"pid\": 123 }`." }, "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute via `osascript`. macOS only." }, + "script_type": { "type": "string", "enum": ["applescript", "shell", "bash", "powershell", "cmd"], "description": "For `run_script`: script interpreter/type." }, + "timeout_ms": { "type": "integer", "description": "For `run_script`: timeout in milliseconds." }, + "max_output_bytes": { "type": "integer", "description": "For `run_script` / `clipboard_get`: maximum bytes to return." }, + "clear_first": { "type": "boolean", "description": "For `paste`: select all before pasting." 
}, + "submit": { "type": "boolean", "description": "For `paste`: press submit keys after pasting." }, + "submit_keys": { "type": "array", "items": { "type": "string" }, "description": "For `paste`: key chord to submit, default `[\"return\"]`." }, + "display_id": { "type": ["integer", "null"], "description": "For `focus_display` or display-pinned desktop actions: display id, or null to clear the pin." }, + "include_hidden": { "type": "boolean", "description": "For `list_apps`: include hidden/background apps." }, + "only_visible": { "type": "boolean", "description": "For `list_apps`: list only visible apps when true." }, + "target": { "type": "object", "description": "For `app_click`: click target such as `{ \"node_idx\": 3 }`, image/screen coordinates, or OCR text." }, + "focus": { "type": ["object", "null"], "description": "For app-scoped text/scroll actions: optional focus target." }, + "predicate": { "type": "object", "description": "For `app_wait_for`: wait predicate." }, + "opts": { "type": "object", "description": "For `build_interactive_view` / `build_visual_mark_view`: optional view options." }, + "i": { "type": ["integer", "null"], "description": "For interactive/visual actions: element or mark index from the latest view." }, + "dx": { "type": "integer", "description": "For app/interactive scroll actions: horizontal delta." }, + "dy": { "type": "integer", "description": "For app/interactive scroll actions: vertical delta." }, + "mouse_button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For app/interactive/visual click actions." }, + "click_count": { "type": "integer", "minimum": 1, "maximum": 3, "description": "For app click actions." }, + "modifier_keys": { "type": "array", "items": { "type": "string" }, "description": "For app click actions: modifier keys to hold." }, + "wait_ms_after": { "type": "integer", "description": "For app click actions: post-click wait in milliseconds." 
}, + "focus_idx": { "type": "integer", "minimum": 0, "description": "For `app_key_chord`: optional node index to focus first." }, + "poll_ms": { "type": "integer", "description": "For `app_wait_for`: polling interval." }, "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to move pointer before scrolling. Use with `scroll_y`. Requires `use_screen_coordinates`: true." }, "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to move pointer before scrolling. Use with `scroll_x`. Requires `use_screen_coordinates`: true." } }, @@ -1347,6 +1419,28 @@ impl Tool for ComputerUseTool { "ComputerUse cannot run while the session workspace is remote (SSH).".to_string(), )); } + + let action = input + .get("action") + .and_then(|v| v.as_str()) + .ok_or_else(|| BitFunError::tool("action is required".to_string()))?; + + match action { + "open_url" | "open_file" | "clipboard_get" | "clipboard_set" | "run_script" + | "get_os_info" => { + return super::computer_use_actions::ComputerUseActions::new() + .handle_system(action, input, context) + .await; + } + _ => {} + } + + if Self::is_controlhub_migrated_desktop_action(action) { + return super::computer_use_actions::ComputerUseActions::new() + .handle_desktop(action, input, context) + .await; + } + let host = context.computer_use_host.as_ref().ok_or_else(|| { BitFunError::tool( "Computer use is only available in the BitFun desktop app.".to_string(), @@ -1355,11 +1449,6 @@ impl Tool for ComputerUseTool { let host_ref = host.as_ref(); - let action = input - .get("action") - .and_then(|v| v.as_str()) - .ok_or_else(|| BitFunError::tool("action is required".to_string()))?; - match action { "locate" => execute_computer_use_locate(input, context).await, diff --git a/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs b/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs index 15b5346c9..4639f00b5 100644 --- 
a/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs +++ b/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs @@ -48,7 +48,8 @@ pub enum ErrorCode { AppNotFound, /// AX-first desktop: a node `idx` provided by the caller is no longer /// valid because the host has re-dumped the tree since the snapshot - /// the caller saw. Re-acquire via `desktop.get_app_state` and retry. + /// the caller saw. Re-acquire via `ComputerUse` action `get_app_state` + /// and retry. AxNodeStale, /// AX-first desktop: this host cannot inject input events into the /// target app without stealing user focus (e.g. macOS without @@ -59,8 +60,8 @@ pub enum ErrorCode { /// AX-first desktop: the `node_idx` supplied to `click_element` / /// `locate_element` is no longer present in the cached snapshot /// (re-dump happened or window/state churned). Distinct from - /// `AX_NODE_STALE` which is for `app_*` actions; same recovery — - /// re-call `desktop.get_app_state` and reuse the new idx. + /// `AX_NODE_STALE` which is for `app_*` actions; same recovery: re-call + /// `ComputerUse` action `get_app_state` and reuse the new idx. AxIdxStale, /// AX-first desktop: this platform host does not support resolving /// elements by `node_idx` (currently linux/windows). Caller should diff --git a/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs b/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs index e2bc00ba2..6e61d283f 100644 --- a/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs @@ -1,10 +1,12 @@ -//! ControlHub — unified entry point for all control capabilities. +//! ControlHub — unified entry point for browser, terminal, and routing metadata. //! //! Routes requests by `domain` to the appropriate backend: -//! desktop → ComputerUseHost (existing) //! browser → CDP-based browser control (new) //! 
terminal → TerminalApi (existing) -//! system → OS-level utilities (open_app, run_script, etc.) +//! meta → capability and route introspection +//! +//! Local desktop and OS/system actions are intentionally surfaced through the +//! dedicated ComputerUse tool/agent, not through public ControlHub domains. use crate::agentic::tools::browser_control::actions::BrowserActions; use crate::agentic::tools::browser_control::browser_launcher::{ @@ -14,23 +16,17 @@ use crate::agentic::tools::browser_control::cdp_client::CdpClient; use crate::agentic::tools::browser_control::session_registry::{ BrowserSession, BrowserSessionRegistry, }; -use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; -use crate::agentic::tools::computer_use_host::{ - AppClickParams, AppSelector, AppWaitPredicate, ClickTarget, ComputerUseForegroundApplication, - ComputerUseHostRef, InteractiveClickParams, InteractiveScrollParams, InteractiveTypeTextParams, - InteractiveViewOpts, VisualClickParams, VisualMarkViewOpts, -}; use crate::agentic::tools::framework::{ Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, }; -use crate::service::config::global::GlobalConfigManager; -use crate::service::config::types::AIConfig; -use crate::util::elapsed_ms_u64; use crate::util::errors::{BitFunError, BitFunResult}; use async_trait::async_trait; use serde_json::{json, Value}; use std::sync::Arc; +#[cfg(target_os = "linux")] +use super::computer_use_actions::linux_session_info; +use super::computer_use_actions::{truncate_with_marker, which_exists}; use super::control_hub::{err_response, ControlHubError, ErrorCode}; /// Process-wide registry of CDP sessions. Replaces the previous single @@ -41,55 +37,6 @@ use super::control_hub::{err_response, ControlHubError, ErrorCode}; static BROWSER_SESSIONS: std::sync::OnceLock> = std::sync::OnceLock::new(); -/// Per-PID consecutive-failure tracker for the AX-first `app_*` actions. 
-/// Key = target PID, value = `(target_signature, before_digest, count)`. -/// When the same `(action,target)` lands on an unchanged digest twice in a -/// row the dispatcher injects an `app_state.loop_warning` so the model is -/// forced off the failing path on its **next** turn (`/Screenshot policy/ -/// Mandatory screenshot moments` in `claw_mode.md`). -static APP_LOOP_TRACKER: std::sync::OnceLock< - std::sync::Mutex>, -> = std::sync::OnceLock::new(); - -fn loop_tracker_observe( - pid: Option, - action: &str, - target_sig: &str, - before_digest: &str, - after_digest: &str, -) -> Option { - let pid = pid?; - // A digest change means the action mutated the tree — that is real - // progress and resets the streak even if the model picks the same - // target name on purpose (e.g. clicking "Next" repeatedly). - let progressed = before_digest != after_digest; - let sig = format!("{action}:{target_sig}"); - let mut guard = APP_LOOP_TRACKER - .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new())) - .lock() - .ok()?; - let entry = guard - .entry(pid) - .or_insert_with(|| (String::new(), String::new(), 0)); - if progressed { - *entry = (sig, after_digest.to_string(), 1); - return None; - } - if entry.0 == sig && entry.1 == before_digest { - entry.2 = entry.2.saturating_add(1); - } else { - *entry = (sig, before_digest.to_string(), 1); - } - if entry.2 >= 2 { - Some(format!( - "Detected {} consecutive `{}` calls on the same target ({}) without any AX tree mutation (digest unchanged). The target is almost certainly invisible / disabled / in a Canvas-WebGL surface that AX cannot describe. NEXT TURN you MUST: (1) run `desktop.screenshot {{ screenshot_window: false }}` to see the full display, (2) switch tactic — different `node_idx`, different `ocr_text` needle, or a keyboard shortcut. 
Do NOT retry this same target a third time.", - entry.2, action, target_sig - )) - } else { - None - } -} - fn browser_sessions() -> Arc { BROWSER_SESSIONS .get_or_init(|| Arc::new(BrowserSessionRegistry::new())) @@ -140,384 +87,41 @@ impl ControlHubTool { ] } - fn desktop_browser_guard_error( - action: &str, - foreground: Option<&ComputerUseForegroundApplication>, - ) -> ControlHubError { - let app_name = foreground - .and_then(|app| app.name.as_deref()) - .unwrap_or("a web browser"); - ControlHubError::new( - ErrorCode::GuardRejected, - format!( - "desktop.{} is blocked while {} is frontmost. Use ControlHub domain=\"browser\" for all browser interaction; desktop mouse/keyboard browser control is forbidden.", - action, app_name - ), - ) - .with_hints([ - "Use browser.connect to attach via the test port, then drive the page with snapshot/click/fill/press_key", - "For login/cookies/extensions, guide the user to start their default browser with the test port enabled before calling browser.connect", - "For isolated project Web UI testing, use the headless browser flow instead of desktop automation", - ]) - } - - fn is_probably_browser_app(foreground: &ComputerUseForegroundApplication) -> bool { - let name = foreground - .name - .as_deref() - .unwrap_or("") - .to_ascii_lowercase(); - let bundle = foreground - .bundle_id - .as_deref() - .unwrap_or("") - .to_ascii_lowercase(); - - const NAME_HINTS: &[&str] = &[ - "chrome", - "chromium", - "edge", - "brave", - "arc", - "firefox", - "safari", - "browser", - "浏览器", - ]; - const BUNDLE_HINTS: &[&str] = &[ - "chrome", "chromium", "edge", "brave", "arc", "firefox", "safari", "browser", - ]; - - NAME_HINTS.iter().any(|hint| name.contains(hint)) - || BUNDLE_HINTS.iter().any(|hint| bundle.contains(hint)) - } - - async fn desktop_action_targets_browser( - &self, - action: &str, - context: &ToolUseContext, - ) -> Option { - let guarded_actions = [ - "click", - "click_target", - "click_element", - "move_to_target", - "mouse_move", - 
"pointer_move_rel", - "scroll", - "drag", - "key_chord", - "type_text", - "paste", - "locate", - "move_to_text", - ]; - if !guarded_actions.contains(&action) { - return None; - } - let host = context.computer_use_host.as_ref()?; - let snapshot = host.computer_use_session_snapshot().await; - let foreground = snapshot.foreground_application.as_ref()?; - if Self::is_probably_browser_app(foreground) { - return Some(Self::desktop_browser_guard_error(action, Some(foreground))); - } - None - } - - async fn desktop_domain_enabled() -> bool { - if !computer_use_desktop_available() { - return false; - } - let Ok(service) = GlobalConfigManager::get_service().await else { - return false; - }; - let ai: AIConfig = service.get_config(Some("ai")).await.unwrap_or_default(); - ai.computer_use_enabled - } - - fn description_text(desktop_enabled: bool) -> String { - let desktop_domain_doc = if desktop_enabled { - r#"### domain: "desktop" (Computer Use — only available in the BitFun desktop app) - -#### desktop (AX-first, recommended for third-party apps) -- New Codex-style flow that targets a specific application by name / bundle - id / pid and drives it through its Accessibility (AX) tree instead of the - global mouse + screenshot loop. Strongly preferred whenever: - * you need to drive an app that is NOT in the user's foreground, OR - * you must not steal the user's mouse / keyboard focus, OR - * the target widget has a stable AX role / title / identifier (most native - macOS / AppKit / Catalyst / SwiftUI / Electron-with-AX-on apps qualify). -- Capability gating (read first, ALWAYS): `meta.capabilities` returns - `domains.desktop.supports_ax_tree`, `domains.desktop.supports_background_input`, - `domains.desktop.supports_interactive_view`, and - `domains.desktop.supports_visual_mark_view`. - AX tree and background input both `false` → the host cannot do AX-first yet; - fall back to the legacy screenshot/click flow below. 
Background input - `false` while AX tree `true` → AX *reads* work but writes will steal focus; - tell the user. -- Actions (all under `domain: "desktop"`): - * `list_apps { include_hidden? }` → ranked `[{ name, bundle_id?, pid, - is_running, last_used_ms?, launch_count? }]`. Use this to resolve a - fuzzy user phrase ("微信" / "WeChat" / "Cursor") to a concrete - `AppSelector` before any other AX call. - * `get_app_state { app: , max_depth?, focus_window_only? }` - → `{ app, window_title?, tree_text, nodes:[AxNode], digest, captured_at_ms }`. - `tree_text` is the human-readable indent dump (Codex parity); `nodes` is - the structured array with stable `idx` you pass to subsequent actions. - `digest` is a sha1 of the tree — use it to detect "did anything change?" - cheaply without re-diffing. - * `app_click { app, target: { kind:"node_idx", idx } | { kind:"image_xy", x, y, screenshot_id? } | { kind:"image_grid", x0, y0, width, height, rows, cols, row, col, intersections?, screenshot_id? } | { kind:"visual_grid", rows, cols, row, col, intersections? } | { kind:"screen_xy", x, y }, - click_count?, mouse_button?, modifier_keys?, wait_ms_after? }` → returns the - fresh `AppStateSnapshot` after the click. Prefer `node_idx` over - coordinate targets whenever the target appears in `nodes`. For Canvas / - SVG / WebGL/custom-drawn surfaces, prefer `image_xy`: x/y are pixels in - the screenshot attached to the latest `get_app_state` / `app_click`. - Always pass `screenshot_id` from `app_state.screenshot_meta` when present - so the host maps against the exact frame you clicked from. - For board/grid/canvas controls, prefer `image_grid` over raw `image_xy`: - specify the board rectangle in screenshot pixels and a zero-based - `row`/`col`; set `intersections:true` for Go/Gomoku-style line - intersections and `false`/omit it for cell centers. 
- If the grid rectangle is not known, use `visual_grid`: the host captures - the app, detects the regular visual grid from pixels, then clicks the - requested zero-based row/col using the same captured coordinate basis. - For games / animated WebViews, pass `wait_ms_after` (e.g. 300–600) so the - returned screenshot captures the settled board. - * `build_visual_mark_view { app, opts?: { max_points?, region?, include_grid? } }` - → returns a numbered screenshot grid for arbitrary visual targets that - AX/OCR cannot name (Canvas, games, maps, drawings, icon-only panels). - Use this after `get_app_state` / `build_interactive_view` does not expose - the target. Pass `region` in screenshot pixels to refine into a smaller - area on the next attempt. - * `visual_click { app, i, before_view_digest?, click_count?, mouse_button?, wait_ms_after?, return_view? }` - → clicks the numbered visual mark using the exact screenshot coordinate - basis from the marked view, then returns fresh app state. - * `app_type_text { app, text, focus?: ClickTarget }` — focuses the optional - target first, then types. Honors IME / emoji / CJK via paste-style - injection where the host supports it. - * `app_scroll { app, focus?: ClickTarget, dx, dy }` — pixel deltas inside - the focused scroll container; use negative `dy` to scroll content up. - * `app_key_chord { app, keys:["command","shift","p"], focus_idx? }` — sends - a chord to the app *without* surfacing a global key event; modifier - names match the legacy `key_chord` (command/control/option|alt/shift). - * `app_wait_for { app, predicate, timeout_ms?, poll_ms? }` where - `predicate` is one of `{ kind:"digest_changed", prev_digest }`, - `{ kind:"title_contains", needle }`, - `{ kind:"role_enabled", role, title? }`, `{ kind:"node_enabled", idx }`. - This is the AX equivalent of the `wait` + re-screenshot loop and is - REQUIRED between actions when the next step depends on a state change. 
-- Selector shape: `{ pid }` is most precise (always survives renames); - `{ bundle_id }` is next-best (survives localization); `{ name }` matches - on the localized window/app name. Combine fields and the host picks the - strongest match. Unresolved selector → `error.code = APP_NOT_FOUND`. -- Stale node refs (e.g. you cached `idx=42` from a snapshot, then the app - re-rendered) → `error.code = AX_NODE_STALE`. Always re-call - `get_app_state` and re-resolve by role/title/identifier — never carry an - `idx` across user-visible mutations without `app_wait_for`. -- If `supports_background_input` is `false` and the host still cannot - silently inject into the target, AX-first writes return - `error.code = BACKGROUND_INPUT_UNAVAILABLE` with a hint pointing at the - legacy foreground click; don't retry without a strategy change. -- Envelope additions for AX-first results: each successful response embeds - `target_app`, `app_state` (text dump), `app_state_nodes` (structured), - `before_digest` (the digest seen *before* the action), `after_digest` (the - digest *after*), and `background_input: bool` so the agent can verify the - action landed without stealing focus. - -#### desktop (legacy screenshot + global pointer) -- screenshot, click_target, move_to_target, click, click_element, mouse_move, - pointer_move_rel, scroll, drag, key_chord, type_text, paste, wait, locate, - move_to_text. -- **`click_target` / `move_to_target`** — preferred mouse primitive for - common "click/move to this visible thing" requests. One call resolves the - target by AX (`node_idx`, text/role/title/identifier filters, or - `target_text`) first, OCR second (`target_text` / `text_query`), and - explicit global `x`/`y` last. This collapses the old locate → move → - guarded-click round-trip into a single authoritative action. 
-- **`screenshot`** — exactly two possible outputs: the focused application - window (default, via Accessibility) OR the full display (fallback when - AX cannot resolve the window). No crop / quadrant / mouse-centered - options exist anymore. Old crop parameters (`screenshot_crop_center_x/y`, - `screenshot_navigate_quadrant`, `screenshot_reset_navigation`, - `screenshot_implicit_center`, `point_crop_half_extent_native`) are - silently ignored. The only param that still has meaning is - `screenshot_window: true` — and it just reaffirms the default; you - rarely need to pass it. -- **`paste { text, clear_first?, submit?, submit_keys? }`** — STRONGLY PREFER - this over `type_text` for any non-trivial input (CJK, emoji, multi-line, - contact names, message bodies, anything > ~15 chars). Internally does - `clipboard_set` + cmd/ctrl+v, optionally cmd/ctrl+a first to replace - existing content, and optionally Return after to submit. Collapses the - canonical "type a name into search and press enter" / "send a message" - sequence into a single tool call AND avoids every IME failure mode that - `type_text` is subject to. Use `submit_keys: ["command","return"]` for - Slack-style apps where Return inserts a newline. -- `type_text` is a fallback for short Latin-only text into a focused input - where you have no clipboard helper (Linux without wl-clipboard / xclip). - In every other case `paste` is faster and more reliable. -- `key_chord` accepts EITHER `{"keys":["command","v"]}` (canonical) OR a - bare `{"keys":"escape"}` / `{"key":"return"}` for single keys; both - shapes are coerced. Modifier names: command, control, option/alt, shift. -- Multi-display routing (FIRST step on multi-monitor setups): - * `list_displays` — returns every attached screen with `display_id`, - `is_primary`, `is_active`, `has_pointer`, origin/size, and `scale_factor`. 
- Always inspect this list before issuing screen-coordinate actions when - `interaction_state.displays` has more than one entry; do NOT assume the - cursor is on the screen the user is looking at. - * `focus_display` — `{ display_id }` pins ALL subsequent screenshots / - clicks / locates to that display until cleared. Pass `{ display_id: null }` - (or omit) to fall back to the legacy "screen under the mouse" behavior. - Pinning invalidates any cached screenshot, so the next `screenshot` is - guaranteed to come from the chosen display. -- `interaction_state.displays` and `interaction_state.active_display_id` - are present in every desktop tool result and tell you which display the - next action will target. If that does not match the user's intent, - either call `desktop.focus_display` BEFORE the next `screenshot` / `click`, - OR pass `display_id: ` directly inside the next action's params — - every desktop action accepts it as a one-shot pin equivalent (sticky: - the pin persists for follow-up actions until you set `display_id: null`). -- Single-display setup (most users): you do NOT need `list_displays` / - `focus_display`. Just call `screenshot` / `click_element` / etc. - directly — `interaction_state.displays.length === 1` is your signal. -"# - } else { - r#"### domain: "desktop" -- Not available in this session because Computer Use is disabled. -- Do not attempt mouse, keyboard, OCR, display, or external desktop app control actions. -- To enable these actions, turn on the `computer use` setting in session configuration and use the BitFun desktop app. -"# - }; - - format!( - r#"ControlHub — the SOLE control entry point for everything the agent can drive. - -You will not find a separate `ComputerUse` tool: every desktop, browser, -terminal-signalling and system action is reachable through this one tool -via `{{ domain, action, params }}`. - -## Decision tree — which domain do I use? - -1. 
The user wants to drive a website / web app in their *real* browser - (preserving cookies, login, extensions)? - → **domain: "browser"** (drives the user's default Chromium-family browser via CDP) - -2. The user wants to operate another desktop application - (third-party app windows, OS dialogs, system-wide keyboard / mouse, accessibility)? - → **domain: "desktop"** (Computer Use: screenshot, click, key_chord, locate, ...) - -3. The user wants to launch an app, run a shell / AppleScript, or query OS info? - → **domain: "system"** - -4. The user wants to signal an existing terminal session - (kill, send SIGINT) — *not* run new commands; for that use the `Bash` tool? - → **domain: "terminal"** + fn description_text() -> String { + r#"ControlHub — the unified control entry point for browser, terminal, and routing metadata. -If you are unsure between two domains: prefer the smallest blast radius -(`browser` < `desktop` < `system`). +Use this tool via `{ domain, action, params }` for browser automation, terminal signalling, and capability/routing introspection. Local computer and operating-system actions have moved out of ControlHub: use the dedicated `ComputerUse` tool/agent for desktop UI control, screenshots, OCR, mouse/keyboard input, app launching, file/url opening, clipboard access, OS facts, and local scripts. -## Unified response envelope +## Domains -Every call returns a JSON object with a stable shape: - - // success - {{ "ok": true, "domain": "...", "action": "...", "data": {{ ... 
}} }} - // failure (still delivered as a normal tool result, NOT an exception) - {{ "ok": false, "domain": "...", "action": "...", - "error": {{ "code": "STALE_REF" | "NOT_FOUND" | "AMBIGUOUS" | "GUARD_REJECTED" - | "WRONG_DISPLAY" | "WRONG_TAB" | "INVALID_PARAMS" - | "PERMISSION_DENIED" | "TIMEOUT" | "NOT_AVAILABLE" - | "MISSING_SESSION" | "FRONTEND_ERROR" | "INTERNAL" - | "APP_NOT_FOUND" | "AX_NODE_STALE" | "AX_IDX_STALE" - | "AX_IDX_NOT_SUPPORTED" | "DESKTOP_COORD_OUT_OF_DISPLAY" - | "BACKGROUND_INPUT_UNAVAILABLE", - "message": "...", "hints": [ "...next step..." ] }} }} - -Branch on `ok` and on `error.code` deterministically. Never scrape the English `message` -for control flow. - -## Domains and actions - -### domain: "browser" (DOM/CDP-only browser control; never use desktop mouse/keyboard for browser interaction) +### domain: "browser" (DOM/CDP browser control) - Two browser modes: - * `connect {{ mode: "headless" }}` — attach to a headless test browser on the test port for project Web UI testing that does **not** depend on user login state. - * `connect {{ mode: "default" }}` (default) — attach to the user's default browser via CDP for flows that require login state, cookies, extensions, or the user's real profile. -- In **all** browser cases, control the page through DOM/CDP actions only. Do **not** use `domain: "desktop"` mouse/keyboard actions to drive a browser. -- connect, navigate, snapshot, click, fill, type, select, press_key, scroll, wait, - get_text, get_url, get_title, screenshot, evaluate, close, list_pages, tab_query, - switch_page, list_sessions. -- Fast path (target a known tab in ONE call): - * `connect {{ target_url? , target_title? , activate? }}` finds the first - open tab whose URL / title contains the substring, registers it as the - default session AND brings it to the front. Use this instead of - `connect` → `list_pages` → `switch_page` for the common - "drive my Gmail / GitHub PR / docs tab" flow. 
If the filter matches no - tab you get `error.code = WRONG_TAB` (no silent fallback). -- Tab routing: - * `list_pages` returns every page/tab the browser exposes; each entry - carries `is_default_session` so you can tell which one ControlHub will - drive next without an extra `list_sessions` round-trip. - * `tab_query` (`{{ url_contains?, title_contains?, only_pages?, limit? }}`) - is the preferred filter when you need to inspect candidates before - committing to one. - * `switch_page` (`{{ page_id, activate? }}`) sets the default CDP session - AND, by default, calls `Page.bringToFront` so the user actually sees - the tab being driven. Pass `activate: false` to keep the operation - invisible (e.g. background scraping). -- Workflow: connect → navigate → snapshot (returns @e1, @e2 ... refs) → click/fill using refs. -- `snapshot` now traverses **open shadow roots** and **same-origin iframes**; - each element entry includes `scope` (`document`/`shadow`/`iframe`) and - `frame_path` so you can tell where in the DOM tree it lives. Pass - `with_backend_node_ids: true` to also receive a stable - `backend_node_id` per element (CDP DOM id, survives re-renders). + * `connect { mode: "headless" }` — attach to a headless test browser on the test port for project Web UI testing that does not depend on user login state. + * `connect { mode: "default" }` (default) — attach to the user's default browser via CDP for flows that require login state, cookies, extensions, or the user's real profile. +- Actions: connect, navigate, snapshot, click, fill, type, select, press_key, scroll, wait, get_text, get_url, get_title, screenshot, evaluate, close, list_pages, tab_query, switch_page, list_sessions. +- Workflow: connect -> navigate -> snapshot (returns @e1, @e2 ... refs) -> click/fill using refs. - Take a fresh snapshot after any DOM mutation; stale refs return `error.code = STALE_REF`. 
-{desktop_domain_doc} ### domain: "terminal" - list_sessions, kill (`terminal_session_id`), interrupt (`terminal_session_id`). - Use the `Bash` tool to *run* commands; this domain only signals existing sessions. -- Fast path: if there is exactly ONE live terminal session, you may omit - `terminal_session_id` and ControlHub will target it automatically. With - zero live sessions you get `error.code = MISSING_SESSION`; with multiple - you get `AMBIGUOUS` plus the candidate ids in `error.hints`. Otherwise - call `list_sessions` first. +- Use the `Bash` tool to run new commands; this domain only signals existing terminal sessions. -### domain: "system" -- open_app (`app_name`), open_url (`url`), open_file (`path`, `app?`), - clipboard_get (`max_bytes?`), clipboard_set (`text`), - run_script (`script`, `script_type` = applescript|shell, optional - `timeout_ms` ≤ 5 min, `max_output_bytes` ≤ 256 KB), get_os_info. -- `open_url` is the right tool when the goal is "show this URL to the user" - (no CDP, no driving). Use `domain: "browser"` only when you actually need - to interact with the page. -- `open_file` opens a local file with its default handler (or an explicit - `app` on macOS) — high-frequency for "open this PDF / picture / spreadsheet". -- `clipboard_get` / `clipboard_set` are the universal cross-app bridge: - the cheapest way to move text between apps that you'd otherwise have to - drive separately. `clipboard_get` returns `{{ text, byte_length, truncated }}`; - `clipboard_set {{ text }}` is the inverse. On Linux this requires - wl-clipboard / xclip / xsel; missing-helper failures return `NOT_AVAILABLE`. -- `run_script` enforces the timeout and truncates large stdout/stderr; on - timeout it returns `error.code = TIMEOUT` and the child process is killed. - `get_os_info` includes `os`, `arch`, `os_version`, `hostname`. +### domain: "meta" +- `capabilities` — returns `{ domains: { browser, terminal, meta }, host: { os, arch }, schema_version }`. 
+- `route_hint` — maps a free-form intent to the appropriate ControlHub domain, or tells you to use `ComputerUse` for local computer/system/desktop work. -### domain: "meta" (introspection — call this BEFORE long control flows) -- `capabilities` — returns `{{ domains: {{ desktop, browser, terminal, system, meta }}, - host: {{ os, arch }}, schema_version }}`. Use it to confirm which domains are - actually wired up on this runtime instead of guessing from the description. -- `route_hint` (`{{ intent }}`) — heuristic mapping of a free-form user intent - ("把 BitFun 默认模型改成 Kimi") to a ranked list of candidate domains so the - model has a sanity check before it commits to one. Always confirm with - `meta.capabilities` and the domain docs; this is only a hint. +## Unified Response Envelope -## Workflow tips -1. For cross-domain workflows (browser data → desktop paste, system launch → browser attach), - call actions sequentially and verify each step's `ok` field before chaining. -2. After any UI mutation, re-acquire state (browser: snapshot, desktop: screenshot) - before the next action. -3. When the model is the only one driving inputs, `wait` 200–500 ms after a click that - triggers an animation before re-observing."#, - desktop_domain_doc = desktop_domain_doc, - ) +Every call returns a stable JSON shape: + + // success + { "ok": true, "domain": "...", "action": "...", "data": { ... } } + // failure + { "ok": false, "domain": "...", "action": "...", "error": { "code": "...", "message": "...", "hints": ["..."] } } + +Branch on `ok` and `error.code`, not on English messages. +"# + .to_string() } async fn dispatch( @@ -529,27 +133,34 @@ for control flow. 
) -> BitFunResult> { match domain { "desktop" => { - if !Self::desktop_domain_enabled().await { - return Ok(err_response( - "desktop", - action, - ControlHubError::new( - ErrorCode::NotAvailable, - "Computer Use is disabled for this session.", - ) - .with_hint( - "Enable computer use in session settings to expose desktop control actions.", - ), - )); - } - self.handle_desktop(action, params, context).await + Ok(err_response( + "desktop", + action, + ControlHubError::new( + ErrorCode::InvalidParams, + "The desktop domain has moved out of ControlHub.", + ) + .with_hint( + "Use the dedicated ComputerUse tool/agent for screenshots, OCR, mouse, keyboard, and desktop app control.", + ), + )) } "browser" => self.handle_browser(action, params).await, "terminal" => self.handle_terminal(action, params, context).await, - "system" => self.handle_system(action, params, context).await, + "system" => Ok(err_response( + "system", + action, + ControlHubError::new( + ErrorCode::InvalidParams, + "The system domain has moved out of ControlHub.", + ) + .with_hint( + "Use the dedicated ComputerUse tool/agent for open_app, open_url, open_file, clipboard, OS info, and local scripts.", + ), + )), "meta" => self.handle_meta(action, params, context).await, other => Err(BitFunError::tool(format!( - "Unknown domain: '{}'. Valid domains: desktop, browser, terminal, system, meta", + "Unknown domain: '{}'. Valid ControlHub domains: browser, terminal, meta. Use ComputerUse for desktop/system actions.", other ))), } @@ -571,14 +182,13 @@ for control flow. ) -> BitFunResult> { match action { "capabilities" => { - let desktop_available = Self::desktop_domain_enabled().await; // `terminal` (TerminalApi) is delivered through a global // registry rather than a field on the context, so we can't be // 100% sure here without round-tripping. 
We report "likely - // available iff desktop is available" because that bridge only - // exists in BitFun's desktop runtime; the actual call will + // available iff a desktop host is present" because that bridge + // only exists in BitFun's desktop runtime; the actual call will // surface a clean error if the bridge is offline. - let likely_terminal_available = desktop_available; + let likely_terminal_available = context.computer_use_host.is_some(); let browser_default = browser_sessions().default_id().await; let browser_session_count = browser_sessions().list().await.len(); let os = std::env::consts::OS; @@ -603,18 +213,18 @@ for control flow. // Same script_types probe as get_os_info — duplicated here // because callers often hit `meta.capabilities` first and we // don't want to force an extra system round-trip. - let mut script_types: Vec<&'static str> = vec!["shell"]; + let mut _script_types: Vec<&'static str> = vec!["shell"]; if cfg!(target_os = "macos") { - script_types.push("applescript"); + _script_types.push("applescript"); } if which_exists("bash") { - script_types.push("bash"); + _script_types.push("bash"); } if which_exists("pwsh") || which_exists("powershell") { - script_types.push("powershell"); + _script_types.push("powershell"); } if cfg!(target_os = "windows") { - script_types.push("cmd"); + _script_types.push("cmd"); } #[cfg(target_os = "linux")] @@ -625,30 +235,8 @@ for control flow. 
Option, ) = (None, None); - let desktop_host = context.computer_use_host.as_ref(); - let desktop_ax_tree = desktop_host - .map(|host| host.supports_ax_tree()) - .unwrap_or(false); - let desktop_background_input = desktop_host - .map(|host| host.supports_background_input()) - .unwrap_or(false); - let desktop_interactive_view = desktop_host - .map(|host| host.supports_interactive_view()) - .unwrap_or(false); - let desktop_visual_mark_view = desktop_host - .map(|host| host.supports_visual_mark_view()) - .unwrap_or(false); - let body = json!({ "domains": { - "desktop": { - "available": desktop_available, - "reason": if desktop_available { Value::Null } else { json!("Only available in the BitFun desktop app") }, - "supports_ax_tree": desktop_ax_tree, - "supports_background_input": desktop_background_input, - "supports_interactive_view": desktop_interactive_view, - "supports_visual_mark_view": desktop_visual_mark_view, - }, "browser": { "available": true, "default_session_id": browser_default, @@ -657,10 +245,6 @@ for control flow. "cdp_supported": browser_cdp_supported, }, "terminal": { "available": likely_terminal_available, "reason": if likely_terminal_available { Value::Null } else { json!("TerminalApi is only available in contexts that registered it") } }, - "system": { - "available": true, - "script_types": script_types, - }, "meta": { "available": true }, }, "host": { @@ -743,9 +327,9 @@ for control flow. if lower.contains(kw) { push( &mut suggestions, - "desktop", + "ComputerUse", 75, - "Matches third-party desktop window keywords", + "Matches local desktop/system keywords; use the ComputerUse tool/agent", ); break; } @@ -763,7 +347,12 @@ for control flow. } for kw in system_kw { if lower.contains(kw) { - push(&mut suggestions, "system", 70, "Matches OS/launch keywords"); + push( + &mut suggestions, + "ComputerUse", + 70, + "Matches OS/launch keywords; use the ComputerUse tool/agent", + ); break; } } @@ -794,2628 +383,766 @@ for control flow. 
} } - // ── Desktop domain ───────────────────────────────────────────────── + async fn handle_browser(&self, action: &str, params: &Value) -> BitFunResult> { + let port = params + .get("port") + .and_then(|v| v.as_u64()) + .map(|p| p as u16) + .unwrap_or(DEFAULT_CDP_PORT); - async fn handle_desktop( - &self, - action: &str, - params: &Value, - context: &ToolUseContext, - ) -> BitFunResult> { - let host = context.computer_use_host.as_ref().ok_or_else(|| { - BitFunError::tool( - "Desktop control is only available in the BitFun desktop app".to_string(), - ) - })?; + let session_id_param = params + .get("session_id") + .and_then(|v| v.as_str()) + .map(str::to_string); - // Phase 2: handle multi-display routing actions directly. Going - // through the legacy ComputerUseTool dispatch isn't useful here - // because there is no equivalent action there, and we want these - // to be first-class ControlHub primitives so the model can pin a - // target display before any screenshot/click flow. match action { - "list_displays" => { - let displays = host.list_displays().await?; - let active = host.focused_display_id(); - let count = displays.len(); - return Ok(vec![ToolResult::ok( - json!({ - "displays": displays, - "active_display_id": active, - }), - Some(format!("{} display(s) detected", count)), - )]); - } - // High-leverage UX primitive: paste arbitrary text into the - // currently focused input via the system clipboard, optionally - // clearing first and submitting after. This collapses the - // canonical IM/search flow: - // - // clipboard_set + key_chord(cmd+v) + key_chord(return) - // - // ...into a single tool call. It is also the **only** robust way - // to enter CJK / emoji / multi-line text — `type_text` goes - // through the per-character key path and is at the mercy of - // every IME on the host. This is exactly the pattern Codex - // uses (`pbcopy` + cmd+v) to keep WeChat / iMessage flows - // smooth. 
- // - // Params: - // - text (required) — text to paste - // - clear_first (bool, default false) — cmd+a before paste, - // so the new text REPLACES whatever was there - // - submit (bool, default false) — press Return after - // paste; switches to "send the message" mode - // - submit_keys (array, default ["return"]) — override the - // submit chord (e.g. ["command","return"] for - // Slack / multi-line apps) - // - // Returns the same envelope as a `key_chord` so the model can - // chain a verification screenshot exactly as before. - "paste" => { - let text = params - .get("text") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] desktop.paste requires 'text'\nHints: example { \"action\":\"paste\", \"text\":\"hello\", \"submit\":true }" - .to_string(), - ) - })?; - let clear_first = params - .get("clear_first") - .and_then(|v| v.as_bool()) - .unwrap_or(false); - let submit = params - .get("submit") - .and_then(|v| v.as_bool()) - .unwrap_or(false); - let submit_keys: Vec = match params.get("submit_keys") { - Some(Value::Array(arr)) => arr - .iter() - .filter_map(|v| v.as_str().map(|s| s.to_string())) - .collect(), - Some(Value::String(s)) => vec![s.to_string()], - _ => vec!["return".to_string()], - }; - - if let Err(e) = clipboard_write(text).await { - return Ok(err_response( - "desktop", - "paste", - ControlHubError::new( - ErrorCode::NotAvailable, - format!("Clipboard write failed: {}", e), - ) - .with_hint( - "Fall back to type_text or check that wl-clipboard / xclip is installed (Linux only)", - ), - )); - } - - let paste_chord = match std::env::consts::OS { - "macos" => vec!["command".to_string(), "v".to_string()], - _ => vec!["control".to_string(), "v".to_string()], - }; + "connect" => { + let mode = Self::browser_connect_mode_from_params(params); + let kind = BrowserLauncher::detect_default_browser()?; - if clear_first { - let select_all = match std::env::consts::OS { - "macos" => vec!["command".to_string(), 
"a".to_string()], - _ => vec!["control".to_string(), "a".to_string()], - }; - host.key_chord(select_all).await?; - } - host.key_chord(paste_chord).await?; - if submit { - host.computer_use_trust_pointer_after_text_input(); - host.key_chord(submit_keys.clone()).await?; + if mode == "headless" { + if !BrowserLauncher::is_cdp_available(port).await { + return Ok(err_response( + "browser", + "connect", + ControlHubError::new( + ErrorCode::NotAvailable, + format!( + "Headless browser test port {} is not available. Start the dedicated headless browser first, then connect via ControlHub browser actions.", + port + ), + ) + .with_hints(Self::headless_browser_connect_hints(port)), + )); + } } - let summary = match (clear_first, submit) { - (false, false) => format!("Pasted {} chars", text.chars().count()), - (true, false) => { - format!("Replaced focused field with {} chars", text.chars().count()) - } - (false, true) => format!("Pasted {} chars and submitted", text.chars().count()), - (true, true) => { - format!("Replaced + submitted ({} chars)", text.chars().count()) - } + let user_data_dir = params.get("user_data_dir").and_then(|v| v.as_str()); + let launch_result = if mode == "headless" { + LaunchResult::AlreadyConnected + } else { + BrowserLauncher::launch_with_cdp_opts(&kind, port, user_data_dir).await? }; - return Ok(vec![ToolResult::ok( - json!({ - "success": true, - "action": "paste", - "char_count": text.chars().count(), - "byte_length": text.len(), - "clear_first": clear_first, - "submitted": submit, - "submit_keys": if submit { Some(submit_keys) } else { None }, - }), - Some(summary), - )]); - } - // ── AX-first actions (Codex parity) ─────────────────────── - // These bypass the legacy ComputerUseTool because they - // operate on the new typed AppSelector / AxNode envelope. 
- "list_apps" - | "get_app_state" - | "app_click" - | "app_type_text" - | "app_scroll" - | "app_key_chord" - | "app_wait_for" - | "build_interactive_view" - | "interactive_click" - | "interactive_type_text" - | "interactive_scroll" - | "build_visual_mark_view" - | "visual_click" => { - return self.handle_desktop_ax(host, action, params).await; - } - "focus_display" => { - // Accept `null` (or omitted `display_id`) to clear the pin - // and fall back to "screen under the pointer". An explicit - // numeric id pins that display until cleared. - let display_id = match params.get("display_id") { - Some(Value::Null) | None => None, - Some(v) => Some(v.as_u64().ok_or_else(|| { - BitFunError::tool( - "focus_display: 'display_id' must be a non-negative integer or null" - .to_string(), - ) - })? as u32), - }; - host.focus_display(display_id).await?; - let displays = host.list_displays().await?; - let summary = match display_id { - Some(id) => format!("Pinned display {}", id), - None => "Cleared display pin (will follow mouse)".to_string(), - }; - return Ok(vec![ToolResult::ok( - json!({ - "active_display_id": display_id, - "displays": displays, - }), - Some(summary), - )]); - } - _ => {} - } + // UX shortcut: a frequent flow is "drive my Gmail tab" / + // "drive the GitHub PR I'm looking at". Without `target_*` + // the model needed `connect` → `list_pages` → `switch_page` + // (3 round-trips and one chance to pick the wrong id). With + // `target_url` / `target_title` we collapse those into a + // single `connect` call: pick the first page whose URL or + // title contains the substring, register it as the default + // session, and bring it to the front. 
+ let target_url = params + .get("target_url") + .and_then(|v| v.as_str()) + .map(str::to_lowercase); + let target_title = params + .get("target_title") + .and_then(|v| v.as_str()) + .map(str::to_lowercase); + let activate = params + .get("activate") + .and_then(|v| v.as_bool()) + .unwrap_or(true); - if let Some(err) = self.desktop_action_targets_browser(action, context).await { - return Ok(err_response("desktop", action, err)); - } + match &launch_result { + LaunchResult::AlreadyConnected | LaunchResult::Launched => { + let pages = CdpClient::list_pages(port).await?; + let connected_browser = if mode == "headless" { + "Headless test browser".to_string() + } else { + kind.to_string() + }; - // UX shortcut: every screen-coordinate action accepts an optional - // `display_id`. If present (and different from the currently pinned - // display), pin it BEFORE forwarding so the model doesn't need a - // separate `focus_display` round-trip. Pin is sticky — subsequent - // actions on the same screen don't need to re-specify. Pass - // `display_id: null` to clear the pin in the same call. - if let Some(v) = params.get("display_id") { - let target = match v { - Value::Null => None, - v => Some(v.as_u64().ok_or_else(|| { - BitFunError::tool( - "display_id must be a non-negative integer or null".to_string(), - ) - })? as u32), - }; - if host.focused_display_id() != target { - host.focus_display(target).await?; - } - } + // Selection: explicit target_* > first real page > first. 
+ let matched_by_target = if target_url.is_some() || target_title.is_some() { + pages.iter().find(|p| { + if p.web_socket_debugger_url.is_none() { + return false; + } + let url_ok = target_url + .as_ref() + .map(|n| p.url.to_lowercase().contains(n)) + .unwrap_or(true); + let title_ok = target_title + .as_ref() + .map(|n| p.title.to_lowercase().contains(n)) + .unwrap_or(true); + p.page_type.as_deref() == Some("page") && url_ok && title_ok + }) + } else { + None + }; - let mut cu_input = params.clone(); - if let Value::Object(ref mut map) = cu_input { - map.insert("action".to_string(), json!(action)); - // Strip the ControlHub-only field so the legacy ComputerUseTool - // doesn't trip on an unrecognised parameter. - map.remove("display_id"); - } + // Tell the model when its filter found nothing instead + // of silently falling back to the first tab and + // confusing the next action. + if (target_url.is_some() || target_title.is_some()) + && matched_by_target.is_none() + { + return Ok(err_response( + "browser", + "connect", + ControlHubError::new( + ErrorCode::WrongTab, + format!( + "No open tab matched target_url={:?} target_title={:?}", + target_url, target_title + ), + ) + .with_hints([ + "Call browser.list_pages or browser.tab_query first to inspect open tabs", + "Loosen the substring (e.g. 
domain only) and try again", + ]), + )); + } - let cu_tool = super::computer_use_tool::ComputerUseTool::new(); - cu_tool.call_impl(&cu_input, context).await - } + let page = matched_by_target + .or_else(|| { + pages.iter().find(|p| { + p.page_type.as_deref() == Some("page") + && p.web_socket_debugger_url.is_some() + }) + }) + .or_else(|| pages.first()) + .ok_or_else(|| { + BitFunError::tool("No browser pages found via CDP".to_string()) + })?; + let ws_url = page.web_socket_debugger_url.as_ref().ok_or_else(|| { + BitFunError::tool("Page has no WebSocket debugger URL".to_string()) + })?; + let client = CdpClient::connect(ws_url).await?; + let version = CdpClient::get_version(port).await?; + let session = BrowserSession { + session_id: page.id.clone(), + port, + client: Arc::new(client), + }; + browser_sessions().register(session.clone()).await; - // ── Desktop AX-first dispatch (Codex parity) ────────────────────── - // Routes the seven new app-targeted actions through the typed - // `ComputerUseHost` API. Every successful response carries a - // unified envelope: `target_app`, `background_input`, - // `before_digest` and (for state queries) `app_state` / - // `app_state_nodes` so the model can reason about the AX tree - // before/after each action without re-querying. 
- async fn handle_desktop_ax( - &self, - host: &ComputerUseHostRef, - action: &str, - params: &Value, - ) -> BitFunResult> { - // ── Helpers ───────────────────────────────────────────────── - fn parse_selector(v: &Value) -> BitFunResult { - let obj = v.get("app").ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] missing 'app' selector (pid|bundle_id|name)".to_string(), - ) - })?; - let sel: AppSelector = serde_json::from_value(obj.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] bad 'app' selector: {} (expect {{pid|bundle_id|name}})", - e - )) - })?; - if sel.pid.is_none() && sel.bundle_id.is_none() && sel.name.is_none() { - return Err(BitFunError::tool( - "[INVALID_PARAMS] 'app' must include at least one of pid|bundle_id|name" - .to_string(), - )); - } - Ok(sel) - } - - fn parse_click_target(v: &Value) -> BitFunResult { - if v.get("kind").is_some() { - return serde_json::from_value(v.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] bad ClickTarget: {} (expected {{\"kind\":\"node_idx\",\"idx\":N}}, {{\"kind\":\"image_xy\",\"x\":0,\"y\":0}}, {{\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"screen_xy\",\"x\":0,\"y\":0}}, or {{\"kind\":\"ocr_text\",\"needle\":\"...\"}})", - e - )) - }); - } - if let Some(idx) = v.get("node_idx").and_then(|x| x.as_u64()) { - return Ok(ClickTarget::NodeIdx { idx: idx as u32 }); - } - if let Some(obj) = v.get("screen_xy") { - let x = obj.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] screen_xy target requires numeric x".to_string(), - ) - })?; - let y = obj.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] screen_xy target requires numeric y".to_string(), - ) - })?; - return Ok(ClickTarget::ScreenXy { 
x, y }); - } - if let Some(obj) = v.get("image_xy") { - let x = obj.get("x").and_then(|x| x.as_i64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] image_xy target requires integer x".to_string(), - ) - })?; - let y = obj.get("y").and_then(|y| y.as_i64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] image_xy target requires integer y".to_string(), - ) - })?; - return Ok(ClickTarget::ImageXy { - x: x as i32, - y: y as i32, - screenshot_id: obj - .get("screenshot_id") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()), - }); - } - if let Some(obj) = v.get("image_grid") { - let target = json!({ - "kind": "image_grid", - "x0": obj.get("x0").cloned().unwrap_or(Value::Null), - "y0": obj.get("y0").cloned().unwrap_or(Value::Null), - "width": obj.get("width").cloned().unwrap_or(Value::Null), - "height": obj.get("height").cloned().unwrap_or(Value::Null), - "rows": obj.get("rows").cloned().unwrap_or(Value::Null), - "cols": obj.get("cols").cloned().unwrap_or(Value::Null), - "row": obj.get("row").cloned().unwrap_or(Value::Null), - "col": obj.get("col").cloned().unwrap_or(Value::Null), - "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), - "screenshot_id": obj.get("screenshot_id").cloned().unwrap_or(Value::Null), - }); - return serde_json::from_value(target).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] bad image_grid target: {} (need x0,y0,width,height,rows,cols,row,col; optional intersections)", - e - )) - }); - } - if let Some(obj) = v.get("visual_grid") { - let target = json!({ - "kind": "visual_grid", - "rows": obj.get("rows").cloned().unwrap_or(Value::Null), - "cols": obj.get("cols").cloned().unwrap_or(Value::Null), - "row": obj.get("row").cloned().unwrap_or(Value::Null), - "col": obj.get("col").cloned().unwrap_or(Value::Null), - "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), - "wait_ms_after_detection": obj.get("wait_ms_after_detection").cloned().unwrap_or(Value::Null), - }); 
- return serde_json::from_value(target).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] bad visual_grid target: {} (need rows,cols,row,col; optional intersections)", - e - )) - }); - } - if v.get("x").is_some() || v.get("y").is_some() { - let x = v.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] screen target requires numeric x".to_string(), - ) - })?; - let y = v.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] screen target requires numeric y".to_string(), - ) - })?; - return Ok(ClickTarget::ScreenXy { x, y }); - } - if let Some(ocr) = v.get("ocr_text") { - let needle = ocr - .get("needle") - .or_else(|| ocr.get("text")) - .and_then(|x| x.as_str()) - .ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] ocr_text target requires needle".to_string(), - ) - })?; - return Ok(ClickTarget::OcrText { - needle: needle.to_string(), - }); - } - Err(BitFunError::tool( - "[INVALID_PARAMS] unsupported ClickTarget. 
Use {\"kind\":\"node_idx\",\"idx\":N}, {\"node_idx\":N}, {\"kind\":\"image_xy\",\"x\":0,\"y\":0}, {\"image_xy\":{\"x\":0,\"y\":0}}, {\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"screen_xy\",\"x\":0,\"y\":0}, or {\"ocr_text\":{\"needle\":\"...\"}}.".to_string(), - )) - } - - fn parse_wait_predicate(v: &Value) -> BitFunResult { - if v.get("kind").is_some() { - return serde_json::from_value(v.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] bad app_wait_for predicate: {}", - e - )) - }); - } - if let Some(obj) = v.get("digest_changed") { - let prev_digest = obj - .get("prev_digest") - .or_else(|| obj.get("from")) - .and_then(|x| x.as_str()) - .ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] digest_changed requires prev_digest".to_string(), - ) - })?; - return Ok(AppWaitPredicate::DigestChanged { - prev_digest: prev_digest.to_string(), - }); - } - if let Some(obj) = v.get("title_contains") { - let needle = obj - .get("needle") - .or_else(|| obj.get("title")) - .and_then(|x| x.as_str()) - .or_else(|| obj.as_str()) - .ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] title_contains requires needle".to_string(), - ) - })?; - return Ok(AppWaitPredicate::TitleContains { - needle: needle.to_string(), - }); - } - if let Some(obj) = v.get("role_enabled") { - let role = obj.get("role").and_then(|x| x.as_str()).ok_or_else(|| { - BitFunError::tool("[INVALID_PARAMS] role_enabled requires role".to_string()) - })?; - return Ok(AppWaitPredicate::RoleEnabled { - role: role.to_string(), - }); - } - if let Some(obj) = v.get("node_enabled") { - let idx = obj - .get("idx") - .and_then(|x| x.as_u64()) - .or_else(|| obj.as_u64()) - .ok_or_else(|| { - BitFunError::tool("[INVALID_PARAMS] node_enabled requires idx".to_string()) - })?; - return 
Ok(AppWaitPredicate::NodeEnabled { idx: idx as u32 }); - } - Err(BitFunError::tool( - "[INVALID_PARAMS] unsupported app_wait_for predicate. Use {\"kind\":\"digest_changed\",\"prev_digest\":\"...\"} or shorthand {\"digest_changed\":{\"prev_digest\":\"...\"}}.".to_string(), - )) - } - - fn parse_keys(v: &Value) -> Vec { - match v.get("keys").or_else(|| v.get("key")) { - Some(Value::Array(arr)) => arr - .iter() - .filter_map(|x| x.as_str().map(|s| s.to_string())) - .collect(), - Some(Value::String(s)) => vec![s.to_string()], - _ => Vec::new(), - } - } + // If the model targeted a specific tab AND wants it + // foregrounded (default), bring it to front the same + // way switch_page does. Failure here is non-fatal — + // we still return the connected session. + let mut activated = false; + let mut activate_warning: Option = None; + let targeted = matched_by_target.is_some(); + if targeted && activate { + match session.client.send("Page.bringToFront", None).await { + Ok(_) => activated = true, + Err(e) => { + activate_warning = Some(format!( + "Page.bringToFront failed: {} (session is connected, but the tab is not in the foreground)", + e + )); + } + } + } - // Build the JSON view of an AppStateSnapshot for the model. Excludes - // the heavy `screenshot` payload (it is attached out-of-band as a - // multimodal image, not as base64 inside the JSON tree, to keep token - // budgets under control and let the provider deliver it as `image_url`). 
- fn snap_state_json( - snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, - ) -> serde_json::Value { - let mut v = json!({ - "app": snap.app, - "window_title": snap.window_title, - "digest": snap.digest, - "captured_at_ms": snap.captured_at_ms, - "tree_text": snap.tree_text, - "has_screenshot": snap.screenshot.is_some(), - }); - if let Some(shot) = snap.screenshot.as_ref() { - if let Some(obj) = v.as_object_mut() { - let meta: serde_json::Value = json!({ - "image_width": shot.image_width, - "image_height": shot.image_height, - "screenshot_id": shot.screenshot_id, - "native_width": shot.native_width, - "native_height": shot.native_height, - "vision_scale": shot.vision_scale, - "mime_type": shot.mime_type, - "image_content_rect": shot.image_content_rect, - "image_global_bounds": shot.image_global_bounds, - "coordinate_hint": "For visual surfaces, click pixels in this attached image with app_click target {kind:\"image_xy\", x, y, screenshot_id}. For known boards/grids/canvases, prefer {kind:\"image_grid\", x0, y0, width, height, rows, cols, row, col, intersections, screenshot_id}. If the grid rectangle is unknown, use {kind:\"visual_grid\", rows, cols, row, col, intersections}; the host detects the grid from app pixels.", + let mut result = json!({ + "success": true, + "browser": connected_browser, + "browser_mode": mode, + "browser_version": version.browser, + "port": port, + "session_id": session.session_id, + "page_url": page.url, + "page_title": page.title, + "matched_by_target": targeted, + "activated": activated, + "status": if mode == "headless" { + "attached" + } else if matches!(launch_result, LaunchResult::AlreadyConnected) { + "already_connected" + } else { + "launched" + }, }); - obj.insert("screenshot_meta".to_string(), meta); - } - } - v - } - - // Helper: build a `ToolResult` that *also* carries the focused-window - // screenshot as an Anthropic-style multimodal image attachment. 
When - // the host couldn't (or chose not to) capture, fall back to a regular - // text-only `ToolResult::ok`. - fn snap_result( - data: serde_json::Value, - summary: Option, - snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, - ) -> ToolResult { - use base64::Engine as _; - if let Some(shot) = snap.screenshot.as_ref() { - let attach = crate::util::types::ToolImageAttachment { - mime_type: shot.mime_type.clone(), - data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), - }; - ToolResult::ok_with_images(data, summary, vec![attach]) - } else { - ToolResult::ok(data, summary) - } - } - - // Build a JSON view of an InteractiveView that excludes the heavy - // `screenshot.bytes` payload (the JPEG is attached out-of-band as a - // multimodal image attachment, not as base64 inside the tree). - fn build_interactive_view_json( - view: &crate::agentic::tools::computer_use_host::InteractiveView, - ) -> serde_json::Value { - let mut v = json!({ - "app": view.app, - "window_title": view.window_title, - "digest": view.digest, - "captured_at_ms": view.captured_at_ms, - "elements": view.elements, - "tree_text": view.tree_text, - "loop_warning": view.loop_warning, - "has_screenshot": view.screenshot.is_some(), - }); - if let Some(shot) = view.screenshot.as_ref() { - if let Some(obj) = v.as_object_mut() { - obj.insert( - "screenshot_meta".to_string(), - json!({ - "image_width": shot.image_width, - "image_height": shot.image_height, - "screenshot_id": shot.screenshot_id, - "native_width": shot.native_width, - "native_height": shot.native_height, - "vision_scale": shot.vision_scale, - "mime_type": shot.mime_type, - "image_content_rect": shot.image_content_rect, - "image_global_bounds": shot.image_global_bounds, - "coordinate_hint": "Numbered overlays are in JPEG image-pixel space. Reference elements via their `i` index using interactive_click / interactive_type_text / interactive_scroll. 
For pointer-only fallback, pass screenshot_id with image_xy/image_grid.", - }), - ); + if let Some(w) = activate_warning { + result["warning"] = json!(w); + } + let summary = if targeted { + format!( + "Connected to {} via DOM/CDP (session {}, page '{}')", + connected_browser, session.session_id, page.title + ) + } else { + format!( + "Connected to {} on test port {} via DOM/CDP (session {})", + connected_browser, port, session.session_id + ) + }; + Ok(vec![ToolResult::ok(result, Some(summary))]) + } + LaunchResult::LaunchedButCdpNotReady { message, .. } => Ok(err_response( + "browser", + "connect", + ControlHubError::new(ErrorCode::Timeout, message.clone()) + .with_hints(Self::default_browser_connect_hints(&kind, port)), + )), + LaunchResult::BrowserRunningWithoutCdp { instructions, .. } => Ok(err_response( + "browser", + "connect", + ControlHubError::new( + ErrorCode::NotAvailable, + "The user's default browser is running without the test port enabled.", + ) + .with_hint(instructions) + .with_hints(Self::default_browser_connect_hints(&kind, port)), + )), } } - v - } - fn build_visual_mark_view_json( - view: &crate::agentic::tools::computer_use_host::VisualMarkView, - ) -> serde_json::Value { - let mut v = json!({ - "app": view.app, - "window_title": view.window_title, - "digest": view.digest, - "captured_at_ms": view.captured_at_ms, - "marks": view.marks, - "has_screenshot": view.screenshot.is_some(), - }); - if let Some(shot) = view.screenshot.as_ref() { - if let Some(obj) = v.as_object_mut() { - obj.insert( - "screenshot_meta".to_string(), + "list_pages" => { + let pages = CdpClient::list_pages(port).await?; + let default_id = browser_sessions().default_id().await; + let summary: Vec = pages + .iter() + .map(|p| { json!({ - "image_width": shot.image_width, - "image_height": shot.image_height, - "screenshot_id": shot.screenshot_id, - "native_width": shot.native_width, - "native_height": shot.native_height, - "vision_scale": shot.vision_scale, - "mime_type": 
shot.mime_type, - "image_content_rect": shot.image_content_rect, - "image_global_bounds": shot.image_global_bounds, - "coordinate_hint": "Numbered visual marks are in JPEG image-pixel space. Reference marks via their `i` index using visual_click. To refine a dense area, call build_visual_mark_view again with opts.region in these screenshot pixels.", - }), - ); - } - } - v - } - - // Build a JSON envelope for interactive_* action results. Includes - // the post-action AppStateSnapshot (without screenshot bytes) and, - // when present, the rebuilt InteractiveView. - fn build_interactive_action_json( - app: &crate::agentic::tools::computer_use_host::AppSelector, - res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, - extras: serde_json::Value, - ) -> serde_json::Value { - let mut v = json!({ - "target_app": app, - "app_state": snap_state_json(&res.snapshot), - "app_state_nodes": res.snapshot.nodes, - "loop_warning": res.snapshot.loop_warning, - "execution_note": res.execution_note, - "interactive_view": res.view.as_ref().map(build_interactive_view_json), - }); - if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { - for (k, val) in extras_obj { - obj.insert(k.clone(), val.clone()); - } - } - v - } - - fn build_visual_action_json( - app: &crate::agentic::tools::computer_use_host::AppSelector, - res: &crate::agentic::tools::computer_use_host::VisualActionResult, - extras: serde_json::Value, - ) -> serde_json::Value { - let mut v = json!({ - "target_app": app, - "app_state": snap_state_json(&res.snapshot), - "app_state_nodes": res.snapshot.nodes, - "loop_warning": res.snapshot.loop_warning, - "execution_note": res.execution_note, - "visual_mark_view": res.view.as_ref().map(build_visual_mark_view_json), - }); - if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { - for (k, val) in extras_obj { - obj.insert(k.clone(), val.clone()); - } - } - v - } - - // Attach the InteractiveView's annotated 
screenshot (if present) - // as a multimodal image; otherwise fall back to text-only ok. - fn interactive_view_result( - data: serde_json::Value, - summary: Option, - view: &crate::agentic::tools::computer_use_host::InteractiveView, - ) -> ToolResult { - use base64::Engine as _; - if let Some(shot) = view.screenshot.as_ref() { - let attach = crate::util::types::ToolImageAttachment { - mime_type: shot.mime_type.clone(), - data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), - }; - ToolResult::ok_with_images(data, summary, vec![attach]) - } else { - ToolResult::ok(data, summary) - } - } - - fn visual_mark_view_result( - data: serde_json::Value, - summary: Option, - view: &crate::agentic::tools::computer_use_host::VisualMarkView, - ) -> ToolResult { - use base64::Engine as _; - if let Some(shot) = view.screenshot.as_ref() { - let attach = crate::util::types::ToolImageAttachment { - mime_type: shot.mime_type.clone(), - data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), - }; - ToolResult::ok_with_images(data, summary, vec![attach]) - } else { - ToolResult::ok(data, summary) - } - } - - // Prefer attaching the rebuilt interactive view's screenshot when - // available; otherwise fall back to the post-action snapshot's. 
- fn interactive_action_result( - data: serde_json::Value, - summary: Option, - res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, - ) -> ToolResult { - use base64::Engine as _; - let shot_opt = res - .view - .as_ref() - .and_then(|v| v.screenshot.as_ref()) - .or(res.snapshot.screenshot.as_ref()); - if let Some(shot) = shot_opt { - let attach = crate::util::types::ToolImageAttachment { - mime_type: shot.mime_type.clone(), - data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), - }; - ToolResult::ok_with_images(data, summary, vec![attach]) - } else { - ToolResult::ok(data, summary) - } - } - - fn visual_action_result( - data: serde_json::Value, - summary: Option, - res: &crate::agentic::tools::computer_use_host::VisualActionResult, - ) -> ToolResult { - use base64::Engine as _; - let shot_opt = res - .view - .as_ref() - .and_then(|v| v.screenshot.as_ref()) - .or(res.snapshot.screenshot.as_ref()); - if let Some(shot) = shot_opt { - let attach = crate::util::types::ToolImageAttachment { - mime_type: shot.mime_type.clone(), - data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), - }; - ToolResult::ok_with_images(data, summary, vec![attach]) - } else { - ToolResult::ok(data, summary) - } - } - - let bg = host.supports_background_input(); - let ax = host.supports_ax_tree(); - - match action { - "list_apps" => { - let include_hidden = params - .get("include_hidden") - .and_then(|v| v.as_bool()) - .unwrap_or_else(|| { - !params - .get("only_visible") - .and_then(|v| v.as_bool()) - .unwrap_or(true) - }); - let apps = host.list_apps(include_hidden).await?; - let n = apps.len(); + "id": p.id, + "title": p.title, + "url": p.url, + "type": p.page_type, + "is_default_session": Some(&p.id) == default_id.as_ref(), + }) + }) + .collect(); Ok(vec![ToolResult::ok( json!({ - "apps": apps, - "include_hidden": include_hidden, - "background_input": bg, - "ax_tree": ax, + "pages": summary, + "default_session_id": 
default_id, }), - Some(format!("{} app(s) listed", n)), + Some(format!("{} page(s) found", pages.len())), )]) } - "get_app_state" => { - let app = parse_selector(params)?; - let max_depth = params - .get("max_depth") - .and_then(|v| v.as_u64()) - .unwrap_or(32) as u32; - let focus_window_only = params - .get("focus_window_only") - .and_then(|v| v.as_bool()) - .unwrap_or(false); - let snap = host - .get_app_state(app.clone(), max_depth, focus_window_only) - .await?; - let summary = format!( - "AX state for {} (digest={}, {} nodes)", - snap.app.name, - &snap.digest[..snap.digest.len().min(12)], - snap.nodes.len() - ); - let data = json!({ - "target_app": app, - "background_input": bg, - "ax_tree": ax, - "app_state": snap_state_json(&snap), - "app_state_nodes": snap.nodes, - "before_digest": snap.digest, - "loop_warning": snap.loop_warning, - }); - Ok(vec![snap_result(data, Some(summary), &snap)]) - } - "app_click" => { - let app = parse_selector(params)?; - let target_v = params.get("target").cloned().ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] app_click requires 'target' ({node_idx|image_xy|screen_xy|ocr_text})" - .to_string(), - ) - })?; - let target = parse_click_target(&target_v)?; - let click_count = params - .get("click_count") - .and_then(|v| v.as_u64()) - .unwrap_or(1) as u8; - let mouse_button = params - .get("mouse_button") + + // Phase 2: filter pages by url substring / title substring without + // forcing the model to ingest the entire `list_pages` payload. + // This is essential when the user has dozens of tabs open and we + // don't want to dump 50 KB of CDP page records into context. 
+ "tab_query" => { + let url_contains = params + .get("url_contains") .and_then(|v| v.as_str()) - .unwrap_or("left") - .to_string(); - let modifier_keys: Vec = params - .get("modifier_keys") - .and_then(|v| v.as_array()) - .map(|a| { - a.iter() - .filter_map(|x| x.as_str().map(|s| s.to_string())) - .collect() - }) - .unwrap_or_default(); - let wait_ms_after = params - .get("wait_ms_after") - .or_else(|| params.get("post_click_wait_ms")) + .map(str::to_lowercase); + let title_contains = params + .get("title_contains") + .and_then(|v| v.as_str()) + .map(str::to_lowercase); + let only_pages = params + .get("only_pages") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let limit = params + .get("limit") .and_then(|v| v.as_u64()) - .map(|v| v.min(5_000) as u32); - - let before = host - .get_app_state(app.clone(), 8, false) - .await - .ok() - .map(|s| s.digest); + .map(|n| n as usize) + .unwrap_or(20) + .max(1); - let mut after = host - .app_click(AppClickParams { - app: app.clone(), - target: target.clone(), - click_count, - mouse_button, - modifier_keys, - wait_ms_after, + let pages = CdpClient::list_pages(port).await?; + let default_id = browser_sessions().default_id().await; + let total = pages.len(); + let filtered: Vec = pages + .into_iter() + .filter(|p| { + if only_pages && p.page_type.as_deref() != Some("page") { + return false; + } + if let Some(ref needle) = url_contains { + if !p.url.to_lowercase().contains(needle) { + return false; + } + } + if let Some(ref needle) = title_contains { + if !p.title.to_lowercase().contains(needle) { + return false; + } + } + true }) - .await?; - - if after.loop_warning.is_none() { - let target_sig = serde_json::to_string(&target).unwrap_or_default(); - after.loop_warning = loop_tracker_observe( - app.pid, - "app_click", - &target_sig, - before.as_deref().unwrap_or(""), - &after.digest, - ); - } - - let data = json!({ - "target_app": app, - "click_target": target, - "background_input": bg, - "before_digest": before, - 
"app_state": snap_state_json(&after), - "app_state_nodes": after.nodes, - "loop_warning": after.loop_warning, - }); - Ok(vec![snap_result(data, Some("clicked".to_string()), &after)]) - } - "app_type_text" => { - let app = parse_selector(params)?; - let text = params - .get("text") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] app_type_text requires 'text'".to_string(), - ) - })? - .to_string(); - let focus: Option = match params.get("focus") { - Some(v) if !v.is_null() => Some(parse_click_target(v)?), - _ => None, - }; - let before = host - .get_app_state(app.clone(), 8, false) - .await - .ok() - .map(|s| s.digest); - let mut after = host - .app_type_text(app.clone(), &text, focus.clone()) - .await?; - if after.loop_warning.is_none() { - let target_sig = format!( - "focus={};len={}", - serde_json::to_string(&focus).unwrap_or_default(), - text.chars().count() - ); - after.loop_warning = loop_tracker_observe( - app.pid, - "app_type_text", - &target_sig, - before.as_deref().unwrap_or(""), - &after.digest, - ); - } - let data = json!({ - "target_app": app, - "background_input": bg, - "char_count": text.chars().count(), - "focus": focus, - "before_digest": before, - "app_state": snap_state_json(&after), - "app_state_nodes": after.nodes, - "loop_warning": after.loop_warning, - }); - Ok(vec![snap_result( - data, - Some(format!("typed {} chars", text.chars().count())), - &after, - )]) - } - "app_scroll" => { - let app = parse_selector(params)?; - let dx = params.get("dx").and_then(|v| v.as_i64()).unwrap_or(0) as i32; - let dy = params.get("dy").and_then(|v| v.as_i64()).unwrap_or(0) as i32; - let focus: Option = match params.get("focus") { - Some(v) if !v.is_null() => Some(parse_click_target(v)?), - _ => None, - }; - let after = host.app_scroll(app.clone(), focus.clone(), dx, dy).await?; - let data = json!({ - "target_app": app, - "background_input": bg, - "dx": dx, - "dy": dy, - "focus": focus, - "app_state": 
snap_state_json(&after), - "app_state_nodes": after.nodes, - "loop_warning": after.loop_warning, - }); - Ok(vec![snap_result( - data, - Some(format!("scrolled ({},{})", dx, dy)), - &after, - )]) - } - "app_key_chord" => { - let app = parse_selector(params)?; - let keys = parse_keys(params); - if keys.is_empty() { - return Err(BitFunError::tool( - "[INVALID_PARAMS] app_key_chord requires non-empty 'keys'".to_string(), - )); - } - let focus_idx: Option = params - .get("focus_idx") - .and_then(|v| v.as_u64()) - .map(|n| n as u32); - let after = host - .app_key_chord(app.clone(), keys.clone(), focus_idx) - .await?; - let data = json!({ - "target_app": app, - "background_input": bg, - "keys": keys, - "focus_idx": focus_idx, - "app_state": snap_state_json(&after), - "app_state_nodes": after.nodes, - "loop_warning": after.loop_warning, - }); - Ok(vec![snap_result( - data, - Some("key chord sent".to_string()), - &after, - )]) - } - "app_wait_for" => { - let app = parse_selector(params)?; - let predicate_v = params.get("predicate").cloned().ok_or_else(|| { - BitFunError::tool( - "[INVALID_PARAMS] app_wait_for requires 'predicate'".to_string(), - ) - })?; - let predicate = parse_wait_predicate(&predicate_v)?; - let timeout_ms = params - .get("timeout_ms") - .and_then(|v| v.as_u64()) - .unwrap_or(8000) as u32; - let poll_ms = params - .get("poll_ms") - .and_then(|v| v.as_u64()) - .unwrap_or(150) as u32; - let after = host - .app_wait_for(app.clone(), predicate.clone(), timeout_ms, poll_ms) - .await?; - let data = json!({ - "target_app": app, - "background_input": bg, - "predicate": predicate, - "app_state": snap_state_json(&after), - "app_state_nodes": after.nodes, - "loop_warning": after.loop_warning, - }); - Ok(vec![snap_result( - data, - Some("predicate satisfied".to_string()), - &after, - )]) - } - "build_interactive_view" => { - let app = parse_selector(params)?; - let opts: InteractiveViewOpts = match params.get("opts") { - Some(v) if !v.is_null() => 
serde_json::from_value(v.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] build_interactive_view 'opts' invalid: {}", - e - )) - })?, - _ => InteractiveViewOpts::default(), - }; - let view = host.build_interactive_view(app.clone(), opts).await?; - let view_json = build_interactive_view_json(&view); - let summary = format!( - "interactive view for {} ({} elements, digest={})", - view.app.name, - view.elements.len(), - &view.digest[..view.digest.len().min(12)] - ); - Ok(vec![interactive_view_result( - view_json, - Some(summary), - &view, - )]) - } - "interactive_click" => { - let app = parse_selector(params)?; - let p: InteractiveClickParams = - serde_json::from_value(params.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] interactive_click params invalid: {}", - e - )) - })?; - let i = p.i; - let res = host.interactive_click(app.clone(), p).await?; - let data = build_interactive_action_json( - &app, - &res, - json!({ "i": i, "action": "interactive_click" }), - ); - let summary = format!("interactive_click i={}", i); - Ok(vec![interactive_action_result(data, Some(summary), &res)]) - } - "build_visual_mark_view" => { - let app = parse_selector(params)?; - let opts: VisualMarkViewOpts = match params.get("opts") { - Some(v) if !v.is_null() => serde_json::from_value(v.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] build_visual_mark_view 'opts' invalid: {}", - e - )) - })?, - _ => VisualMarkViewOpts::default(), - }; - let view = host.build_visual_mark_view(app.clone(), opts).await?; - let view_json = build_visual_mark_view_json(&view); - let summary = format!( - "visual mark view for {} ({} marks, digest={})", - view.app.name, - view.marks.len(), - &view.digest[..view.digest.len().min(12)] - ); - Ok(vec![visual_mark_view_result( - view_json, - Some(summary), - &view, - )]) - } - "visual_click" => { - let app = parse_selector(params)?; - let p: VisualClickParams = 
serde_json::from_value(params.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] visual_click params invalid: {}", - e - )) - })?; - let i = p.i; - let res = host.visual_click(app.clone(), p).await?; - let data = build_visual_action_json( - &app, - &res, - json!({ "i": i, "action": "visual_click" }), - ); - let summary = format!("visual_click i={}", i); - Ok(vec![visual_action_result(data, Some(summary), &res)]) - } - "interactive_type_text" => { - let app = parse_selector(params)?; - let p: InteractiveTypeTextParams = - serde_json::from_value(params.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] interactive_type_text params invalid: {}", - e - )) - })?; - let i = p.i; - let text_len = p.text.chars().count(); - let res = host.interactive_type_text(app.clone(), p).await?; - let data = build_interactive_action_json( - &app, - &res, - json!({ - "i": i, - "action": "interactive_type_text", - "text_chars": text_len, - }), - ); - let summary = match i { - Some(idx) => format!("interactive_type_text i={} ({} chars)", idx, text_len), - None => format!("interactive_type_text focused ({} chars)", text_len), - }; - Ok(vec![interactive_action_result(data, Some(summary), &res)]) - } - "interactive_scroll" => { - let app = parse_selector(params)?; - let p: InteractiveScrollParams = - serde_json::from_value(params.clone()).map_err(|e| { - BitFunError::tool(format!( - "[INVALID_PARAMS] interactive_scroll params invalid: {}", - e - )) - })?; - let (i, dx, dy) = (p.i, p.dx, p.dy); - let res = host.interactive_scroll(app.clone(), p).await?; - let data = build_interactive_action_json( - &app, - &res, - json!({ - "i": i, - "dx": dx, - "dy": dy, - "action": "interactive_scroll", - }), - ); - let summary = format!("interactive_scroll i={:?} dx={} dy={}", i, dx, dy); - Ok(vec![interactive_action_result(data, Some(summary), &res)]) - } - other => Err(BitFunError::tool(format!( - "[INTERNAL] handle_desktop_ax called with unknown action: {}", - 
other - ))), - } - } - - // ── Browser domain ───────────────────────────────────────────────── - - async fn handle_browser(&self, action: &str, params: &Value) -> BitFunResult> { - let port = params - .get("port") - .and_then(|v| v.as_u64()) - .map(|p| p as u16) - .unwrap_or(DEFAULT_CDP_PORT); - - let session_id_param = params - .get("session_id") - .and_then(|v| v.as_str()) - .map(str::to_string); - - match action { - "connect" => { - let mode = Self::browser_connect_mode_from_params(params); - let kind = BrowserLauncher::detect_default_browser()?; - - if mode == "headless" { - if !BrowserLauncher::is_cdp_available(port).await { - return Ok(err_response( - "browser", - "connect", - ControlHubError::new( - ErrorCode::NotAvailable, - format!( - "Headless browser test port {} is not available. Start the dedicated headless browser first, then connect via ControlHub browser actions.", - port - ), - ) - .with_hints(Self::headless_browser_connect_hints(port)), - )); - } - } - - let user_data_dir = params.get("user_data_dir").and_then(|v| v.as_str()); - let launch_result = if mode == "headless" { - LaunchResult::AlreadyConnected - } else { - BrowserLauncher::launch_with_cdp_opts(&kind, port, user_data_dir).await? - }; - - // UX shortcut: a frequent flow is "drive my Gmail tab" / - // "drive the GitHub PR I'm looking at". Without `target_*` - // the model needed `connect` → `list_pages` → `switch_page` - // (3 round-trips and one chance to pick the wrong id). With - // `target_url` / `target_title` we collapse those into a - // single `connect` call: pick the first page whose URL or - // title contains the substring, register it as the default - // session, and bring it to the front. 
- let target_url = params - .get("target_url") - .and_then(|v| v.as_str()) - .map(str::to_lowercase); - let target_title = params - .get("target_title") - .and_then(|v| v.as_str()) - .map(str::to_lowercase); - let activate = params - .get("activate") - .and_then(|v| v.as_bool()) - .unwrap_or(true); - - match &launch_result { - LaunchResult::AlreadyConnected | LaunchResult::Launched => { - let pages = CdpClient::list_pages(port).await?; - let connected_browser = if mode == "headless" { - "Headless test browser".to_string() - } else { - kind.to_string() - }; - - // Selection: explicit target_* > first real page > first. - let matched_by_target = if target_url.is_some() || target_title.is_some() { - pages.iter().find(|p| { - if p.web_socket_debugger_url.is_none() { - return false; - } - let url_ok = target_url - .as_ref() - .map(|n| p.url.to_lowercase().contains(n)) - .unwrap_or(true); - let title_ok = target_title - .as_ref() - .map(|n| p.title.to_lowercase().contains(n)) - .unwrap_or(true); - p.page_type.as_deref() == Some("page") && url_ok && title_ok - }) - } else { - None - }; - - // Tell the model when its filter found nothing instead - // of silently falling back to the first tab and - // confusing the next action. - if (target_url.is_some() || target_title.is_some()) - && matched_by_target.is_none() - { - return Ok(err_response( - "browser", - "connect", - ControlHubError::new( - ErrorCode::WrongTab, - format!( - "No open tab matched target_url={:?} target_title={:?}", - target_url, target_title - ), - ) - .with_hints([ - "Call browser.list_pages or browser.tab_query first to inspect open tabs", - "Loosen the substring (e.g. 
domain only) and try again", - ]), - )); - } - - let page = matched_by_target - .or_else(|| { - pages.iter().find(|p| { - p.page_type.as_deref() == Some("page") - && p.web_socket_debugger_url.is_some() - }) - }) - .or_else(|| pages.first()) - .ok_or_else(|| { - BitFunError::tool("No browser pages found via CDP".to_string()) - })?; - let ws_url = page.web_socket_debugger_url.as_ref().ok_or_else(|| { - BitFunError::tool("Page has no WebSocket debugger URL".to_string()) - })?; - let client = CdpClient::connect(ws_url).await?; - let version = CdpClient::get_version(port).await?; - let session = BrowserSession { - session_id: page.id.clone(), - port, - client: Arc::new(client), - }; - browser_sessions().register(session.clone()).await; - - // If the model targeted a specific tab AND wants it - // foregrounded (default), bring it to front the same - // way switch_page does. Failure here is non-fatal — - // we still return the connected session. - let mut activated = false; - let mut activate_warning: Option = None; - let targeted = matched_by_target.is_some(); - if targeted && activate { - match session.client.send("Page.bringToFront", None).await { - Ok(_) => activated = true, - Err(e) => { - activate_warning = Some(format!( - "Page.bringToFront failed: {} (session is connected, but the tab is not in the foreground)", - e - )); - } - } - } - - let mut result = json!({ - "success": true, - "browser": connected_browser, - "browser_mode": mode, - "browser_version": version.browser, - "port": port, - "session_id": session.session_id, - "page_url": page.url, - "page_title": page.title, - "matched_by_target": targeted, - "activated": activated, - "status": if mode == "headless" { - "attached" - } else if matches!(launch_result, LaunchResult::AlreadyConnected) { - "already_connected" - } else { - "launched" - }, - }); - if let Some(w) = activate_warning { - result["warning"] = json!(w); - } - let summary = if targeted { - format!( - "Connected to {} via DOM/CDP (session {}, 
page '{}')", - connected_browser, session.session_id, page.title - ) - } else { - format!( - "Connected to {} on test port {} via DOM/CDP (session {})", - connected_browser, port, session.session_id - ) - }; - Ok(vec![ToolResult::ok(result, Some(summary))]) - } - LaunchResult::LaunchedButCdpNotReady { message, .. } => Ok(err_response( - "browser", - "connect", - ControlHubError::new(ErrorCode::Timeout, message.clone()) - .with_hints(Self::default_browser_connect_hints(&kind, port)), - )), - LaunchResult::BrowserRunningWithoutCdp { instructions, .. } => Ok(err_response( - "browser", - "connect", - ControlHubError::new( - ErrorCode::NotAvailable, - "The user's default browser is running without the test port enabled.", - ) - .with_hint(instructions) - .with_hints(Self::default_browser_connect_hints(&kind, port)), - )), - } - } - - "list_pages" => { - let pages = CdpClient::list_pages(port).await?; - let default_id = browser_sessions().default_id().await; - let summary: Vec = pages - .iter() + .take(limit) .map(|p| { json!({ "id": p.id, "title": p.title, - "url": p.url, - "type": p.page_type, - "is_default_session": Some(&p.id) == default_id.as_ref(), - }) - }) - .collect(); - Ok(vec![ToolResult::ok( - json!({ - "pages": summary, - "default_session_id": default_id, - }), - Some(format!("{} page(s) found", pages.len())), - )]) - } - - // Phase 2: filter pages by url substring / title substring without - // forcing the model to ingest the entire `list_pages` payload. - // This is essential when the user has dozens of tabs open and we - // don't want to dump 50 KB of CDP page records into context. 
- "tab_query" => { - let url_contains = params - .get("url_contains") - .and_then(|v| v.as_str()) - .map(str::to_lowercase); - let title_contains = params - .get("title_contains") - .and_then(|v| v.as_str()) - .map(str::to_lowercase); - let only_pages = params - .get("only_pages") - .and_then(|v| v.as_bool()) - .unwrap_or(true); - let limit = params - .get("limit") - .and_then(|v| v.as_u64()) - .map(|n| n as usize) - .unwrap_or(20) - .max(1); - - let pages = CdpClient::list_pages(port).await?; - let default_id = browser_sessions().default_id().await; - let total = pages.len(); - let filtered: Vec = pages - .into_iter() - .filter(|p| { - if only_pages && p.page_type.as_deref() != Some("page") { - return false; - } - if let Some(ref needle) = url_contains { - if !p.url.to_lowercase().contains(needle) { - return false; - } - } - if let Some(ref needle) = title_contains { - if !p.title.to_lowercase().contains(needle) { - return false; - } - } - true - }) - .take(limit) - .map(|p| { - json!({ - "id": p.id, - "title": p.title, - "url": p.url, - "type": p.page_type, - "is_default_session": Some(&p.id) == default_id.as_ref(), - }) - }) - .collect(); - let matched = filtered.len(); - Ok(vec![ToolResult::ok( - json!({ - "pages": filtered, - "matched": matched, - "total": total, - "default_session_id": default_id, - }), - Some(format!("{} of {} page(s) matched", matched, total)), - )]) - } - - "switch_page" => { - let page_id = params - .get("page_id") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("switch_page requires 'page_id'".to_string()) - })?; - // Phase 2: by default ALSO surface the chosen tab in the - // user's actual browser window via `Page.bringToFront`. The - // legacy behavior only swapped the CDP session under the - // hood, leaving the user staring at the old tab while the - // model "drove" an invisible one. Models can opt out by - // passing `activate: false` for headless background tabs. 
- let activate = params - .get("activate") - .and_then(|v| v.as_bool()) - .unwrap_or(true); - - let registry = browser_sessions(); - let mut reused = false; - let session = if registry.set_default(page_id).await.is_ok() { - reused = true; - registry.get(Some(page_id)).await? - } else { - let pages = CdpClient::list_pages(port).await?; - let page = pages.iter().find(|p| p.id == page_id).ok_or_else(|| { - BitFunError::tool(format!("Page '{}' not found", page_id)) - })?; - let ws_url = page.web_socket_debugger_url.as_ref().ok_or_else(|| { - BitFunError::tool("Page has no WebSocket URL".to_string()) - })?; - let client = CdpClient::connect(ws_url).await?; - let session = BrowserSession { - session_id: page.id.clone(), - port, - client: Arc::new(client), - }; - registry.register(session.clone()).await; - session - }; - - let mut activated = false; - let mut activate_warning: Option = None; - if activate { - match session.client.send("Page.bringToFront", None).await { - Ok(_) => activated = true, - Err(e) => { - // Don't fail the whole switch — the session is - // still valid, the user just won't see the new - // tab front-and-center yet. 
- activate_warning = Some(format!( - "Page.bringToFront failed: {} (session is switched, but the tab is not in the foreground)", - e - )); - } - } - } - - let mut body = json!({ - "success": true, - "page_id": page_id, - "session_id": session.session_id, - "reused": reused, - "activated": activated, - }); - if let Some(w) = &activate_warning { - body["warning"] = json!(w); - } - Ok(vec![ToolResult::ok( - body, - Some(format!( - "Switched to page {} ({})", - page_id, - if activated { - "brought to front" - } else { - "background" - } - )), - )]) - } - - "list_sessions" => { - let registry = browser_sessions(); - let ids = registry.list().await; - let default = registry.default_id().await; - Ok(vec![ToolResult::ok( - json!({ - "sessions": ids, - "default_session_id": default, - }), - Some(format!("{} session(s) tracked", ids.len())), - )]) - } - - _ => { - // Resolve a session: explicit `session_id` if present, else - // the registry's default. This replaces the prior "global - // singleton" pattern that was racy across concurrent tasks. 
- let session = browser_sessions().get(session_id_param.as_deref()).await?; - let actions = BrowserActions::new(session.client.as_ref()); - - match action { - "navigate" => { - let url = params - .get("url") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("navigate requires 'url'".to_string()) - })?; - let result = actions.navigate(url).await?; - Ok(vec![ToolResult::ok(result, Some(format!("Navigated to {}", url)))]) - } - "snapshot" => { - let with_backend = params - .get("with_backend_node_ids") - .and_then(|v| v.as_bool()) - .unwrap_or(false); - let result = actions.snapshot_with_options(with_backend).await?; - let el_count = result - .get("elements") - .and_then(|v| v.as_array()) - .map(|a| a.len()) - .unwrap_or(0); - Ok(vec![ToolResult::ok( - result, - Some(format!("Snapshot: {} interactive elements", el_count)), - )]) - } - "click" => { - let selector = params - .get("selector") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("click requires 'selector'".to_string()) - })?; - let result = actions.click(selector).await?; - Ok(vec![ToolResult::ok( - result, - Some(format!("Clicked {}", selector)), - )]) - } - "fill" => { - let selector = params - .get("selector") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("fill requires 'selector'".to_string()) - })?; - let value = params - .get("value") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("fill requires 'value'".to_string()) - })?; - let result = actions.fill(selector, value).await?; - Ok(vec![ToolResult::ok( - result, - Some(format!("Filled {} with text", selector)), - )]) - } - "type" => { - let text = params - .get("text") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("type requires 'text'".to_string()) - })?; - let result = actions.type_text(text).await?; - Ok(vec![ToolResult::ok(result, Some("Typed text".to_string()))]) - } - "select" => { - let selector = params - .get("selector") - .and_then(|v| v.as_str()) - 
.ok_or_else(|| { - BitFunError::tool("select requires 'selector'".to_string()) - })?; - let option_text = params - .get("option_text") - .and_then(|v| v.as_str()) - .ok_or_else(|| { - BitFunError::tool("select requires 'option_text'".to_string()) - })?; - let result = actions.select(selector, option_text).await?; - // Phase 3: the underlying JS returns `{ error, available }` - // shaped success bodies for "select not found" and - // "option not found" cases. Lift those into the - // unified ControlHub error envelope so the model can - // branch on `error.code` instead of scraping JSON. - if let Some(err_msg) = result.get("error").and_then(|v| v.as_str()) { - let lowered = err_msg.to_lowercase(); - let (code, hint) = if lowered.contains("select not found") { - ( - ErrorCode::NotFound, - format!( - "No matched '{}'. Take a fresh snapshot and verify the selector.", + selector + ), + ) + } else if lowered.contains("option not found") { + ( + ErrorCode::NotFound, + "Inspect `available` in error.hints for valid option labels." 
+ .to_string(), + ) } else { - last_err = Some(format!( - "{} exit={:?} stderr={}", - cmd, - out.status.code(), - String::from_utf8_lossy(&out.stderr).trim() + (ErrorCode::Internal, "Browser returned an unexpected select error".to_string()) + }; + let mut chub_err = ControlHubError::new(code, err_msg) + .with_hint(hint); + if let Some(avail) = result.get("available") { + chub_err = chub_err.with_hint(format!( + "available_options={}", + avail )); } + return Ok(err_response("browser", "select", chub_err)); } - Err(e) => { - last_err = Some(format!("spawn {}: {}", cmd, e)); - } + Ok(vec![ToolResult::ok( + result, + Some(format!("Selected '{}'", option_text)), + )]) } - } - let _ = chosen_args; - let output = output_opt.ok_or_else(|| { - BitFunError::tool(format!( - "open_app failed for '{}' across {} strategies: {} (host_error: {:?})", - app_name, - attempts.len(), - last_err.as_deref().unwrap_or("(no error)"), - host_error - )) - })?; - - if output.status.success() { - let warning = host_error.map(|e| { - format!("computer_use_host open_app failed; shell fallback succeeded: {}", e) - }); - Ok(vec![ToolResult::ok( - json!({ - "launched": true, - "app": app_name, - "method": method, - "via_command": chosen_cmd, - "host_attempted": host_attempted, - "warning": warning, - }), - Some(format!("Opened {} via {}", app_name, chosen_cmd)), - )]) - } else { - let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); - Err(BitFunError::tool(format!( - "open_app failed for '{}'. 
host_attempted={}, host_error={:?}, last_command='{}', stderr='{}'", - app_name, host_attempted, host_error, chosen_cmd, stderr - ))) - } - } - "run_script" => { - let script = params - .get("script") - .and_then(|v| v.as_str()) - .ok_or_else(|| BitFunError::tool("run_script requires 'script'".to_string()))?; - let script_type = params - .get("script_type") - .and_then(|v| v.as_str()) - .unwrap_or("applescript"); - // Phase 4: bound the runtime so a hung script can never wedge - // the agent. Default 30 s, capped at 5 min to keep it sane. - let timeout_ms = params - .get("timeout_ms") - .and_then(|v| v.as_u64()) - .unwrap_or(30_000) - .clamp(100, 5 * 60 * 1000); - // Phase 4: keep output payloads bounded — model context is - // expensive and most scripts are happy with the head + tail. - let max_output_bytes = params - .get("max_output_bytes") - .and_then(|v| v.as_u64()) - .unwrap_or(16 * 1024) - .clamp(1024, 256 * 1024) as usize; - - let (program, args) = match script_type { - "applescript" => { - #[cfg(target_os = "macos")] - { - ( - "/usr/bin/osascript".to_string(), - vec!["-e".to_string(), script.to_string()], - ) - } - #[cfg(not(target_os = "macos"))] - { - let _ = script; - return Ok(err_response( - "system", - "run_script", - ControlHubError::new( - ErrorCode::NotAvailable, - "AppleScript is only available on macOS", - ) - .with_hint("Use script_type='shell' (sh on Unix, PowerShell on Windows) or script_type='powershell'/'bash'"), - )); - } + "press_key" => { + let key = params + .get("key") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool("press_key requires 'key'".to_string()) + })?; + let result = actions.press_key(key).await?; + Ok(vec![ToolResult::ok( + result, + Some(format!("Pressed {}", key)), + )]) } - // The "shell" alias picks the OS's *default* shell so the - // model can stay platform-agnostic. 
On Windows we now - // route to PowerShell rather than cmd.exe to avoid the - // GBK/CP936 stdout encoding nightmare and to give the - // model a consistent surface area. - "shell" => { - #[cfg(target_os = "windows")] - { - powershell_invocation(script) - } - #[cfg(not(target_os = "windows"))] - { - ( - "sh".to_string(), - vec!["-c".to_string(), script.to_string()], - ) - } + "scroll" => { + let direction = params + .get("direction") + .and_then(|v| v.as_str()) + .unwrap_or("down"); + let amount = params.get("amount").and_then(|v| v.as_i64()); + let result = actions.scroll(direction, amount).await?; + Ok(vec![ToolResult::ok( + result, + Some(format!("Scrolled {}", direction)), + )]) } - "bash" => { - // Bash is universally requested but not always on - // PATH (Windows without WSL/git-bash). Detect and - // surface a structured NotAvailable instead of a - // confusing spawn-failure error. - if !which_exists("bash") { - return Ok(err_response( - "system", - "run_script", - ControlHubError::new( - ErrorCode::NotAvailable, - "bash is not on PATH", - ) - .with_hint("Install Git for Windows / WSL, or use script_type='shell' / 'powershell' / 'cmd'"), - )); - } - ( - "bash".to_string(), - vec!["-c".to_string(), script.to_string()], - ) + "wait" => { + let ms = params.get("duration_ms").and_then(|v| v.as_u64()); + let cond = params.get("condition").and_then(|v| v.as_str()); + let result = actions.wait(ms, cond).await?; + Ok(vec![ToolResult::ok(result, Some("Wait completed".to_string()))]) } - "powershell" => { - // Prefer pwsh (PowerShell 7+, cross-platform) when - // available; fall back to legacy Windows powershell. 
- let prog = if which_exists("pwsh") { - "pwsh" - } else if which_exists("powershell") { - "powershell" - } else { - return Ok(err_response( - "system", - "run_script", + "get_text" => { + let selector = params + .get("selector") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool("get_text requires 'selector'".to_string()) + })?; + match actions.get_text(selector).await? { + Some(text) => Ok(vec![ToolResult::ok( + json!({ "text": text, "found": true }), + Some(text), + )]), + None => Ok(err_response( + "browser", + "get_text", ControlHubError::new( - ErrorCode::NotAvailable, - "Neither pwsh nor powershell are on PATH", + ErrorCode::NotFound, + format!("No element matched selector '{}'", selector), ) - .with_hint("Install PowerShell, or use script_type='shell' / 'bash'"), - )); - }; - ( - prog.to_string(), - vec![ - "-NoProfile".to_string(), - "-NonInteractive".to_string(), - // -OutputEncoding utf8 is set inside the script - // wrapper below for consistent stdout handling. - "-Command".to_string(), - format!( - "[Console]::OutputEncoding=[Text.Encoding]::UTF8; {}", - script + .with_hint( + "Take a fresh snapshot and verify the @ref / CSS selector", ), - ], - ) - } - "cmd" => { - #[cfg(target_os = "windows")] - { - // Force code-page 65001 (UTF-8) before running the - // user's script so stdout matches what we decode. - ( - "cmd".to_string(), - vec![ - "/U".to_string(), - "/C".to_string(), - format!("chcp 65001>nul && {}", script), - ], - ) - } - #[cfg(not(target_os = "windows"))] - { - return Ok(err_response( - "system", - "run_script", - ControlHubError::new( - ErrorCode::NotAvailable, - "script_type='cmd' is only available on Windows", - ) - .with_hint("Use script_type='shell' / 'bash' / 'powershell'"), - )); + )), } } - other => { - return Err(BitFunError::tool(format!( - "Unknown script_type: '{}'. 
Valid: applescript (macOS), shell (OS default), bash, powershell, cmd (Windows)", - other - ))) - } - }; - - // Use tokio::process so that on timeout we can actually KILL - // the child process. The previous implementation wrapped - // `std::process::Command::output()` in `spawn_blocking` + - // `tokio::time::timeout`; on timeout the `timeout` future - // returned, but the spawn_blocking thread kept blocking on - // the still-running child, leaking a thread + process per - // hung script. - let started = std::time::Instant::now(); - let child = tokio::process::Command::new(&program) - .args(&args) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .kill_on_drop(true) - .spawn() - .map_err(|e| { - BitFunError::tool(format!( - "Failed to spawn run_script ({}): {}", - script_type, e - )) - })?; - - let wait = child.wait_with_output(); - let output = match tokio::time::timeout( - std::time::Duration::from_millis(timeout_ms), - wait, - ) - .await - { - Err(_) => { - // Best-effort kill. `kill_on_drop(true)` above also - // ensures the OS reaps the process when `child` - // drops, but we issue an explicit SIGKILL first so - // it terminates immediately rather than after the - // tokio task tear-down race. - // NOTE: `wait_with_output` consumed `child`, so we - // can no longer call `child.kill()` directly here; - // the `kill_on_drop` flag handles it for us. 
- return Ok(err_response( - "system", - "run_script", - ControlHubError::new( - ErrorCode::Timeout, - format!( - "run_script timed out after {} ms (script_type={}); child process killed", - timeout_ms, script_type - ), - ) - .with_hint( - "Increase 'timeout_ms', or split the script into shorter steps", - ), - )); - } - Ok(Err(e)) => { - return Err(BitFunError::tool(format!( - "Failed to wait for run_script ({}): {}", - script_type, e - ))); - } - Ok(Ok(o)) => o, - }; - - let elapsed_ms = elapsed_ms_u64(started); - let stdout_full = String::from_utf8_lossy(&output.stdout).to_string(); - let stderr_full = String::from_utf8_lossy(&output.stderr).to_string(); - let (stdout, stdout_truncated) = truncate_with_marker(&stdout_full, max_output_bytes); - let (stderr, stderr_truncated) = truncate_with_marker(&stderr_full, max_output_bytes); - - if output.status.success() { - Ok(vec![ToolResult::ok( - json!({ - "success": true, - "output": stdout, - "stderr": stderr, - "stdout_truncated": stdout_truncated, - "stderr_truncated": stderr_truncated, - "exit_code": output.status.code(), - "elapsed_ms": elapsed_ms, - "script_type": script_type, - }), - Some(if stdout.is_empty() { - format!("Script executed in {} ms", elapsed_ms) - } else { - stdout.lines().take(1).collect::() - }), - )]) - } else { - Ok(err_response( - "system", - "run_script", - ControlHubError::new( - ErrorCode::Internal, - format!( - "Script exited with {:?}: {}", - output.status.code(), - stderr.lines().next().unwrap_or("(no stderr)") - ), - ) - .with_hints([ - format!("stderr={}", stderr), - format!("elapsed_ms={}", elapsed_ms), - ]), - )) - } - } - "get_os_info" => { - let os = std::env::consts::OS; - let arch = std::env::consts::ARCH; - // Phase 4: include OS version + hostname when available so - // the model can adapt platform-specific paths / commands. 
- let mut info = json!({ - "os": os, - "arch": arch, - "rust_target_family": std::env::consts::FAMILY, - }); - if let Some(v) = read_os_version() { - info["os_version"] = json!(v); - } - if let Ok(host) = hostname() { - info["hostname"] = json!(host); - } - // Linux-only: surface display server (X11 / Wayland) and the - // current desktop environment so the model can pick the right - // clipboard helper / window manipulation strategy without a - // separate `run_script` round-trip. - #[cfg(target_os = "linux")] - { - let (display_server, desktop_env) = linux_session_info(); - if let Some(s) = display_server { - info["display_server"] = json!(s); + "get_url" => { + let url = actions.get_url().await?; + Ok(vec![ToolResult::ok( + json!({ "url": url }), + Some(url), + )]) } - if let Some(d) = desktop_env { - info["desktop_environment"] = json!(d); + "get_title" => { + let title = actions.get_title().await?; + Ok(vec![ToolResult::ok( + json!({ "title": title }), + Some(title), + )]) } - } - // The set of `script_type` values the host can actually run. - // Discoverability win: model no longer has to spawn a doomed - // run_script call to learn that bash is missing on Windows. - let mut script_types = vec!["shell"]; - if cfg!(target_os = "macos") { - script_types.push("applescript"); - } - if which_exists("bash") { - script_types.push("bash"); - } - if which_exists("pwsh") || which_exists("powershell") { - script_types.push("powershell"); - } - if cfg!(target_os = "windows") { - script_types.push("cmd"); - } - info["script_types"] = json!(script_types); - Ok(vec![ToolResult::ok( - info.clone(), - Some(format!( - "{} {} ({})", - os, - info.get("os_version").and_then(|v| v.as_str()).unwrap_or(""), - arch - )), - )]) - } - // Cross-context primitive: read the system clipboard. Used by - // models to pick up "what the user just copied" (verification - // codes, selected text, generated SQL, etc.) without driving - // the GUI. 
Returns text only — binary clipboard payloads are - // out of scope. - "clipboard_get" => { - let max_bytes = params - .get("max_bytes") - .and_then(|v| v.as_u64()) - .map(|n| n as usize) - .unwrap_or(64 * 1024) - .clamp(64, 1024 * 1024); - - match clipboard_read().await { - Ok(text) => { - let (truncated, was_truncated) = truncate_with_marker(&text, max_bytes); - let len = text.len(); + "screenshot" => { + let result = actions.screenshot().await?; + let data_len = result + .get("data_length") + .and_then(|v| v.as_u64()) + .unwrap_or(0); Ok(vec![ToolResult::ok( - json!({ - "text": truncated, - "byte_length": len, - "truncated": was_truncated, - }), - Some(format!("{} bytes on clipboard", len)), + result, + Some(format!("Screenshot captured ({} bytes base64)", data_len)), )]) } - Err(e) => Ok(err_response( - "system", - "clipboard_get", - ControlHubError::new( - ErrorCode::NotAvailable, - format!("Clipboard read failed: {}", e), - ) - .with_hints(linux_clipboard_install_hints()), - )), + "evaluate" => { + let expression = params + .get("expression") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool("evaluate requires 'expression'".to_string()) + })?; + // Bound the size of the returned value so a runaway + // `JSON.stringify(document)` can't blow up the model + // context window. Default 16 KiB; clamp to [1 KiB, 256 KiB]. 
+ let max_value_bytes = params + .get("max_value_bytes") + .and_then(|v| v.as_u64()) + .unwrap_or(16 * 1024) + .clamp(1024, 256 * 1024) as usize; + let mut result = actions.evaluate(expression).await?; + let mut truncated = false; + if let Some(value) = result.pointer_mut("/result/value") { + let serialized = value.to_string(); + if serialized.len() > max_value_bytes { + let (clip, was) = + truncate_with_marker(&serialized, max_value_bytes); + truncated = was; + *value = json!(clip); + } + } + if let Some(obj) = result.as_object_mut() { + obj.insert("truncated".to_string(), json!(truncated)); + } + let display = result + .get("result") + .and_then(|r| r.get("value")) + .map(|v| v.to_string()) + .unwrap_or_else(|| result.to_string()); + Ok(vec![ToolResult::ok(result, Some(display))]) + } + "close" => { + let result = actions.close_page().await?; + // After a close, drop the session so subsequent calls + // don't try to talk through a half-dead WebSocket. + browser_sessions().remove(&session.session_id).await; + Ok(vec![ToolResult::ok(result, Some("Page closed".to_string()))]) + } + other => Err(BitFunError::tool(format!( + "Unknown browser action: '{}'. Valid: connect, navigate, snapshot, click, fill, type, select, press_key, scroll, wait, get_text, get_url, get_title, screenshot, evaluate, close, list_pages, switch_page, list_sessions", + other + ))), } } + } + } - // Cross-context primitive: place text on the system clipboard. - // The user can then paste it into ANY app with cmd+v / ctrl+v — - // dramatically simpler than driving each target GUI by hand. 
- "clipboard_set" => { - let text = params.get("text").and_then(|v| v.as_str()).ok_or_else(|| { - BitFunError::tool("clipboard_set requires 'text'".to_string()) - })?; - match clipboard_write(text).await { - Ok(()) => Ok(vec![ToolResult::ok( - json!({ - "success": true, - "byte_length": text.len(), - }), - Some(format!("Wrote {} bytes to clipboard", text.len())), - )]), - Err(e) => Ok(err_response( - "system", - "clipboard_set", - ControlHubError::new( - ErrorCode::NotAvailable, - format!("Clipboard write failed: {}", e), - ) - .with_hints(linux_clipboard_install_hints()), - )), - } - } + // ── Terminal domain ──────────────────────────────────────────────── - // Cross-context primitive: open a URL in the user's default - // browser WITHOUT going through CDP. Use this when the goal is - // "show this URL to the user" rather than "drive this page". - // Avoids the CDP launch round-trip and works even when the - // browser was started without --remote-debugging-port. - "open_url" => { - let url = params - .get("url") - .and_then(|v| v.as_str()) - .ok_or_else(|| BitFunError::tool("open_url requires 'url'".to_string()))?; - if !(url.starts_with("http://") - || url.starts_with("https://") - || url.starts_with("file://") - || url.starts_with("mailto:")) - { + async fn handle_terminal( + &self, + action: &str, + params: &Value, + context: &ToolUseContext, + ) -> BitFunResult> { + // Phase 4: enumerate live terminal sessions so the model can resolve + // a `terminal_session_id` *before* attempting `kill` / `interrupt`. + // Previously this required digging through earlier `Bash` results. 
+ if action == "list_sessions" { + let api = crate::service::terminal::api::TerminalApi::from_singleton() + .map_err(|e| BitFunError::tool(format!("TerminalApi unavailable: {}", e)))?; + let sessions = api + .list_sessions() + .await + .map_err(|e| BitFunError::tool(format!("list_sessions failed: {}", e)))?; + let summary: Vec = sessions + .iter() + .map(|s| { + json!({ + "terminal_session_id": s.id, + "name": s.name, + "cwd": s.cwd, + "pid": s.pid, + "status": s.status, + }) + }) + .collect(); + let count = summary.len(); + return Ok(vec![ToolResult::ok( + json!({ "sessions": summary, "count": count }), + Some(format!("{} terminal session(s) live", count)), + )]); + } + + // UX shortcut: when there is exactly one live terminal session, + // make `terminal_session_id` optional. The 95th-percentile flow is + // "Bash launched a long-running command, please interrupt it" and + // the user has no other terminals open — forcing a `list_sessions` + // round-trip just to copy the only id back wastes a turn. 
+ let resolved_id: String = match params.get("terminal_session_id").and_then(|v| v.as_str()) { + Some(s) => s.to_string(), + None => { + let api = crate::service::terminal::api::TerminalApi::from_singleton() + .map_err(|e| BitFunError::tool(format!("TerminalApi unavailable: {}", e)))?; + let sessions = api + .list_sessions() + .await + .map_err(|e| BitFunError::tool(format!("list_sessions failed: {}", e)))?; + let live: Vec<_> = sessions + .iter() + .filter(|s| { + s.status.eq_ignore_ascii_case("running") + || s.status.eq_ignore_ascii_case("active") + || s.status.eq_ignore_ascii_case("idle") + }) + .collect(); + if live.len() == 1 { + live[0].id.clone() + } else if live.is_empty() { return Ok(err_response( - "system", - "open_url", + "terminal", + action, ControlHubError::new( - ErrorCode::InvalidParams, - format!("Refusing to open URL with unsupported scheme: {}", url), + ErrorCode::MissingSession, + "No live terminal sessions to target", ) .with_hint( - "Pass an http(s)://, file://, or mailto: URL. Use 'open_file' for local paths without a scheme.", + "Use the Bash tool to start a command, then this action becomes meaningful", ), )); - } - // NOTE: do NOT reuse platform_open_command — that helper - // is for *apps* (uses `open -a` on macOS) and would treat - // the URL as an application name, failing immediately. - // - // Windows: must NOT route through `cmd /C start "" `. - // `cmd` interprets `&`, `^`, `%`, `|` in the URL — so a query - // string like `?a=1&b=2` gets the second arg dropped, and - // long URLs may be silently truncated. Use rundll32 with the - // URL protocol handler so the URL is passed verbatim and - // routed through the same default-handler resolution Windows - // uses for "Open in Browser" shell verbs. 
- let (program, args) = match std::env::consts::OS { - "macos" => ("open".to_string(), vec![url.to_string()]), - "windows" => ( - "rundll32".to_string(), - vec![ - "url.dll,FileProtocolHandler".to_string(), - url.to_string(), - ], - ), - _ => ("xdg-open".to_string(), vec![url.to_string()]), - }; - let status = std::process::Command::new(&program) - .args(&args) - .status() - .map_err(|e| { - BitFunError::tool(format!("Failed to spawn '{}': {}", program, e)) - })?; - if status.success() { - Ok(vec![ToolResult::ok( - json!({ "opened": true, "url": url, "method": program }), - Some(format!("Opened {} in default handler", url)), - )]) } else { - Ok(err_response( - "system", - "open_url", - ControlHubError::new( - ErrorCode::Internal, - format!("'{}' exited with {:?}", program, status.code()), - ), - )) - } - } - - // Cross-context primitive: open a local file with its default - // handler (or an explicitly named app on macOS). High-frequency - // for "open this PDF / picture / spreadsheet for me". 
- "open_file" => { - let path_str = params.get("path").and_then(|v| v.as_str()).ok_or_else(|| { - BitFunError::tool("open_file requires 'path'".to_string()) - })?; - let app_name = params.get("app").and_then(|v| v.as_str()); - - let path = std::path::Path::new(path_str); - if !path.exists() { + let ids: Vec<&str> = live.iter().map(|s| s.id.as_str()).collect(); return Ok(err_response( - "system", - "open_file", + "terminal", + action, ControlHubError::new( - ErrorCode::NotFound, - format!("File does not exist: {}", path_str), + ErrorCode::Ambiguous, + format!( + "{} live terminal sessions; pass 'terminal_session_id' to disambiguate", + live.len() + ), ) - .with_hint("Check the absolute path; ~ is not expanded"), + .with_hint(format!("live_session_ids={:?}", ids)) + .with_hint("Call terminal.list_sessions to see names + cwd"), )); } - - let (program, args) = match (std::env::consts::OS, app_name) { - ("macos", Some(app)) => ( - "open".to_string(), - vec!["-a".to_string(), app.to_string(), path_str.to_string()], - ), - ("macos", None) => ("open".to_string(), vec![path_str.to_string()]), - // Windows file open: same rundll32 dance as open_url so - // paths with `&` / `%` survive intact when cmd would have - // mangled them. ShellExec_RunDLL also accepts file paths. 
- ("windows", _) => ( - "rundll32".to_string(), - vec![ - "url.dll,FileProtocolHandler".to_string(), - path_str.to_string(), - ], - ), - _ => ("xdg-open".to_string(), vec![path_str.to_string()]), - }; - let status = std::process::Command::new(&program) - .args(&args) - .status() - .map_err(|e| { - BitFunError::tool(format!("Failed to spawn '{}': {}", program, e)) - })?; - if status.success() { - Ok(vec![ToolResult::ok( - json!({ - "opened": true, - "path": path_str, - "with_app": app_name, - "method": program, - }), - Some(match app_name { - Some(a) => format!("Opened {} with {}", path_str, a), - None => format!("Opened {} with default handler", path_str), - }), - )]) - } else { - Ok(err_response( - "system", - "open_file", - ControlHubError::new( - ErrorCode::Internal, - format!("'{}' exited with {:?}", program, status.code()), - ), - )) - } } + }; - other => Err(BitFunError::tool(format!( - "Unknown system action: '{}'. Valid: open_app, run_script, get_os_info, open_url, open_file, clipboard_get, clipboard_set", - other - ))), + let mut input = params.clone(); + if let Value::Object(ref mut map) = input { + map.insert("action".to_string(), json!(action)); + map.insert("terminal_session_id".to_string(), json!(resolved_id)); } - } -} -/// Truncate `s` to at most `max_bytes`, appending an explicit marker so the -/// model can see that data was dropped (and how much). Returns -/// `(truncated_string, was_truncated)`. -fn truncate_with_marker(s: &str, max_bytes: usize) -> (String, bool) { - if s.len() <= max_bytes { - return (s.to_string(), false); + let tool = super::terminal_control_tool::TerminalControlTool::new(); + tool.call_impl(&input, context).await } - let head_n = max_bytes.saturating_sub(64); - let head = safe_str_slice(s, head_n); - let omitted = s.len().saturating_sub(head_n); - ( - format!("{}\n... 
[{} bytes omitted] ...\n", head, omitted), - true, - ) } /// Parse a leading `"[CODE] rest"` prefix produced by the front-end @@ -3462,345 +1189,6 @@ fn parse_hints_suffix(input: &str) -> (String, Vec) { } } -/// Slice `s` to ≤ `n` bytes without splitting a UTF-8 codepoint. -fn safe_str_slice(s: &str, n: usize) -> &str { - if n >= s.len() { - return s; - } - let mut cut = n; - while cut > 0 && !s.is_char_boundary(cut) { - cut -= 1; - } - &s[..cut] -} - -/// Read a short OS version string. Best-effort: returns `None` on platforms -/// where we can't determine it cheaply. -fn read_os_version() -> Option { - #[cfg(target_os = "macos")] - { - let out = std::process::Command::new("sw_vers") - .arg("-productVersion") - .output() - .ok()?; - let s = String::from_utf8_lossy(&out.stdout).trim().to_string(); - if s.is_empty() { - None - } else { - Some(format!("macOS {}", s)) - } - } - #[cfg(target_os = "windows")] - { - let out = std::process::Command::new("cmd") - .args(["/C", "ver"]) - .output() - .ok()?; - let s = String::from_utf8_lossy(&out.stdout).trim().to_string(); - if s.is_empty() { - None - } else { - Some(s) - } - } - #[cfg(target_os = "linux")] - { - // /etc/os-release is the canonical lookup. - let txt = std::fs::read_to_string("/etc/os-release").ok()?; - for line in txt.lines() { - if let Some(rest) = line.strip_prefix("PRETTY_NAME=") { - return Some(rest.trim_matches('"').to_string()); - } - } - None - } - #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] - { - None - } -} - -fn hostname() -> std::io::Result { - // Prefer environment variables on each OS so we never have to spawn a - // subprocess for a value that's already in our address space, and so we - // never ingest a non-UTF-8 byte stream from `hostname.exe` on Windows - // running a CJK code page. 
- #[cfg(target_os = "windows")] - { - if let Ok(name) = std::env::var("COMPUTERNAME") { - if !name.is_empty() { - return Ok(name); - } - } - } - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - if let Ok(name) = std::env::var("HOSTNAME") { - if !name.is_empty() { - return Ok(name); - } - } - if let Ok(bytes) = std::fs::read("/etc/hostname") { - let s = String::from_utf8_lossy(&bytes).trim().to_string(); - if !s.is_empty() { - return Ok(s); - } - } - } - let out = std::process::Command::new("hostname").output()?; - Ok(String::from_utf8_lossy(&out.stdout).trim().to_string()) -} - -/// Cheap PATH lookup for an executable name. Used to decide between e.g. -/// `pwsh` and `powershell`, or to surface a structured `NOT_AVAILABLE` -/// error when the requested interpreter isn't installed. -fn which_exists(name: &str) -> bool { - let paths = match std::env::var_os("PATH") { - Some(p) => p, - None => return false, - }; - let exts: Vec = if cfg!(target_os = "windows") { - std::env::var("PATHEXT") - .unwrap_or_else(|_| ".EXE;.BAT;.CMD;.COM".to_string()) - .split(';') - .map(|s| s.to_string()) - .collect() - } else { - vec![String::new()] - }; - for dir in std::env::split_paths(&paths) { - for ext in &exts { - let mut candidate = dir.join(name); - if !ext.is_empty() { - let stem = candidate.file_name().map(|n| n.to_os_string()); - if let Some(mut stem) = stem { - stem.push(ext); - candidate.set_file_name(stem); - } - } - if candidate.exists() { - return true; - } - } - } - false -} - -/// Build a `(program, args)` pair for invoking a PowerShell snippet on Windows -/// with UTF-8 output forced. Centralised so the "shell" alias and an explicit -/// `script_type='powershell'` produce the same encoding. 
-#[cfg(target_os = "windows")] -fn powershell_invocation(script: &str) -> (String, Vec) { - let prog = if which_exists("pwsh") { - "pwsh" - } else { - "powershell" - }; - ( - prog.to_string(), - vec![ - "-NoProfile".to_string(), - "-NonInteractive".to_string(), - "-Command".to_string(), - format!( - "[Console]::OutputEncoding=[Text.Encoding]::UTF8; {}", - script - ), - ], - ) -} - -/// Build OS-specific install hints for the clipboard helper. On Linux we -/// inspect the session type so the suggestion matches what the user actually -/// needs (Wayland users wasting time installing xclip is a real failure mode). -fn linux_clipboard_install_hints() -> Vec { - match std::env::consts::OS { - "linux" => { - #[cfg(target_os = "linux")] - { - let (server, _) = linux_session_info(); - match server.as_deref() { - Some("wayland") => vec![ - "Wayland session detected — install wl-clipboard (e.g. `sudo apt install wl-clipboard` / `sudo dnf install wl-clipboard`)".to_string(), - "Fallback for XWayland apps: also install xclip or xsel".to_string(), - ], - Some("x11") | Some("tty") => vec![ - "X11 session detected — install xclip (`sudo apt install xclip`) or xsel (`sudo apt install xsel`)".to_string(), - ], - _ => vec![ - "Install wl-clipboard (Wayland) OR xclip/xsel (X11). Run `echo $XDG_SESSION_TYPE` to know which one applies.".to_string(), - ], - } - } - #[cfg(not(target_os = "linux"))] - { - vec!["Install wl-clipboard (Wayland) or xclip/xsel (X11)".to_string()] - } - } - _ => vec!["Make sure the system clipboard helper is available on this host".to_string()], - } -} - -/// Best-effort detection of the Linux desktop session metadata (display -/// server + desktop environment). Returns `(display_server, desktop_env)`, -/// either of which may be `None` if the environment doesn't expose it. 
-#[cfg(target_os = "linux")] -fn linux_session_info() -> (Option, Option) { - let server = std::env::var("XDG_SESSION_TYPE") - .ok() - .filter(|s| !s.is_empty()); - let de = std::env::var("XDG_CURRENT_DESKTOP") - .ok() - .or_else(|| std::env::var("DESKTOP_SESSION").ok()) - .filter(|s| !s.is_empty()); - (server, de) -} - -/// Cross-platform clipboard read. Shells out to the canonical helper for -/// the current OS so we don't pull in a heavyweight dependency for what is -/// fundamentally a 1-line operation. Linux auto-detects Wayland → X11. -async fn clipboard_read() -> Result { - #[cfg(target_os = "macos")] - { - let out = tokio::process::Command::new("pbpaste") - .output() - .await - .map_err(|e| format!("spawn pbpaste: {}", e))?; - if !out.status.success() { - return Err(format!("pbpaste exit={:?}", out.status.code())); - } - Ok(String::from_utf8_lossy(&out.stdout).to_string()) - } - #[cfg(target_os = "windows")] - { - let out = tokio::process::Command::new("powershell") - .args(["-NoProfile", "-Command", "Get-Clipboard -Raw"]) - .output() - .await - .map_err(|e| format!("spawn powershell: {}", e))?; - if !out.status.success() { - return Err(format!("Get-Clipboard exit={:?}", out.status.code())); - } - // PowerShell appends CRLF; trim a single trailing newline so the - // returned text matches what the user actually copied. - let mut s = String::from_utf8_lossy(&out.stdout).to_string(); - if s.ends_with("\r\n") { - s.truncate(s.len() - 2); - } else if s.ends_with('\n') { - s.truncate(s.len() - 1); - } - Ok(s) - } - #[cfg(target_os = "linux")] - { - // Wayland first (modern session), then X11 fallbacks. 
- let candidates: &[(&str, &[&str])] = if std::env::var("WAYLAND_DISPLAY").is_ok() { - &[ - ("wl-paste", &["--no-newline"]), - ("xclip", &["-selection", "clipboard", "-o"]), - ("xsel", &["--clipboard", "--output"]), - ] - } else { - &[ - ("xclip", &["-selection", "clipboard", "-o"]), - ("xsel", &["--clipboard", "--output"]), - ("wl-paste", &["--no-newline"]), - ] - }; - for (bin, args) in candidates { - if let Ok(out) = tokio::process::Command::new(bin).args(*args).output().await { - if out.status.success() { - return Ok(String::from_utf8_lossy(&out.stdout).to_string()); - } - } - } - Err("no clipboard helper found (install wl-clipboard, xclip, or xsel)".to_string()) - } - #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] - { - Err("clipboard not implemented for this OS".to_string()) - } -} - -/// Cross-platform clipboard write. Streams `text` into the helper's stdin -/// rather than embedding it in argv so newlines / quotes / shell metachars -/// are preserved verbatim. 
-async fn clipboard_write(text: &str) -> Result<(), String> { - use tokio::io::AsyncWriteExt; - - async fn pipe(bin: &str, args: &[&str], text: &str) -> Result<(), String> { - let mut child = tokio::process::Command::new(bin) - .args(args) - .stdin(std::process::Stdio::piped()) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::piped()) - .spawn() - .map_err(|e| format!("spawn {}: {}", bin, e))?; - if let Some(mut stdin) = child.stdin.take() { - stdin - .write_all(text.as_bytes()) - .await - .map_err(|e| format!("write {} stdin: {}", bin, e))?; - } - let out = child - .wait_with_output() - .await - .map_err(|e| format!("wait {}: {}", bin, e))?; - if !out.status.success() { - return Err(format!("{} exit={:?}", bin, out.status.code())); - } - Ok(()) - } - - #[cfg(target_os = "macos")] - { - pipe("pbcopy", &[], text).await - } - #[cfg(target_os = "windows")] - { - // PowerShell's Set-Clipboard reads from the pipeline; pipe text in - // via stdin to preserve binary fidelity. 
- pipe( - "powershell", - &["-NoProfile", "-Command", "$input | Set-Clipboard"], - text, - ) - .await - } - #[cfg(target_os = "linux")] - { - let candidates: &[(&str, &[&str])] = if std::env::var("WAYLAND_DISPLAY").is_ok() { - &[ - ("wl-copy", &[]), - ("xclip", &["-selection", "clipboard"]), - ("xsel", &["--clipboard", "--input"]), - ] - } else { - &[ - ("xclip", &["-selection", "clipboard"]), - ("xsel", &["--clipboard", "--input"]), - ("wl-copy", &[]), - ] - }; - let mut last_err = String::new(); - for (bin, args) in candidates { - match pipe(bin, args, text).await { - Ok(()) => return Ok(()), - Err(e) => last_err = e, - } - } - Err(format!( - "no clipboard helper succeeded (install wl-clipboard, xclip, or xsel): {}", - last_err - )) - } - #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] - { - let _ = text; - Err("clipboard not implemented for this OS".to_string()) - } -} - #[async_trait] impl Tool for ControlHubTool { fn name(&self) -> &str { @@ -3808,14 +1196,14 @@ impl Tool for ControlHubTool { } async fn description(&self) -> BitFunResult { - Ok(Self::description_text(Self::desktop_domain_enabled().await)) + Ok(Self::description_text()) } async fn description_with_context( &self, _context: Option<&ToolUseContext>, ) -> BitFunResult { - Ok(Self::description_text(Self::desktop_domain_enabled().await)) + Ok(Self::description_text()) } fn input_schema(&self) -> Value { @@ -3824,7 +1212,7 @@ impl Tool for ControlHubTool { "properties": { "domain": { "type": "string", - "enum": ["browser", "desktop", "terminal", "system", "meta"], + "enum": ["browser", "terminal", "meta"], "description": "The control domain to target." }, "action": { @@ -3920,7 +1308,7 @@ impl Tool for ControlHubTool { "?", action, ControlHubError::new(ErrorCode::InvalidParams, "Missing required field 'domain'.") - .with_hint("Set domain to one of: app, browser, desktop, terminal, system."), + .with_hint("Set domain to one of: browser, terminal, meta. 
Use ComputerUse for desktop/system actions."), )); } if action.is_empty() { @@ -4048,6 +1436,9 @@ fn map_dispatch_error(domain: &str, _action: &str, err: BitFunError) -> ControlH #[cfg(test)] mod control_hub_tests { use super::*; + use crate::agentic::tools::implementations::computer_use_actions::{ + linux_clipboard_install_hints, ComputerUseActions, + }; fn empty_context() -> ToolUseContext { ToolUseContext { @@ -4073,7 +1464,7 @@ mod control_hub_tests { .expect_err("unknown domain must error"); let msg = err.to_string(); assert!(msg.contains("Unknown domain"), "got: {msg}"); - for d in ["desktop", "browser", "terminal", "system", "meta"] { + for d in ["browser", "terminal", "meta", "ComputerUse"] { assert!( msg.contains(d), "valid domain {d} missing from error: {msg}" @@ -4091,22 +1482,14 @@ mod control_hub_tests { .expect("capabilities should succeed"); let payload = results.first().expect("one result").content(); let domains = payload.get("domains").expect("domains present"); - for d in ["desktop", "browser", "terminal", "system", "meta"] { + for d in ["browser", "terminal", "meta"] { assert!( domains.get(d).is_some(), "domain {d} missing from capabilities payload: {payload}" ); } - // Without a desktop host wired into the test context, desktop/app/terminal - // must report unavailable so the model doesn't waste turns calling them. - assert_eq!( - domains - .get("desktop") - .and_then(|v| v.get("available")) - .and_then(|v| v.as_bool()), - Some(false), - "desktop must be unavailable without a host" - ); + assert!(domains.get("desktop").is_none()); + assert!(domains.get("system").is_none()); assert_eq!( payload .get("host") @@ -4278,25 +1661,20 @@ mod control_hub_tests { } #[tokio::test] - async fn description_advertises_paste_as_canonical_text_input_when_desktop_available() { - // The full paste guidance is only embedded when the desktop domain is - // available in the current runtime. 
- if !ControlHubTool::desktop_domain_enabled().await { - return; - } + async fn description_points_desktop_and_system_work_to_computer_use() { let desc = ControlHubTool::new().description().await.unwrap(); assert!( - desc.contains("`paste"), - "description must call out `paste` as a first-class action" + desc.contains("ComputerUse"), + "description must point local computer work to ComputerUse" ); assert!( - desc.contains("PREFER") || desc.contains("prefer") || desc.contains("STRONGLY"), - "description must steer the model AWAY from type_text for non-trivial input" + !desc.contains("domain: \"desktop\"") && !desc.contains("domain: \"system\""), + "ControlHub description must not advertise desktop/system domains" ); } #[tokio::test] - async fn description_documents_two_browser_modes_and_forbids_desktop_browser_automation() { + async fn description_documents_two_browser_modes() { let desc = ControlHubTool::new().description().await.unwrap(); assert!( desc.contains("Two browser modes"), @@ -4306,53 +1684,31 @@ mod control_hub_tests { desc.contains("mode: \"headless\"") && desc.contains("mode: \"default\""), "description must mention both browser connect modes" ); - assert!( - desc.contains( - "Do **not** use `domain: \"desktop\"` mouse/keyboard actions to drive a browser." - ), - "description must explicitly forbid desktop browser automation" - ); } #[tokio::test] - async fn desktop_paste_without_host_returns_clean_error() { - // In unit tests there is no ComputerUseHost. Depending on whether the - // desktop domain is enabled for this runtime, dispatch either returns a - // structured NOT_AVAILABLE result envelope immediately, or reaches the - // host check and returns a tool error. Both are acceptable as long as - // the failure is clean and non-panicking. 
+ async fn desktop_domain_returns_migration_error() { let tool = ControlHubTool::new(); let ctx = empty_context(); - let result = tool + let results = tool .dispatch( "desktop", "paste", &json!({ "text": "hi", "submit": true }), &ctx, ) - .await; - - match result { - Ok(results) => { - let payload = results.first().expect("one result").content(); - assert_eq!(payload.get("ok").and_then(|v| v.as_bool()), Some(false)); - assert_eq!( - payload - .get("error") - .and_then(|v| v.get("code")) - .and_then(|v| v.as_str()), - Some("NOT_AVAILABLE") - ); - } - Err(err) => { - assert!( - err.to_string().contains("Desktop control") - || err.to_string().contains("Computer Use"), - "expected desktop availability hint, got: {}", - err - ); - } - } + .await + .expect("migration error is a structured result"); + let payload = results.first().expect("one result").content(); + assert_eq!(payload.get("ok").and_then(|v| v.as_bool()), Some(false)); + assert_eq!( + payload + .get("error") + .and_then(|v| v.get("code")) + .and_then(|v| v.as_str()), + Some("INVALID_PARAMS") + ); + assert!(payload.to_string().contains("ComputerUse")); } #[tokio::test] @@ -4386,15 +1742,10 @@ mod control_hub_tests { #[tokio::test] async fn system_open_url_rejects_unsupported_scheme() { - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let results = tool - .dispatch( - "system", - "open_url", - &json!({ "url": "javascript:alert(1)" }), - &ctx, - ) + .handle_system("open_url", &json!({ "url": "javascript:alert(1)" }), &ctx) .await .expect("dispatch should succeed and return a structured error"); let payload: serde_json::Value = @@ -4405,11 +1756,10 @@ mod control_hub_tests { #[tokio::test] async fn system_open_file_returns_not_found_for_missing_path() { - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let results = tool - .dispatch( - "system", + .handle_system( "open_file", &json!({ "path": 
"/definitely/does/not/exist/bitfun-test.xyz" }), &ctx, @@ -4439,33 +1789,13 @@ mod control_hub_tests { "schema_version must be bumped to 1.1: {payload}" ); - // system.script_types must always include `shell`. - let script_types = payload - .get("domains") - .and_then(|d| d.get("system")) - .and_then(|s| s.get("script_types")) - .and_then(|v| v.as_array()) - .expect("system.script_types missing"); assert!( - script_types.iter().any(|s| s.as_str() == Some("shell")), - "script_types must include 'shell': {script_types:?}" + payload + .get("domains") + .and_then(|d| d.get("system")) + .is_none(), + "system must not be advertised by ControlHub capabilities: {payload}" ); - // On macOS we must additionally see applescript. - if cfg!(target_os = "macos") { - assert!( - script_types - .iter() - .any(|s| s.as_str() == Some("applescript")), - "macOS host must advertise applescript: {script_types:?}" - ); - } - // On Windows we must additionally see cmd. - if cfg!(target_os = "windows") { - assert!( - script_types.iter().any(|s| s.as_str() == Some("cmd")), - "Windows host must advertise cmd: {script_types:?}" - ); - } // browser.default_browser key must exist (value may be null on hosts // without any installed browser, but the field must be present so @@ -4482,10 +1812,10 @@ mod control_hub_tests { #[tokio::test] async fn system_get_os_info_includes_script_types() { - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let results = tool - .dispatch("system", "get_os_info", &json!({}), &ctx) + .handle_system("get_os_info", &json!({}), &ctx) .await .expect("get_os_info should succeed"); let payload = results.first().unwrap().content(); @@ -4504,11 +1834,10 @@ mod control_hub_tests { if cfg!(target_os = "macos") { return; // skip on macOS where applescript is genuinely available } - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let results = tool - .dispatch( - "system", + 
.handle_system( "run_script", &json!({ "script": "say hi", "script_type": "applescript" }), &ctx, @@ -4522,11 +1851,10 @@ mod control_hub_tests { #[tokio::test] async fn system_run_script_unknown_type_lists_valid_options() { - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let err = tool - .dispatch( - "system", + .handle_system( "run_script", &json!({ "script": "echo hi", "script_type": "ruby" }), &ctx, @@ -4567,7 +1895,7 @@ mod control_hub_tests { // the right interpreter and that we get UTF-8 stdout back. This // protects against the historical Windows GBK regression where // CJK output became `???`. - let tool = ControlHubTool::new(); + let tool = ComputerUseActions::new(); let ctx = empty_context(); let probe = if cfg!(target_os = "windows") { // PowerShell prints with the Unicode code page configured above. @@ -4576,8 +1904,7 @@ mod control_hub_tests { "echo hello-bitfun" }; let results = tool - .dispatch( - "system", + .handle_system( "run_script", &json!({ "script": probe, "script_type": "shell" }), &ctx, diff --git a/src/crates/core/src/agentic/tools/implementations/mod.rs b/src/crates/core/src/agentic/tools/implementations/mod.rs index 3b389d6f4..c8a07ce63 100644 --- a/src/crates/core/src/agentic/tools/implementations/mod.rs +++ b/src/crates/core/src/agentic/tools/implementations/mod.rs @@ -3,6 +3,7 @@ pub mod ask_user_question_tool; pub mod bash_tool; pub mod code_review_tool; +pub mod computer_use_actions; pub mod computer_use_input; pub mod computer_use_locate; pub mod computer_use_mouse_click_tool; diff --git a/src/crates/core/src/agentic/tools/registry.rs b/src/crates/core/src/agentic/tools/registry.rs index b68673871..984508a0d 100644 --- a/src/crates/core/src/agentic/tools/registry.rs +++ b/src/crates/core/src/agentic/tools/registry.rs @@ -147,13 +147,10 @@ impl ToolRegistry { // MiniApp Agent tool (single InitMiniApp) self.register_tool(Arc::new(InitMiniAppTool::new())); - // ControlHub — sole 
unified control entry point that aggregates ALL control - // capabilities (desktop, browser, terminal, system, meta) into a single - // tool. Legacy split control tools are intentionally NOT registered - // here: their implementations are kept internal where needed and reused by - // ControlHub, but the model only ever sees one control tool to eliminate - // cross-tool selection mistakes. + // ControlHub — unified browser/terminal/meta control entry point. + // Local desktop and OS/system Computer Use is exposed as a dedicated tool. self.register_tool(Arc::new(ControlHubTool::new())); + self.register_tool(Arc::new(ComputerUseTool::new())); // Playbook — predefined step-by-step operation guides for common tasks. self.register_tool(Arc::new(PlaybookTool::new())); @@ -205,19 +202,16 @@ mod tests { assert!(registry.get_tool("Cron").is_some()); } - /// Phase 0 contract: ControlHub is the sole control entry point. Legacy - /// split control tools must NOT be visible to the model; their - /// implementations are reused internally only. 
#[test] - fn registry_exposes_controlhub_only_for_control_capabilities() { + fn registry_exposes_controlhub_and_computer_use() { let registry = create_tool_registry(); assert!( registry.get_tool("ControlHub").is_some(), - "ControlHub must be registered as the unified control tool" + "ControlHub must remain registered for browser/terminal/meta control" ); assert!( - registry.get_tool("ComputerUse").is_none(), - "Legacy split control tools must remain hidden (Phase 0 dedup)" + registry.get_tool("ComputerUse").is_some(), + "ComputerUse must be registered as the dedicated desktop automation tool" ); } diff --git a/src/web-ui/src/app/scenes/agents/AgentsScene.tsx b/src/web-ui/src/app/scenes/agents/AgentsScene.tsx index dfca9ab0f..d4f12c966 100644 --- a/src/web-ui/src/app/scenes/agents/AgentsScene.tsx +++ b/src/web-ui/src/app/scenes/agents/AgentsScene.tsx @@ -220,6 +220,11 @@ const AgentsHomeView: React.FC = () => { accentColor: '#14b8a6', accentBg: 'rgba(20,184,166,0.10)', }, + ComputerUse: { + role: t('coreAgentsZone.modes.computerUse.role'), + accentColor: '#f59e0b', + accentBg: 'rgba(245,158,11,0.10)', + }, }), [t]); const coreAgents = useMemo(() => allAgents.filter((agent) => CORE_AGENT_IDS.has(agent.id)), [allAgents]); diff --git a/src/web-ui/src/app/scenes/agents/agentVisibility.ts b/src/web-ui/src/app/scenes/agents/agentVisibility.ts index 0b9ca30a8..4b8c6ef24 100644 --- a/src/web-ui/src/app/scenes/agents/agentVisibility.ts +++ b/src/web-ui/src/app/scenes/agents/agentVisibility.ts @@ -2,7 +2,7 @@ export const HIDDEN_AGENT_IDS = new Set(['Claw']); /** Core mode agents shown in the top zone only; excluded from overview zone list and counts. */ -export const CORE_AGENT_IDS = new Set(['agentic', 'Cowork']); +export const CORE_AGENT_IDS = new Set(['agentic', 'Cowork', 'ComputerUse']); /** Agents that appear in the bottom overview grid (same pool as filter chip counts). 
*/ export function isAgentInOverviewZone(agent: { id: string }): boolean { diff --git a/src/web-ui/src/app/scenes/agents/utils.ts b/src/web-ui/src/app/scenes/agents/utils.ts index 4eee4045d..760126fa1 100644 --- a/src/web-ui/src/app/scenes/agents/utils.ts +++ b/src/web-ui/src/app/scenes/agents/utils.ts @@ -64,6 +64,7 @@ function enrichCapabilities(agent: AgentWithCapabilities): AgentWithCapabilities if (id === 'plan') return { ...agent, capabilities: [{ category: 'analysis', level: 5 }, { category: 'docs', level: 3 }] }; if (id === 'debug') return { ...agent, capabilities: [{ category: 'coding', level: 5 }, { category: 'analysis', level: 3 }] }; if (id === 'cowork') return { ...agent, capabilities: [{ category: 'analysis', level: 4 }, { category: 'creative', level: 3 }] }; + if (id === 'computeruse') return { ...agent, capabilities: [{ category: 'ops', level: 5 }, { category: 'analysis', level: 3 }] }; if (id === 'deepresearch') return { ...agent, capabilities: [{ category: 'analysis', level: 5 }, { category: 'docs', level: 4 }] }; } diff --git a/src/web-ui/src/locales/en-US/flow-chat.json b/src/web-ui/src/locales/en-US/flow-chat.json index 66694f202..a1856ecaa 100644 --- a/src/web-ui/src/locales/en-US/flow-chat.json +++ b/src/web-ui/src/locales/en-US/flow-chat.json @@ -254,6 +254,7 @@ "Plan": "Plan first, execute later — clarify requirements and create an implementation plan before coding", "debug": "Evidence-driven systematic debugging: form hypotheses, gather runtime evidence, and fix with confidence", "Cowork": "Collaborative mode: clarify first, track progress lightly, verify outcomes anytime", + "ComputerUse": "Dedicated desktop automation agent for perceiving and operating apps, browsers, and OS UI", "DeepResearch": "Deep research with parallel sub-agents: dispatches multiple agents to investigate concurrently, producing comprehensive sourced reports", "Team": "Virtual engineering team: CEO, Eng Manager, Designer, QA Lead, Security Officer, Release Engineer 
— orchestrated through a full sprint workflow" }, @@ -263,6 +264,7 @@ "Plan": "Plan", "debug": "Debug", "Cowork": "Cowork", + "ComputerUse": "Computer Use", "DeepResearch": "Deep Research", "Team": "Team" }, diff --git a/src/web-ui/src/locales/en-US/scenes/agents.json b/src/web-ui/src/locales/en-US/scenes/agents.json index 6a2d89abf..59c1bf9b5 100644 --- a/src/web-ui/src/locales/en-US/scenes/agents.json +++ b/src/web-ui/src/locales/en-US/scenes/agents.json @@ -28,12 +28,19 @@ }, "coreAgentsZone": { "title": "Core Agents", - "subtitle": "Built-in core agent modes covering mainstream AI workflows, ready to use out of the box.", + "subtitle": "Built-in core agents covering mainstream AI workflows, ready to use out of the box.", "empty": "No core agents detected", "roleLabel": "Primary Use · ", "modes": { - "agentic": { "role": "Coding specialist agent" }, - "cowork": { "role": "Office & collaboration agent" } + "agentic": { + "role": "Coding specialist agent" + }, + "cowork": { + "role": "Office & collaboration agent" + }, + "computerUse": { + "role": "Desktop automation agent" + } } }, "agentsZone": { diff --git a/src/web-ui/src/locales/zh-CN/flow-chat.json b/src/web-ui/src/locales/zh-CN/flow-chat.json index d1448ffaa..4670248c7 100644 --- a/src/web-ui/src/locales/zh-CN/flow-chat.json +++ b/src/web-ui/src/locales/zh-CN/flow-chat.json @@ -254,6 +254,7 @@ "Plan": "先规划后执行,先明确需求并制定实施计划,再进行编码", "debug": "证据驱动的系统化调试:提出假设、收集运行时证据、精准定位并修复问题", "Cowork": "协作模式:先澄清再推进,轻量跟踪进度,随时验证结果", + "ComputerUse": "独立电脑操作智能体:感知本机环境,操作应用、浏览器与系统界面", "DeepResearch": "深度研究:并行派发多个子 Agent 同时调研不同章节,快速生成高质量研究报告", "Team": "虚拟工程团队:CEO、工程经理、设计师、QA 负责人、安全官、发布工程师 — 按完整冲刺流程协同工作" }, @@ -263,6 +264,7 @@ "Plan": "Plan", "debug": "Debug", "Cowork": "Cowork", + "ComputerUse": "Computer Use", "DeepResearch": "Deep Research", "Team": "Team" }, diff --git a/src/web-ui/src/locales/zh-CN/scenes/agents.json b/src/web-ui/src/locales/zh-CN/scenes/agents.json index e3bdeead4..008c93db3 100644 --- 
a/src/web-ui/src/locales/zh-CN/scenes/agents.json +++ b/src/web-ui/src/locales/zh-CN/scenes/agents.json @@ -28,12 +28,19 @@ }, "coreAgentsZone": { "title": "核心智能体", - "subtitle": "平台内置的核心 Agent 模式,覆盖主流 AI 工作流,开箱即用。", + "subtitle": "平台内置的核心 Agent,覆盖主流 AI 工作流,开箱即用。", "empty": "暂未检测到核心智能体", "roleLabel": "主要应用 · ", "modes": { - "agentic": { "role": "编码专业智能体" }, - "cowork": { "role": "办公智能体" } + "agentic": { + "role": "编码专业智能体" + }, + "cowork": { + "role": "办公智能体" + }, + "computerUse": { + "role": "电脑操作智能体" + } } }, "agentsZone": { diff --git a/src/web-ui/src/locales/zh-TW/flow-chat.json b/src/web-ui/src/locales/zh-TW/flow-chat.json index fe00aad60..370f1ec0a 100644 --- a/src/web-ui/src/locales/zh-TW/flow-chat.json +++ b/src/web-ui/src/locales/zh-TW/flow-chat.json @@ -245,6 +245,7 @@ "Plan": "先規劃後執行,先明確需求並制定實施計劃,再進行編碼", "debug": "證據驅動的系統化調試:提出假設、收集運行時證據、精準定位並修復問題", "Cowork": "協作模式:先澄清再推進,輕量跟蹤進度,隨時驗證結果", + "ComputerUse": "獨立電腦操作智能體:感知本機環境,操作應用、瀏覽器與系統介面", "DeepResearch": "深度研究:並行派發多個子 Agent 同時調研不同章節,快速生成高質量研究報告", "Team": "虛擬工程團隊:CEO、工程經理、設計師、QA 負責人、安全官、發佈工程師 — 按完整衝刺流程協同工作" }, @@ -254,6 +255,7 @@ "Plan": "Plan", "debug": "Debug", "Cowork": "Cowork", + "ComputerUse": "Computer Use", "DeepResearch": "Deep Research", "Team": "Team" }, diff --git a/src/web-ui/src/locales/zh-TW/scenes/agents.json b/src/web-ui/src/locales/zh-TW/scenes/agents.json index b3ce4a56a..217c52f5e 100644 --- a/src/web-ui/src/locales/zh-TW/scenes/agents.json +++ b/src/web-ui/src/locales/zh-TW/scenes/agents.json @@ -28,12 +28,19 @@ }, "coreAgentsZone": { "title": "核心智能體", - "subtitle": "平臺內置的核心 Agent 模式,覆蓋主流 AI 工作流,開箱即用。", + "subtitle": "平臺內置的核心 Agent,覆蓋主流 AI 工作流,開箱即用。", "empty": "暫未檢測到核心智能體", "roleLabel": "主要應用 · ", "modes": { - "agentic": { "role": "編碼專業智能體" }, - "cowork": { "role": "辦公智能體" } + "agentic": { + "role": "編碼專業智能體" + }, + "cowork": { + "role": "辦公智能體" + }, + "computerUse": { + "role": "電腦操作智能體" + } } }, "agentsZone": {