diff --git a/Cargo.toml b/Cargo.toml index d05b5b783..ddb6600ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -121,6 +121,7 @@ unic-langid = "0.9" x25519-dalek = { version = "2.0", features = ["static_secrets"] } aes-gcm = "0.10" sha2 = "0.10" +sha1 = "0.10" rand = "0.8" # Device/Network info (Remote Connect) diff --git a/src/apps/desktop/Cargo.toml b/src/apps/desktop/Cargo.toml index 99e5e8d6a..e42f69dbf 100644 --- a/src/apps/desktop/Cargo.toml +++ b/src/apps/desktop/Cargo.toml @@ -49,6 +49,7 @@ reqwest = { workspace = true } thiserror = "1.0" futures = { workspace = true } async-trait = { workspace = true } +sha1 = { workspace = true } screenshots = "0.8" enigo = "0.2" image = { version = "0.24", default-features = false, features = ["png", "jpeg"] } @@ -56,7 +57,7 @@ resvg = { version = "0.47.0", default-features = false } [target.'cfg(target_os = "macos")'.dependencies] core-foundation = "0.9" -core-graphics = "0.23" +core-graphics = { version = "0.23", features = ["elcapitan", "highsierra"] } dispatch = "0.2" objc2 = { version = "0.6", features = ["exception"] } objc2-foundation = "0.3" diff --git a/src/apps/desktop/src/computer_use/desktop_host.rs b/src/apps/desktop/src/computer_use/desktop_host.rs index a978beb2a..b0112f497 100644 --- a/src/apps/desktop/src/computer_use/desktop_host.rs +++ b/src/apps/desktop/src/computer_use/desktop_host.rs @@ -2,13 +2,17 @@ use async_trait::async_trait; use bitfun_core::agentic::tools::computer_use_host::{ - clamp_point_crop_half_extent, ActionRecord, ComputerScreenshot, ComputerUseDisplayInfo, - ComputerUseHost, ComputerUseImageContentRect, ComputerUseImplicitScreenshotCenter, - ComputerUseInteractionScreenshotKind, ComputerUseInteractionState, ComputerUseLastMutationKind, - ComputerUseNavigateQuadrant, ComputerUseNavigationRect, ComputerUsePermissionSnapshot, - ComputerUseScreenshotParams, ComputerUseScreenshotRefinement, ComputerUseSessionSnapshot, - LoopDetectionResult, OcrRegionNative, ScreenshotCropCenter, 
UiElementLocateQuery, - UiElementLocateResult, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, + clamp_point_crop_half_extent, ActionRecord, AppClickParams, AppInfo, AppSelector, + AppStateSnapshot, AppWaitPredicate, ClickTarget, ComputerScreenshot, ComputerUseDisplayInfo, + ComputerUseHost, ComputerUseImageContentRect, ComputerUseImageGlobalBounds, + ComputerUseImplicitScreenshotCenter, ComputerUseInteractionScreenshotKind, + ComputerUseInteractionState, ComputerUseLastMutationKind, ComputerUseNavigateQuadrant, + ComputerUseNavigationRect, ComputerUsePermissionSnapshot, ComputerUseScreenshotParams, + ComputerUseScreenshotRefinement, ComputerUseSessionSnapshot, InteractiveActionResult, + InteractiveClickParams, InteractiveScrollParams, InteractiveTypeTextParams, InteractiveView, + InteractiveViewOpts, LoopDetectionResult, OcrRegionNative, ScreenshotCropCenter, + UiElementLocateQuery, UiElementLocateResult, VisualActionResult, VisualClickParams, VisualMark, + VisualMarkView, VisualMarkViewOpts, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, }; #[cfg(any(target_os = "macos", target_os = "windows"))] @@ -20,11 +24,12 @@ use bitfun_core::util::errors::{BitFunError, BitFunResult}; use enigo::{Axis, Button, Coordinate, Direction, Enigo, Key, Keyboard, Mouse, Settings}; use image::codecs::jpeg::JpegEncoder; use image::{DynamicImage, Rgb, RgbImage}; -use log::{debug, warn}; +use log::{debug, info, warn}; use resvg::tiny_skia::{Pixmap, Transform}; use resvg::usvg; use screenshots::display_info::DisplayInfo; use screenshots::Screen; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Mutex, OnceLock}; use std::time::{Duration, Instant}; @@ -57,6 +62,7 @@ struct PointerPixmapCache { } static POINTER_PIXMAP_CACHE: OnceLock> = OnceLock::new(); +static SCREENSHOT_ID_COUNTER: AtomicU64 = AtomicU64::new(1); fn pointer_pixmap_cache() -> Option<&'static PointerPixmapCache> { POINTER_PIXMAP_CACHE @@ -126,6 +132,75 @@ fn 
blend_pointer_pixmap(img: &mut RgbImage, cx: i32, cy: i32, p: &PointerPixmapC } } +#[cfg(test)] +mod visual_grid_tests { + use super::*; + + #[test] + fn detects_regular_grid_rect_from_synthetic_screenshot() { + let mut img = RgbImage::from_pixel(420, 360, Rgb([245, 245, 245])); + let left = 60u32; + let top = 40u32; + let size = 280u32; + for i in 0..15u32 { + let pos = i * (size - 1) / 14; + for d in 0..2 { + let x = left + pos + d; + if x < left + size { + for y in top..top + size { + img.put_pixel(x, y, Rgb([25, 25, 25])); + } + } + let y = top + pos + d; + if y < top + size { + for x in left..left + size { + img.put_pixel(x, y, Rgb([25, 25, 25])); + } + } + } + } + + let mut bytes = Vec::new(); + JpegEncoder::new_with_quality(&mut bytes, 92) + .encode_image(&DynamicImage::ImageRgb8(img)) + .expect("encode synthetic grid"); + let shot = ComputerScreenshot { + screenshot_id: Some("test-shot".to_string()), + bytes, + mime_type: "image/jpeg".to_string(), + image_width: 420, + image_height: 360, + native_width: 420, + native_height: 360, + display_origin_x: 0, + display_origin_y: 0, + vision_scale: 1.0, + pointer_image_x: None, + pointer_image_y: None, + screenshot_crop_center: None, + point_crop_half_extent_native: None, + navigation_native_rect: None, + quadrant_navigation_click_ready: false, + image_content_rect: Some(ComputerUseImageContentRect { + left: 0, + top: 0, + width: 420, + height: 360, + }), + image_global_bounds: None, + ui_tree_text: None, + implicit_confirmation_crop_applied: false, + }; + + let (x0, y0, width, height) = + detect_regular_grid_rect_from_screenshot(&shot, 15, 15).expect("detect grid"); + assert!((x0 - left as i32).abs() <= 6, "x0={x0}"); + assert!((y0 - top as i32).abs() <= 6, "y0={y0}"); + assert!((width as i32 - size as i32).abs() <= 12, "width={width}"); + assert!((height as i32 - size as i32).abs() <= 12, "height={height}"); + } +} + fn draw_pointer_fallback_cross(img: &mut RgbImage, cx: i32, cy: i32) { const ARM: i32 = 2; const 
OUTLINE: Rgb = Rgb([255, 255, 255]); @@ -551,6 +626,25 @@ impl PointerMap { let gy = self.origin_y as f64 + ty * (nh - 1.0).max(0.0) + 0.5; Ok((gx, gy)) } + + fn image_global_bounds(&self) -> Option { + if self.image_w == 0 || self.image_h == 0 { + return None; + } + let (x0, y0) = self.map_image_to_global_f64(0, 0).ok()?; + let (x1, y1) = self + .map_image_to_global_f64( + self.image_w.saturating_sub(1) as i32, + self.image_h.saturating_sub(1) as i32, + ) + .ok()?; + Some(ComputerUseImageGlobalBounds { + left: x0.min(x1), + top: y0.min(y1), + width: (x1 - x0).abs(), + height: (y1 - y0).abs(), + }) + } } /// What the last tool `screenshot` implied for **plain** follow-up captures (no crop / no `navigate_quadrant`). @@ -599,6 +693,34 @@ struct ComputerUseSessionMutableState { /// display instead of "screen under the mouse pointer". The model /// uses this to disambiguate multi-monitor targets explicitly. preferred_display_id: Option, + /// Most-recent Set-of-Mark interactive view per pid. Used to resolve + /// `interactive_*` numeric `i` indices back to AX node indices and to + /// detect stale-view usage via `before_view_digest`. + interactive_view_cache: std::collections::HashMap, + visual_mark_cache: std::collections::HashMap, + /// Most-recent focused-window screenshot coordinate map per application + /// pid. `app_click(target: image_xy | image_grid)` must use the same + /// image basis the model saw from `get_app_state`, not whichever global + /// computer-use screenshot happened to run last. + app_pointer_maps: std::collections::HashMap, + /// Exact screenshot-id keyed coordinate maps. This is the strongest + /// addressing basis for arbitrary visual targets because it survives + /// interleaved app_state / screenshot / interactive_view calls. + screenshot_pointer_maps: std::collections::HashMap, +} + +#[derive(Debug, Clone)] +struct CachedInteractiveView { + digest: String, + /// `i` → `node_idx` map (dense, indexed by `i`). 
+ elements: Vec, +} + +#[derive(Debug, Clone)] +struct CachedVisualMarkView { + digest: String, + marks: Vec, + screenshot_id: Option, } impl ComputerUseSessionMutableState { @@ -615,6 +737,10 @@ impl ComputerUseSessionMutableState { optimizer: ComputerUseOptimizer::new(), last_mutation_kind: None, preferred_display_id: None, + interactive_view_cache: std::collections::HashMap::new(), + visual_mark_cache: std::collections::HashMap::new(), + app_pointer_maps: std::collections::HashMap::new(), + screenshot_pointer_maps: std::collections::HashMap::new(), } } @@ -681,8 +807,68 @@ impl Default for DesktopComputerUseHost { impl DesktopComputerUseHost { pub fn new() -> Self { - Self { + let host = Self { state: Mutex::new(ComputerUseSessionMutableState::new()), + }; + host.run_background_input_self_check(); + host + } + + fn next_screenshot_id() -> String { + let seq = SCREENSHOT_ID_COUNTER.fetch_add(1, Ordering::Relaxed); + let ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0); + format!("shot_{}_{}", ms, seq) + } + + /// Codex-style startup probe: log whether AX/background-input capabilities + /// are available so operators can diagnose missing permissions early. + /// + /// Behaviour parity with Codex: if the process is NOT yet + /// Accessibility-trusted, immediately call + /// `AXIsProcessTrustedWithOptions({kAXTrustedCheckOptionPrompt: true})` + /// once. macOS responds by surfacing the system-modal "允许 X 通过辅助功能 + /// 控制您的电脑" dialog (deep-linked to System Settings → Privacy & Security + /// → Accessibility). Without this call, the OS NEVER prompts and AX tree + /// reads against other apps return only the top-level window structure + /// (root window + a few descendants) — which is exactly the "shallow tree + /// / agent goes blind" symptom we observed against the BitFun WebView. 
+ fn run_background_input_self_check(&self) { + #[cfg(target_os = "macos")] + { + let bg_ok = crate::computer_use::macos_bg_input::supports_background_input(); + if bg_ok { + log::info!( + "AX-first computer use ready: AXIsProcessTrustedWithOptions=true; CGEventPostToPid background input enabled" + ); + } else { + log::warn!( + "AX-first computer use disabled: process is NOT marked Accessibility-trusted. Triggering one-shot system prompt via AXIsProcessTrustedWithOptions(prompt:true) so macOS surfaces the Accessibility permission dialog (deep-link: System Settings → Privacy & Security → Accessibility)." + ); + // Fire-and-forget. The dialog is async and modal at the macOS + // level; we do not block startup waiting for the user to + // approve. The next CU invocation will simply succeed once + // permission lands. Subsequent BitFun launches skip the + // prompt because `ax_trusted()` will already be true. + macos::request_ax_prompt(); + } + // Same idea for Screen Recording. Without it, focused-window + // screenshots fall back to a desktop-wallpaper placeholder, which + // is the second half of the "blind agent" failure mode. + if !macos::screen_capture_preflight() { + log::warn!( + "Screen Recording permission missing; window screenshots will be incomplete. Triggering CGRequestScreenCaptureAccess() to surface the system prompt." 
+ ); + let _ = macos::request_screen_capture(); + } + } + #[cfg(not(target_os = "macos"))] + { + log::info!( + "AX-first background input is macOS-only in this build; legacy screen-coordinate desktop actions remain available" + ); } } @@ -1063,6 +1249,7 @@ end tell"#]) let iw = rgb.width(); let ih = rgb.height(); Ok(ComputerScreenshot { + screenshot_id: Some(Self::next_screenshot_id()), bytes: jpeg_bytes, mime_type: "image/jpeg".to_string(), image_width: iw, @@ -1084,6 +1271,12 @@ end tell"#]) width: iw, height: ih, }), + image_global_bounds: Some(ComputerUseImageGlobalBounds { + left: display_origin_x as f64, + top: display_origin_y as f64, + width: native_w as f64, + height: native_h as f64, + }), implicit_confirmation_crop_applied: false, ui_tree_text: None, }) @@ -1571,27 +1764,6 @@ end tell"#]) .crop_center .map(|_| clamp_point_crop_half_extent(params.point_crop_half_extent_native)); - let shot = ComputerScreenshot { - bytes: jpeg_bytes, - mime_type: "image/jpeg".to_string(), - image_width: image_w, - image_height: image_h, - native_width: map_native_w, - native_height: map_native_h, - display_origin_x: map_origin_x, - display_origin_y: map_origin_y, - vision_scale, - pointer_image_x, - pointer_image_y, - screenshot_crop_center, - point_crop_half_extent_native, - navigation_native_rect: shot_navigation_rect, - quadrant_navigation_click_ready, - image_content_rect: Some(image_content_rect), - implicit_confirmation_crop_applied, - ui_tree_text, - }; - #[cfg(target_os = "macos")] let map = PointerMap { image_w, @@ -1619,6 +1791,31 @@ end tell"#]) origin_x: map_origin_x, origin_y: map_origin_y, }; + let image_global_bounds = map.image_global_bounds(); + + let screenshot_id = Self::next_screenshot_id(); + let shot = ComputerScreenshot { + screenshot_id: Some(screenshot_id), + bytes: jpeg_bytes, + mime_type: "image/jpeg".to_string(), + image_width: image_w, + image_height: image_h, + native_width: map_native_w, + native_height: map_native_h, + display_origin_x: 
map_origin_x, + display_origin_y: map_origin_y, + vision_scale, + pointer_image_x, + pointer_image_y, + screenshot_crop_center, + point_crop_half_extent_native, + navigation_native_rect: shot_navigation_rect, + quadrant_navigation_click_ready, + image_content_rect: Some(image_content_rect), + image_global_bounds, + implicit_confirmation_crop_applied, + ui_tree_text, + }; Ok((shot, map, persist_nav_focus)) } @@ -1946,7 +2143,7 @@ mod macos { F: FnOnce() -> BitFunResult + Send, T: Send, { - let work = move || catch_objc_in_main_queue(f); + let work = move || catch_only(f); unsafe { if pthread_main_np() != 0 { work() @@ -1956,20 +2153,53 @@ mod macos { } } - /// Run a closure under an Objective-C `@try/@catch` and convert any - /// `NSException` into a `BitFunError`. Used to wrap calls into AppKit / - /// HIToolbox / Accessibility APIs that may throw native exceptions which - /// would otherwise propagate as `__rust_foreign_exception` and abort the - /// process. Public so non-enigo paths (e.g. AX window-bounds lookup) can - /// share the same defense. + /// Run a closure on the main dispatch queue under an Objective-C + /// `@try/@catch`. This is the correct wrapper for calls that may reach + /// AppKit / HIToolbox / Accessibility code paths from a background + /// (`tokio::spawn_blocking`) worker thread. + /// + /// Two failure modes are defended against simultaneously: + /// + /// 1. `NSException` thrown by the framework (caught and converted into + /// `BitFunError`). + /// 2. AppKit's `__assert_rtn` "Must only be used from the main thread" + /// `SIGTRAP` which fires when AX cross-process callbacks (e.g. + /// `AXUIElementCopyActionNames` → `_NSThemeWidgetCell.accessibility…` + /// → `_WMWindow performUpdatesUsingBlock:`) are evaluated off the + /// main thread. `objc2::exception::catch` cannot intercept this + /// trap; the only fix is to actually run the closure on the main + /// thread, which is what this helper does. 
+ /// + /// If we're already on the main thread we run inline (avoids + /// `dispatch_sync(main)` deadlock). pub fn catch_objc(f: F) -> BitFunResult + where + F: FnOnce() -> BitFunResult + Send, + T: Send, + { + unsafe { + let on_main = pthread_main_np() != 0; + if on_main { + catch_only(f) + } else { + Queue::main().exec_sync(move || catch_only(f)) + } + } + } + + /// Run a closure under an Objective-C `@try/@catch` **on the current + /// thread** (no main-queue dispatch). Use this for closures that borrow + /// non-`Send` data and that are guaranteed not to reach AppKit's + /// main-thread-only AX callbacks (e.g. Vision OCR on an in-memory + /// screenshot buffer). + pub fn catch_objc_local(f: F) -> BitFunResult where F: FnOnce() -> BitFunResult, { - catch_objc_in_main_queue(f) + catch_only(f) } - fn catch_objc_in_main_queue(f: F) -> BitFunResult + fn catch_only(f: F) -> BitFunResult where F: FnOnce() -> BitFunResult, { @@ -2060,6 +2290,77 @@ impl DesktopComputerUseHost { ComputerUseHost::computer_use_after_click(self); Ok(()) } + + fn map_app_image_coords_to_pointer_f64( + &self, + pid: i32, + x: i32, + y: i32, + screenshot_id: Option<&str>, + ) -> BitFunResult<(f64, f64)> { + let map = { + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + screenshot_id + .and_then(|id| s.screenshot_pointer_maps.get(id).copied()) + .or_else(|| s.app_pointer_maps.get(&pid).copied()) + .or(s.pointer_map) + }; + let Some(map) = map else { + return Err(BitFunError::tool( + "No screenshot coordinate map is available for this app. Call desktop.get_app_state for the target app first, then use app_click image_xy/image_grid against that returned screenshot_id.".to_string(), + )); + }; + map.map_image_to_global_f64(x, y) + } + + fn image_grid_target_to_xy(target: &ClickTarget) -> BitFunResult> { + let ClickTarget::ImageGrid { + x0, + y0, + width, + height, + rows, + cols, + row, + col, + intersections, + .. 
+ } = target + else { + return Ok(None); + }; + + if *width == 0 || *height == 0 || *rows == 0 || *cols == 0 { + return Err(BitFunError::tool( + "image_grid requires positive width, height, rows, and cols.".to_string(), + )); + } + if row >= rows || col >= cols { + return Err(BitFunError::tool(format!( + "image_grid row/col out of range: row={} col={} for rows={} cols={}", + row, col, rows, cols + ))); + } + + let (fx, fy) = if *intersections { + let denom_x = cols.saturating_sub(1).max(1) as f64; + let denom_y = rows.saturating_sub(1).max(1) as f64; + ( + *x0 as f64 + (*col as f64 * width.saturating_sub(1) as f64 / denom_x), + *y0 as f64 + (*row as f64 * height.saturating_sub(1) as f64 / denom_y), + ) + } else { + ( + *x0 as f64 + ((*col as f64 + 0.5) * *width as f64 / *cols as f64), + *y0 as f64 + ((*row as f64 + 0.5) * *height as f64 / *rows as f64), + ) + }; + + Ok(Some((fx.round() as i32, fy.round() as i32))) + } } /// Draw a transient red highlight circle at `(gx, gy)` in CoreGraphics global coordinates (macOS). @@ -2103,68 +2404,265 @@ fn flash_click_highlight_cg(gx: f64, gy: f64) { }); } -#[async_trait] -impl ComputerUseHost for DesktopComputerUseHost { - async fn permission_snapshot(&self) -> BitFunResult { - Ok(tokio::task::spawn_blocking(Self::permission_sync) - .await - .map_err(|e| BitFunError::tool(e.to_string()))?) 
- } +impl DesktopComputerUseHost { + #[cfg(target_os = "macos")] + async fn screenshot_for_app_pid(&self, pid: i32) -> BitFunResult { + let window_target_rect = macos::catch_objc(|| { + crate::computer_use::macos_ax_ui::window_bounds_global_for_pid(pid) + }) + .ok() + .map(|(x, y, w, h)| (x as f64, y as f64, w as f64, h as f64)); - fn computer_use_interaction_state(&self) -> ComputerUseInteractionState { - let (last_ref, click_needs_fresh, pending_verify, last_mutation, preferred_display_id) = { - let s = self.state.lock().unwrap(); - ( - s.last_shot_refinement, - s.click_needs_fresh_screenshot, - s.pending_verify_screenshot, - s.last_mutation_kind.clone(), - s.preferred_display_id, - ) + let (cached, preferred_display_id) = { + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + (s.screenshot_cache.clone(), s.preferred_display_id) }; - let (mouse_x, mouse_y) = Self::current_mouse_position(); - let displays = Self::enumerate_displays(preferred_display_id, mouse_x, mouse_y); - let active_display_id = preferred_display_id.or_else(|| { - displays - .iter() - .find(|d| d.has_pointer) - .map(|d| d.display_id) - .or_else(|| displays.iter().find(|d| d.is_primary).map(|d| d.display_id)) - }); + let effective_pref_display_id = if let Some((wx, wy, ww, wh)) = window_target_rect { + let cx_g = wx + ww / 2.0; + let cy_g = wy + wh / 2.0; + Screen::from_point(cx_g.round() as i32, cy_g.round() as i32) + .ok() + .map(|s| s.display_info.id) + .or(preferred_display_id) + } else { + preferred_display_id + }; - let (click_ready, screenshot_kind, mut recommended_next_action) = - match last_ref { - Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => ( - !click_needs_fresh, - Some(ComputerUseInteractionScreenshotKind::RegionCrop), - None, - ), - Some(ComputerUseScreenshotRefinement::QuadrantNavigation { - click_ready, .. 
- }) if click_ready => ( - !click_needs_fresh, - Some(ComputerUseInteractionScreenshotKind::QuadrantTerminal), - None, - ), - Some(ComputerUseScreenshotRefinement::QuadrantNavigation { .. }) => ( - false, - Some(ComputerUseInteractionScreenshotKind::QuadrantDrill), - Some("screenshot_navigate_quadrant_until_click_ready".to_string()), - ), - Some(ComputerUseScreenshotRefinement::FullDisplay) => ( - !click_needs_fresh, - Some(ComputerUseInteractionScreenshotKind::FullDisplay), - if click_needs_fresh { - Some("screenshot".to_string()) - } else { - None - }, - ), - None => (false, None, Some("screenshot".to_string())), + let (rgba, screen) = + Self::resolve_screenshot_capture(cached, mouse_x, mouse_y, effective_pref_display_id)?; + let (native_w, native_h) = rgba.dimensions(); + let params = if let Some((wx, wy, ww, wh)) = window_target_rect { + let cx_g = wx + ww / 2.0; + let cy_g = wy + wh / 2.0; + let (cx, cy) = global_to_native_full_pixel_center( + cx_g, + cy_g, + native_w, + native_h, + &screen.display_info, + ); + let disp_w = screen.display_info.width as f64; + let disp_h = screen.display_info.height as f64; + let scale_x = if disp_w > 0.0 { + native_w as f64 / disp_w + } else { + 1.0 + }; + let scale_y = if disp_h > 0.0 { + native_h as f64 / disp_h + } else { + 1.0 }; + let half_native = ((ww * scale_x).max(wh * scale_y) / 2.0).ceil() as u32 + 16; + let max_half = (native_w.max(native_h) / 2).max(64); + ComputerUseScreenshotParams { + crop_center: Some(ScreenshotCropCenter { x: cx, y: cy }), + navigate_quadrant: None, + reset_navigation: false, + point_crop_half_extent_native: Some(half_native.clamp(64, max_half)), + implicit_confirmation_center: None, + crop_to_focused_window: false, + } + } else { + ComputerUseScreenshotParams::default() + }; - if pending_verify && recommended_next_action.is_none() { + { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.screenshot_cache = Some(ScreenshotCacheEntry { + rgba: 
rgba.clone(), + screen, + capture_time: Instant::now(), + }); + } + + let (shot, map, nav_out) = tokio::task::spawn_blocking(move || { + Self::screenshot_sync_tool_with_capture(params, None, rgba, screen, None, false) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + let refinement = Self::refinement_from_shot(&shot); + { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.transition_after_screenshot(map, refinement, nav_out); + s.app_pointer_maps.insert(pid, map); + if let Some(id) = shot.screenshot_id.clone() { + s.screenshot_pointer_maps.insert(id, map); + } + } + Ok(shot) + } + + /// Internal `get_app_state` that lets callers opt out of the focused-window + /// screenshot. The public trait method always passes `capture_screenshot=true` + /// (Codex parity). Internal re-snapshots from `app_click` / `app_type_text` / + /// `app_scroll` / `app_key_chord` pass `false` to avoid a redundant capture + /// — the **outer** call (e.g. the one returned to the model) gets the image. + pub(crate) async fn get_app_state_inner( + &self, + app: AppSelector, + max_depth: u32, + focus_window_only: bool, + capture_screenshot: bool, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + // Pre-flight: without Accessibility trust macOS silently truncates + // the AX subtree to the top-level window/container (~7 nodes for + // a Tauri WebView app), with no exception. The agent then has no + // actionable widgets to act on. Fail fast with a structured + // `[PERMISSION_DENIED]` error so the model can surface the issue + // (and the host's startup prompt is what produces the dialog). + if !macos::ax_trusted() { + // Re-trigger the system prompt in case the user dismissed it + // earlier — without this they have no way back to the dialog + // short of digging through System Settings manually. 
+ macos::request_ax_prompt(); + return Err(BitFunError::tool( + "[PERMISSION_DENIED] macOS Accessibility permission not granted to BitFun. \ + The system has been asked to surface the permission dialog (System Settings → \ + Privacy & Security → Accessibility → enable BitFun). After granting, retry \ + `desktop.get_app_state` and the AX tree will include all WebView subtree nodes." + .to_string(), + )); + } + let pid = resolve_pid_macos(self, &app).await?; + let mut snap = tokio::task::spawn_blocking(move || { + // Wrap in @try/@catch — AX APIs can throw NSException for + // sandboxed / partially-loaded / dying processes, and an + // unwound foreign exception aborts the whole bitfun process + // (`Rust cannot catch foreign exceptions, aborting`). + macos::catch_objc(|| { + crate::computer_use::macos_ax_dump::dump_app_ax( + pid, + crate::computer_use::macos_ax_dump::DumpOpts { + max_depth, + focus_window_only, + ..Default::default() + }, + ) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + + // Auto-attach focused-window screenshot. Failures are non-fatal — + // worst case the model still has the AX tree. 
+ if capture_screenshot { + let started = std::time::Instant::now(); + match self.screenshot_for_app_pid(pid).await { + Ok(shot) => { + debug!( + "computer_use.app_state: attached screenshot ({}x{} jpeg, {} bytes, {}ms)", + shot.image_width, + shot.image_height, + shot.bytes.len(), + started.elapsed().as_millis() + ); + snap.screenshot = Some(shot); + } + Err(e) => { + debug!( + "computer_use.app_state: screenshot capture failed (non-fatal): {}", + e + ); + } + } + } + Ok(snap) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, max_depth, focus_window_only, capture_screenshot); + Err(BitFunError::tool( + "get_app_state is only available on macOS in this build".to_string(), + )) + } + } +} + +#[cfg(target_os = "macos")] +fn require_macos_background_input() -> BitFunResult<()> { + if crate::computer_use::macos_bg_input::supports_background_input() { + return Ok(()); + } + Err(BitFunError::tool( + "[BACKGROUND_INPUT_UNAVAILABLE] macOS Accessibility permission is required for background app input. Grant BitFun in System Settings -> Privacy & Security -> Accessibility, then retry desktop.meta/capabilities or desktop.get_app_state.".to_string(), + )) +} + +#[async_trait] +impl ComputerUseHost for DesktopComputerUseHost { + async fn permission_snapshot(&self) -> BitFunResult { + Ok(tokio::task::spawn_blocking(Self::permission_sync) + .await + .map_err(|e| BitFunError::tool(e.to_string()))?) 
+ } + + fn computer_use_interaction_state(&self) -> ComputerUseInteractionState { + let (last_ref, click_needs_fresh, pending_verify, last_mutation, preferred_display_id) = { + let s = self.state.lock().unwrap(); + ( + s.last_shot_refinement, + s.click_needs_fresh_screenshot, + s.pending_verify_screenshot, + s.last_mutation_kind.clone(), + s.preferred_display_id, + ) + }; + + let (mouse_x, mouse_y) = Self::current_mouse_position(); + let displays = Self::enumerate_displays(preferred_display_id, mouse_x, mouse_y); + let active_display_id = preferred_display_id.or_else(|| { + displays + .iter() + .find(|d| d.has_pointer) + .map(|d| d.display_id) + .or_else(|| displays.iter().find(|d| d.is_primary).map(|d| d.display_id)) + }); + + let (click_ready, screenshot_kind, mut recommended_next_action) = + match last_ref { + Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::RegionCrop), + None, + ), + Some(ComputerUseScreenshotRefinement::QuadrantNavigation { + click_ready, .. + }) if click_ready => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::QuadrantTerminal), + None, + ), + Some(ComputerUseScreenshotRefinement::QuadrantNavigation { .. 
}) => ( + false, + Some(ComputerUseInteractionScreenshotKind::QuadrantDrill), + Some("screenshot_navigate_quadrant_until_click_ready".to_string()), + ), + Some(ComputerUseScreenshotRefinement::FullDisplay) => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::FullDisplay), + if click_needs_fresh { + Some("screenshot".to_string()) + } else { + None + }, + ), + None => (false, None, Some("screenshot".to_string())), + }; + + if pending_verify && recommended_next_action.is_none() { recommended_next_action = Some("screenshot".to_string()); } @@ -2371,6 +2869,9 @@ impl ComputerUseHost for DesktopComputerUseHost { .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; s.transition_after_screenshot(map, refinement, nav_out); + if let Some(id) = shot.screenshot_id.clone() { + s.screenshot_pointer_maps.insert(id, map); + } } Ok(shot) @@ -2448,7 +2949,7 @@ impl ComputerUseHost for DesktopComputerUseHost { // an empty match list instead of aborting the runtime. #[cfg(target_os = "macos")] { - macos::catch_objc(|| super::screen_ocr::find_text_matches(&shot, &query)) + macos::catch_objc_local(|| super::screen_ocr::find_text_matches(&shot, &query)) } #[cfg(not(target_os = "macos"))] { @@ -3149,4 +3650,1706 @@ tell application "System Events" to get unix id of first process whose frontmost fn focused_display_id(&self) -> Option { self.state.lock().ok().and_then(|s| s.preferred_display_id) } + + // ── Codex-style AX-first desktop automation ───────────────────────── + // + // These override the trait defaults (which return "not available") + // with real macOS implementations on macOS, and keep the defaults on + // other platforms via cfg-gating. 
+ + fn supports_background_input(&self) -> bool { + #[cfg(target_os = "macos")] + { + crate::computer_use::macos_bg_input::supports_background_input() + } + #[cfg(not(target_os = "macos"))] + { + false + } + } + + fn supports_ax_tree(&self) -> bool { + #[cfg(target_os = "macos")] + { + true + } + #[cfg(not(target_os = "macos"))] + { + false + } + } + + async fn list_apps(&self, include_hidden: bool) -> BitFunResult> { + #[cfg(target_os = "macos")] + { + tokio::task::spawn_blocking(move || { + crate::computer_use::macos_list_apps::list_running_apps(include_hidden) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))? + } + #[cfg(not(target_os = "macos"))] + { + let _ = include_hidden; + Ok(Vec::new()) + } + } + + async fn get_app_state( + &self, + app: AppSelector, + max_depth: u32, + focus_window_only: bool, + ) -> BitFunResult { + // Public path: always auto-attach a focused-window screenshot so the + // model is never blind on Canvas / WebView / WebGL surfaces that the + // AX tree can't describe (Codex parity — its `get_app_state` is the + // single "eyes" of the desktop loop). + self.get_app_state_inner(app, max_depth, focus_window_only, true) + .await + } + + async fn app_click(&self, params: AppClickParams) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &params.app).await?; + let self_pid = std::process::id() as i32; + info!( + target: "computer_use::app_click", + "app_click.enter pid={} self_pid={} same_process={} target={:?} button={} click_count={} modifier_keys={:?}", + pid, + self_pid, + pid == self_pid, + params.target, + params.mouse_button, + params.click_count, + params.modifier_keys + ); + // Try AX press path when the target is a node idx and the cache + // still holds a live ref; otherwise inject background events at + // the resolved global coordinate.
+ let ax_ok = match &params.target { + ClickTarget::NodeIdx { idx } => { + let idx = *idx; + // Run AX lookup + AXPress under @try/@catch on a blocking + // thread; either a missing ref or a thrown NSException + // simply degrades to the bg_click fallback below. + tokio::task::spawn_blocking(move || { + macos::catch_objc(|| { + Ok( + if let Some(r) = + crate::computer_use::macos_ax_dump::cached_ref_loose(pid, idx) + { + matches!( + crate::computer_use::macos_ax_write::try_ax_press(r), + crate::computer_use::macos_ax_write::AxWriteOutcome::Ok + ) + } else { + false + }, + ) + }) + .unwrap_or(false) + }) + .await + .unwrap_or(false) + } + ClickTarget::ScreenXy { .. } + | ClickTarget::ImageXy { .. } + | ClickTarget::ImageGrid { .. } + | ClickTarget::VisualGrid { .. } + | ClickTarget::OcrText { .. } => false, + }; + if !ax_ok { + require_macos_background_input()?; + let (x, y): (f64, f64) = match &params.target { + ClickTarget::ScreenXy { x, y } => (*x, *y), + ClickTarget::ImageXy { + x, + y, + screenshot_id, + } => self.map_app_image_coords_to_pointer_f64( + pid, + *x, + *y, + screenshot_id.as_deref(), + )?, + ClickTarget::ImageGrid { screenshot_id, .. } => { + let (ix, iy) = + Self::image_grid_target_to_xy(&params.target)?.ok_or_else(|| { + BitFunError::tool("invalid image_grid target".to_string()) + })?; + self.map_app_image_coords_to_pointer_f64( + pid, + ix, + iy, + screenshot_id.as_deref(), + )?
+ } + ClickTarget::VisualGrid { + rows, + cols, + row, + col, + intersections, + wait_ms_after_detection, + } => { + let shot = self.screenshot_for_app_pid(pid).await?; + let (x0, y0, width, height) = + detect_regular_grid_rect_from_screenshot(&shot, *rows, *cols)?; + let target = ClickTarget::ImageGrid { + x0, + y0, + width, + height, + rows: *rows, + cols: *cols, + row: *row, + col: *col, + intersections: *intersections, + screenshot_id: shot.screenshot_id.clone(), + }; + let (ix, iy) = Self::image_grid_target_to_xy(&target)?.ok_or_else(|| { + BitFunError::tool("invalid detected visual_grid target".to_string()) + })?; + if let Some(wait) = wait_ms_after_detection { + if *wait > 0 { + tokio::time::sleep(Duration::from_millis(*wait as u64)).await; + } + } + self.map_app_image_coords_to_pointer_f64( + pid, + ix, + iy, + shot.screenshot_id.as_deref(), + )? + } + ClickTarget::NodeIdx { idx } => { + // Best-effort: re-snapshot to read the node's frame. + // Skip the screenshot — this snapshot is internal-only; + // the post-click re-snapshot below is the one returned + // to the model and carries the visual evidence. + let snap = self + .get_app_state_inner(params.app.clone(), 32, false, false) + .await?; + let node = snap.nodes.iter().find(|n| n.idx == *idx).ok_or_else(|| { + BitFunError::tool(format!( + "AX_NODE_STALE: idx={} no longer present in app state", + idx + )) + })?; + // Refuse to fall back to (0,0) on the desktop — + // that would silently click the menu bar / Finder + // icon. The caller must re-snapshot to acquire a + // node with a real on-screen frame. 
+ let (fx, fy, fw, fh) = node.frame_global.ok_or_else(|| { + BitFunError::tool(format!( + "AX_NODE_STALE: idx={} has no AXFrame (likely off-screen or window minimised)", + idx + )) + })?; + if fw <= 0.0 || fh <= 0.0 { + return Err(BitFunError::tool(format!( + "AX_NODE_STALE: idx={} has zero-size frame ({}x{})", + idx, fw, fh + ))); + } + (fx + fw / 2.0, fy + fh / 2.0) + } + ClickTarget::OcrText { needle } => { + // Codex parity: when the AX tree doesn't expose the + // target widget (Canvas, WebGL, custom-drawn cell), + // fall back to OCR-on-screenshot. We screenshot the + // whole screen rather than just the target window + // because window-relative regions need extra plumbing + // and the matcher already filters by confidence. + let matches = self.ocr_find_text_matches(needle, None).await?; + let best = matches.into_iter().max_by(|a, b| { + a.confidence + .partial_cmp(&b.confidence) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let m = best.ok_or_else(|| { + BitFunError::tool(format!( + "NOT_FOUND: no OCR match for needle {:?}", + needle + )) + })?; + (m.center_x, m.center_y) + } + }; + let mods: Vec = params + .modifier_keys + .iter() + .filter_map(|m| crate::computer_use::macos_bg_input::BgModifier::from_str(m)) + .collect(); + let btn = match params.mouse_button.as_str() { + "right" => crate::computer_use::macos_bg_input::BgMouseButton::Right, + "middle" => crate::computer_use::macos_bg_input::BgMouseButton::Middle, + _ => crate::computer_use::macos_bg_input::BgMouseButton::Left, + }; + let cnt = params.click_count.max(1) as u32; + info!( + target: "computer_use::app_click", + "app_click.bg_dispatch pid={} self_pid={} same_process={} resolved_x={:.2} resolved_y={:.2} click_count={}", + pid, self_pid, pid == self_pid, x, y, cnt + ); + + // Capture pre-click digest so we can detect "click delivered + // but UI did not change" and apply a foreground fallback when + // the target lives in our own process (the most common cause + // of `bg_click → WKWebView 
no-op` in single-process Tauri). + let pre_digest_opt = match self + .get_app_state_inner(params.app.clone(), 0, false, false) + .await + { + Ok(s) => Some(s.digest), + Err(e) => { + debug!( + target: "computer_use::app_click", + "pre_digest_unavailable error={}", + e + ); + None + } + }; + + // Best-effort foreground activation — required for WKWebView + // and many Cocoa hit-testers to actually deliver our + // synthetic events. No-op (returns false) when the pid is + // already frontmost. + let activate_pid = pid; + let _ = tokio::task::spawn_blocking(move || { + macos::catch_objc(|| { + crate::computer_use::macos_bg_input::activate_pid_macos(activate_pid) + }) + }) + .await; + + let mods_for_bg = mods.clone(); + tokio::task::spawn_blocking(move || { + macos::catch_objc(|| { + crate::computer_use::macos_bg_input::bg_click( + pid, + (x, y), + btn, + cnt, + &mods_for_bg, + ) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + + // Same-process fallback: if `bg_click` left the digest + // unchanged AND the target is our own process (bitfun-desktop + // hosting an embedded mini-app WebView), retry with the + // foreground click path. This trades a momentary cursor + // movement for actually landing the click in the WebView. + if pid == self_pid { + let settle = params.wait_ms_after.unwrap_or(120).min(5_000); + tokio::time::sleep(Duration::from_millis(settle.max(80) as u64)).await; + let post_digest_opt = self + .get_app_state_inner(params.app.clone(), 0, false, false) + .await + .ok() + .map(|s| s.digest); + let unchanged = + matches!((&pre_digest_opt, &post_digest_opt), (Some(a), Some(b)) if a == b); + if unchanged { + warn!( + target: "computer_use::app_click", + "bg_click_no_effect_self_pid_falling_back_to_foreground pid={} x={:.2} y={:.2} digest={:?}", + pid, x, y, post_digest_opt + ); + // Foreground fallback uses the user's real cursor + + // synthetic enigo click so the WKWebView's hit-test + // path is identical to a human click. 
+ let btn_str = match btn { + crate::computer_use::macos_bg_input::BgMouseButton::Right => "right", + crate::computer_use::macos_bg_input::BgMouseButton::Middle => "middle", + _ => "left", + }; + self.mouse_move_global_f64(x, y).await?; + for _ in 0..cnt { + self.mouse_click_authoritative(btn_str).await?; + } + } + } + } + let settle_ms = params.wait_ms_after.unwrap_or(120).min(5_000); + if settle_ms > 0 { + tokio::time::sleep(Duration::from_millis(settle_ms as u64)).await; + } + // Re-snapshot so the caller can see the new state + new digest. + self.get_app_state(params.app, 32, false).await + } + #[cfg(not(target_os = "macos"))] + { + let _ = params; + Err(BitFunError::tool( + "app_click is only available on macOS in this build".to_string(), + )) + } + } + + async fn app_type_text( + &self, + app: AppSelector, + text: &str, + focus: Option, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &app).await?; + // If a focus target is provided, click it first to give focus. 
+ if let Some(target) = focus { + let click = AppClickParams { + app: app.clone(), + target, + click_count: 1, + mouse_button: "left".to_string(), + modifier_keys: vec![], + wait_ms_after: None, + }; + let _ = self.app_click(click).await?; + } + require_macos_background_input()?; + info!( + target: "computer_use::app_type_text", + "app_type_text.bg_dispatch pid={} char_count={}", + pid, + text.chars().count() + ); + let activate_pid = pid; + let _ = tokio::task::spawn_blocking(move || { + macos::catch_objc(|| { + crate::computer_use::macos_bg_input::activate_pid_macos(activate_pid) + }) + }) + .await; + let txt = text.to_string(); + tokio::task::spawn_blocking(move || { + macos::catch_objc(|| crate::computer_use::macos_bg_input::bg_type_text(pid, &txt)) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + self.get_app_state(app, 32, false).await + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, text, focus); + Err(BitFunError::tool( + "app_type_text is only available on macOS in this build".to_string(), + )) + } + } + + async fn app_scroll( + &self, + app: AppSelector, + focus: Option, + dx: i32, + dy: i32, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &app).await?; + if let Some(target) = focus { + let click = AppClickParams { + app: app.clone(), + target, + click_count: 1, + mouse_button: "left".to_string(), + modifier_keys: vec![], + wait_ms_after: None, + }; + let _ = self.app_click(click).await?; + } + require_macos_background_input()?; + tokio::task::spawn_blocking(move || { + macos::catch_objc(|| crate::computer_use::macos_bg_input::bg_scroll(pid, dx, dy)) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + self.get_app_state(app, 32, false).await + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, focus, dx, dy); + Err(BitFunError::tool( + "app_scroll is only available on macOS in this build".to_string(), + )) + } + } + + async fn app_key_chord( + &self, + app: 
AppSelector, + keys: Vec, + focus_idx: Option, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &app).await?; + if let Some(idx) = focus_idx { + let click = AppClickParams { + app: app.clone(), + target: ClickTarget::NodeIdx { idx }, + click_count: 1, + mouse_button: "left".to_string(), + modifier_keys: vec![], + wait_ms_after: None, + }; + let _ = self.app_click(click).await?; + } + require_macos_background_input()?; + tokio::task::spawn_blocking(move || -> BitFunResult<()> { + macos::catch_objc(|| { + let (mods, kc) = + crate::computer_use::macos_bg_input::parse_key_sequence(&keys)?; + crate::computer_use::macos_bg_input::bg_key_chord(pid, &mods, kc)?; + Ok(()) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + self.get_app_state(app, 32, false).await + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, keys, focus_idx); + Err(BitFunError::tool( + "app_key_chord is only available on macOS in this build".to_string(), + )) + } + } + + async fn app_wait_for( + &self, + app: AppSelector, + pred: AppWaitPredicate, + timeout_ms: u32, + poll_ms: u32, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let deadline = Instant::now() + Duration::from_millis(timeout_ms as u64); + let poll = Duration::from_millis(poll_ms.max(50) as u64); + // Polling loop — skip the screenshot per iteration to keep + // poll latency tight; the snapshot we ultimately return gets + // an auto-attached screenshot below. 
+ let baseline = self + .get_app_state_inner(app.clone(), 32, false, false) + .await?; + loop { + let snap = self + .get_app_state_inner(app.clone(), 32, false, false) + .await?; + let ok = match &pred { + AppWaitPredicate::DigestChanged { prev_digest } => { + snap.digest != *prev_digest && snap.digest != baseline.digest + } + AppWaitPredicate::TitleContains { needle } => snap + .window_title + .as_deref() + .map(|t| t.contains(needle.as_str())) + .unwrap_or(false), + AppWaitPredicate::RoleEnabled { role } => snap + .nodes + .iter() + .any(|n| n.role.as_str() == role && n.enabled), + AppWaitPredicate::NodeEnabled { idx } => snap + .nodes + .iter() + .find(|n| n.idx == *idx) + .map(|n| n.enabled) + .unwrap_or(false), + }; + if ok || Instant::now() >= deadline { + // Final returned snap — auto-attach screenshot for parity + // with the rest of the `app_*` family. + let mut snap = snap; + if let Ok(pid) = resolve_pid_macos(self, &app).await { + if let Ok(shot) = self.screenshot_for_app_pid(pid).await { + snap.screenshot = Some(shot); + } + } + if snap.screenshot.is_none() { + if let Ok(shot) = self.screenshot_peek_full_display().await { + snap.screenshot = Some(shot); + } + } + return Ok(snap); + } + tokio::time::sleep(poll).await; + } + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, pred, timeout_ms, poll_ms); + Err(BitFunError::tool( + "app_wait_for is only available on macOS in this build".to_string(), + )) + } + } + + fn supports_interactive_view(&self) -> bool { + cfg!(target_os = "macos") + } + + fn supports_visual_mark_view(&self) -> bool { + cfg!(target_os = "macos") + } + + async fn build_interactive_view( + &self, + app: AppSelector, + opts: InteractiveViewOpts, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &app).await?; + let snap = self + .get_app_state_inner(app.clone(), 64, opts.focus_window_only, true) + .await?; + let max_elements = opts + .max_elements + .map(|n| n as usize) + .unwrap_or(80) + 
.clamp(1, 200); + let filter_opts = crate::computer_use::interactive_filter::FilterOpts { + max_elements, + clip_to_image_bounds: opts.focus_window_only, + }; + let elements = crate::computer_use::interactive_filter::build_interactive_elements( + &snap.nodes, + snap.screenshot.as_ref(), + &filter_opts, + ); + let tree_text = if opts.include_tree_text { + crate::computer_use::interactive_filter::render_element_tree_text(&elements) + } else { + String::new() + }; + let digest = compute_interactive_view_digest(&elements); + + let mut screenshot_out: Option = None; + if opts.annotate_screenshot { + if let Some(shot) = snap.screenshot.as_ref() { + match crate::computer_use::som_overlay::render_overlay( + &shot.bytes, + &elements, + Some(80), + ) { + Ok(jpeg) => { + let mut out = shot.clone(); + out.bytes = jpeg; + out.mime_type = "image/jpeg".to_string(); + screenshot_out = Some(out); + } + Err(e) => { + warn!( + target: "computer_use::interactive_view", + "som_overlay render failed (non-fatal): {}", + e + ); + screenshot_out = Some(shot.clone()); + } + } + } + } else { + screenshot_out = snap.screenshot.clone(); + } + + let captured_at_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or_default(); + + let view = InteractiveView { + app: snap.app.clone(), + window_title: snap.window_title.clone(), + elements: elements.clone(), + tree_text, + digest: digest.clone(), + captured_at_ms, + screenshot: screenshot_out, + loop_warning: snap.loop_warning.clone(), + }; + + // Cache for subsequent `interactive_*` calls. 
+ { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.interactive_view_cache.insert( + pid, + CachedInteractiveView { + digest: digest.clone(), + elements, + }, + ); + } + Ok(view) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, opts); + Err(BitFunError::tool( + "build_interactive_view is only available on macOS in this build".to_string(), + )) + } + } + + async fn interactive_click( + &self, + app: AppSelector, + params: InteractiveClickParams, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + // Resolve `i → node_idx` against the cached interactive view. + // On `STALE_INTERACTIVE_VIEW` we transparently rebuild the + // view ONCE and retry — this turns the most common UI-changed + // failure into an internal recovery instead of a hard error + // the model has to handle. Idempotency is preserved by + // capping at one rebuild + one retry. + let mut auto_rebuilt = false; + let node_idx = match self + .resolve_interactive_index(&app, params.i, params.before_view_digest.as_deref()) + .await + { + Ok(idx) => idx, + Err(err) if is_stale_interactive_view_error(&err) => { + warn!( + target: "computer_use::interactive_view", + "interactive_click: STALE view detected, rebuilding once and retrying (i={}): {}", + params.i, err + ); + let rebuilt = self + .build_interactive_view(app.clone(), InteractiveViewOpts::default()) + .await?; + if rebuilt.elements.iter().any(|e| e.i == params.i) { + auto_rebuilt = true; + // Use the rebuilt view's digest, not the stale one + // the caller passed in. + self.resolve_interactive_index(&app, params.i, Some(&rebuilt.digest)) + .await? 
+ } else { + return Err(BitFunError::tool(format!( + "INTERACTIVE_INDEX_OUT_OF_RANGE: i={} not in rebuilt view (len={}); the UI has changed under you, re-call `build_interactive_view` and pick a fresh `i`", + params.i, + rebuilt.elements.len() + ))); + } + } + Err(other) => return Err(other), + }; + + // Look up the cached element's image-pixel center as a + // pointer fallback. Always available when `frame_image` was + // populated at view-build time; covers Electron / Canvas / + // custom-drawn widgets that AXPress can't dispatch into. + let pointer_fallback_image_xy: Option<(i32, i32)> = + self.cached_interactive_image_center(&app, params.i).await; + + // Primary path: AX-targeted click via `app_click`. On + // failure, fall back to a pointer click at the element's + // image-pixel center if we have one. + let click_res = self + .app_click(AppClickParams { + app: app.clone(), + target: ClickTarget::NodeIdx { idx: node_idx }, + click_count: params.click_count.max(1), + mouse_button: params.mouse_button.clone(), + modifier_keys: params.modifier_keys.clone(), + wait_ms_after: params.wait_ms_after, + }) + .await; + + let (snapshot, fallback_used) = match click_res { + Ok(s) => (s, false), + Err(e) if pointer_fallback_image_xy.is_some() => { + let (ix, iy) = pointer_fallback_image_xy.unwrap(); + warn!( + target: "computer_use::interactive_view", + "interactive_click: AX path failed, falling back to image_xy=({},{}): {}", + ix, iy, e + ); + let s = self + .app_click(AppClickParams { + app: app.clone(), + target: ClickTarget::ImageXy { + x: ix, + y: iy, + screenshot_id: None, + }, + click_count: params.click_count.max(1), + mouse_button: params.mouse_button.clone(), + modifier_keys: params.modifier_keys.clone(), + wait_ms_after: params.wait_ms_after, + }) + .await?; + (s, true) + } + Err(e) => return Err(e), + }; + + let view = if params.return_view { + Some( + self.build_interactive_view(app, InteractiveViewOpts::default()) + .await?, + ) + } else { + None + }; + let 
mut note = format!("index_resolved_via_node_idx({})", node_idx); + if auto_rebuilt { + note.push_str(",auto_rebuilt_view_after_stale"); + } + if fallback_used { + note.push_str(",fallback_image_xy"); + } + Ok(InteractiveActionResult { + snapshot, + view, + execution_note: Some(note), + }) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, params); + Err(BitFunError::tool( + "interactive_click is only available on macOS in this build".to_string(), + )) + } + } + + async fn build_visual_mark_view( + &self, + app: AppSelector, + opts: VisualMarkViewOpts, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let pid = resolve_pid_macos(self, &app).await?; + let mut snap = self + .get_app_state_inner(app.clone(), 16, true, true) + .await?; + if snap.screenshot.is_none() { + if let Ok(shot) = self.screenshot_for_app_pid(pid).await { + snap.screenshot = Some(shot); + } + } + let shot = snap.screenshot.as_ref().ok_or_else(|| { + BitFunError::tool( + "build_visual_mark_view: app screenshot unavailable; grant Screen Recording permission and retry".to_string(), + ) + })?; + + let marks = build_regular_visual_marks(shot, &opts)?; + let digest = compute_visual_mark_view_digest(&marks, shot.screenshot_id.as_deref()); + + let mut screenshot_out: Option = Some(shot.clone()); + if opts.include_grid && !marks.is_empty() { + let overlay_elements = visual_marks_to_overlay_elements(&marks); + match crate::computer_use::som_overlay::render_overlay( + &shot.bytes, + &overlay_elements, + Some(82), + ) { + Ok(jpeg) => { + let mut out = shot.clone(); + out.bytes = jpeg; + out.mime_type = "image/jpeg".to_string(); + screenshot_out = Some(out); + } + Err(e) => { + warn!( + target: "computer_use::visual_mark_view", + "visual mark overlay render failed (non-fatal): {}", + e + ); + } + } + } + + let captured_at_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or_default(); + let view = VisualMarkView { + app: 
snap.app.clone(), + window_title: snap.window_title.clone(), + marks: marks.clone(), + digest: digest.clone(), + captured_at_ms, + screenshot: screenshot_out, + }; + { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.visual_mark_cache.insert( + pid, + CachedVisualMarkView { + digest, + marks, + screenshot_id: shot.screenshot_id.clone(), + }, + ); + } + Ok(view) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, opts); + Err(BitFunError::tool( + "build_visual_mark_view is only available on macOS in this build".to_string(), + )) + } + } + + async fn visual_click( + &self, + app: AppSelector, + params: VisualClickParams, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let mut auto_rebuilt = false; + let mark = match self + .resolve_visual_mark(&app, params.i, params.before_view_digest.as_deref()) + .await + { + Ok(mark) => mark, + Err(err) if is_stale_visual_mark_view_error(&err) => { + warn!( + target: "computer_use::visual_mark_view", + "visual_click: STALE visual mark view detected, rebuilding once and retrying (i={}): {}", + params.i, err + ); + let rebuilt = self + .build_visual_mark_view(app.clone(), VisualMarkViewOpts::default()) + .await?; + let Some(mark) = rebuilt.marks.iter().find(|m| m.i == params.i).cloned() else { + return Err(BitFunError::tool(format!( + "VISUAL_INDEX_OUT_OF_RANGE: i={} not in rebuilt view (len={}); re-call `build_visual_mark_view` and pick a fresh `i`", + params.i, + rebuilt.marks.len() + ))); + }; + auto_rebuilt = true; + mark + } + Err(other) => return Err(other), + }; + + let screenshot_id = { + let pid = resolve_pid_macos(self, &app).await?; + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.visual_mark_cache + .get(&pid) + .and_then(|cached| cached.screenshot_id.clone()) + }; + + let snapshot = self + .app_click(AppClickParams { + app: app.clone(), + target: ClickTarget::ImageXy { + x: mark.x, + y: mark.y, + 
screenshot_id, + }, + click_count: params.click_count.max(1), + mouse_button: params.mouse_button.clone(), + modifier_keys: params.modifier_keys.clone(), + wait_ms_after: params.wait_ms_after, + }) + .await?; + + let view = if params.return_view { + Some( + self.build_visual_mark_view(app, VisualMarkViewOpts::default()) + .await?, + ) + } else { + None + }; + let mut note = format!("visual_mark_image_xy({},{})", mark.x, mark.y); + if auto_rebuilt { + note.push_str(",auto_rebuilt_view_after_stale"); + } + Ok(VisualActionResult { + snapshot, + view, + execution_note: Some(note), + }) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, params); + Err(BitFunError::tool( + "visual_click is only available on macOS in this build".to_string(), + )) + } + } + + async fn interactive_type_text( + &self, + app: AppSelector, + params: InteractiveTypeTextParams, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let focus = if let Some(i) = params.i { + let node_idx = self + .resolve_interactive_index(&app, i, params.before_view_digest.as_deref()) + .await?; + Some(ClickTarget::NodeIdx { idx: node_idx }) + } else { + None + }; + + if params.clear_first { + if let Some(target) = focus.clone() { + let _ = self + .app_click(AppClickParams { + app: app.clone(), + target, + click_count: 1, + mouse_button: "left".to_string(), + modifier_keys: vec![], + wait_ms_after: Some(60), + }) + .await?; + } + let pid = resolve_pid_macos(self, &app).await?; + tokio::task::spawn_blocking(move || -> BitFunResult<()> { + macos::catch_objc(|| { + let (m1, k1) = crate::computer_use::macos_bg_input::parse_key_sequence(&[ + "cmd".to_string(), + "a".to_string(), + ])?; + crate::computer_use::macos_bg_input::bg_key_chord(pid, &m1, k1)?; + let (m2, k2) = crate::computer_use::macos_bg_input::parse_key_sequence(&[ + "delete".to_string(), + ])?; + crate::computer_use::macos_bg_input::bg_key_chord(pid, &m2, k2)?; + Ok(()) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + } 
+ + let snapshot = self.app_type_text(app.clone(), ¶ms.text, focus).await?; + + if params.press_enter_after { + let pid = resolve_pid_macos(self, &app).await?; + tokio::task::spawn_blocking(move || -> BitFunResult<()> { + macos::catch_objc(|| { + let (m, k) = crate::computer_use::macos_bg_input::parse_key_sequence(&[ + "return".to_string(), + ])?; + crate::computer_use::macos_bg_input::bg_key_chord(pid, &m, k)?; + Ok(()) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + } + + if let Some(wait) = params.wait_ms_after { + tokio::time::sleep(Duration::from_millis(wait.min(5_000) as u64)).await; + } + + let view = if params.return_view { + Some( + self.build_interactive_view(app, InteractiveViewOpts::default()) + .await?, + ) + } else { + None + }; + Ok(InteractiveActionResult { + snapshot, + view, + execution_note: Some("ax_focus_then_bg_type_text".to_string()), + }) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, params); + Err(BitFunError::tool( + "interactive_type_text is only available on macOS in this build".to_string(), + )) + } + } + + async fn interactive_scroll( + &self, + app: AppSelector, + params: InteractiveScrollParams, + ) -> BitFunResult { + #[cfg(target_os = "macos")] + { + let focus = if let Some(i) = params.i { + let node_idx = self + .resolve_interactive_index(&app, i, params.before_view_digest.as_deref()) + .await?; + Some(ClickTarget::NodeIdx { idx: node_idx }) + } else { + None + }; + let snapshot = self + .app_scroll(app.clone(), focus, params.dx, params.dy) + .await?; + if let Some(wait) = params.wait_ms_after { + tokio::time::sleep(Duration::from_millis(wait.min(5_000) as u64)).await; + } + let view = if params.return_view { + Some( + self.build_interactive_view(app, InteractiveViewOpts::default()) + .await?, + ) + } else { + None + }; + Ok(InteractiveActionResult { + snapshot, + view, + execution_note: Some("app_scroll".to_string()), + }) + } + #[cfg(not(target_os = "macos"))] + { + let _ = (app, params); + 
Err(BitFunError::tool( + "interactive_scroll is only available on macOS in this build".to_string(), + )) + } + } +} + +/// Resolve an `AppSelector` to a concrete `pid` on macOS. Resolution +/// precedence (Codex parity): `pid > bundle_id > name`. +#[cfg(target_os = "macos")] +async fn resolve_pid_macos(host: &DesktopComputerUseHost, app: &AppSelector) -> BitFunResult { + if let Some(pid) = app.pid { + return Ok(pid); + } + let apps = host.list_apps(true).await?; + if let Some(bid) = app.bundle_id.as_deref() { + let needle = bid.to_lowercase(); + if let Some(p) = apps + .iter() + .find(|a| { + a.bundle_id + .as_deref() + .map(|s| s.to_lowercase() == needle) + .unwrap_or(false) + }) + .and_then(|a| a.pid) + { + return Ok(p); + } + } + if let Some(name) = app.name.as_deref() { + let needle = name.to_lowercase(); + // 1) Exact match against the localized application name (what the + // Dock / Spotlight shows, e.g. "BitFun"). + if let Some(p) = apps + .iter() + .find(|a| a.name.to_lowercase() == needle) + .and_then(|a| a.pid) + { + return Ok(p); + } + // 2) Exact match against the bundle id's last segment (e.g. user + // asks for "BitFun" but `list_apps` returned name="bitfun-desktop" + // with bundle_id="ai.bitfun.desktop"). This keeps us aligned with + // Codex, which is robust to "Cursor" vs "com.todesktop....Cursor". + if let Some(p) = apps + .iter() + .find(|a| { + a.bundle_id + .as_deref() + .and_then(|b| b.rsplit('.').next()) + .map(|seg| seg.to_lowercase() == needle) + .unwrap_or(false) + }) + .and_then(|a| a.pid) + { + return Ok(p); + } + // 3) Substring match on either `name` or `bundle_id` (case- + // insensitive). Pick the shortest matching name to avoid + // accidentally targeting "Visual Studio Code Helper (GPU)". 
+ let mut candidates: Vec<&AppInfo> = apps + .iter() + .filter(|a| { + a.name.to_lowercase().contains(&needle) + || a.bundle_id + .as_deref() + .map(|b| b.to_lowercase().contains(&needle)) + .unwrap_or(false) + }) + .collect(); + candidates.sort_by_key(|a| a.name.len()); + if let Some(p) = candidates.first().and_then(|a| a.pid) { + return Ok(p); + } + } + Err(BitFunError::tool(format!("APP_NOT_FOUND: {:?}", app))) +} + +/// Stable lowercase-hex SHA1 over a *layout-only* canonical payload: +/// `i|node_idx|role|subrole|x_bucket,y_bucket,w_bucket,h_bucket`. +/// +/// Deliberately omits `label` (textfield value, focused selection, live +/// counters etc. would otherwise turn every keystroke into a STALE error) +/// and snaps coordinates to an 8-pt grid so a 1-pixel re-layout from a +/// scrollbar appearing / IME bar resizing doesn't invalidate the cached +/// view either. The digest is meant to detect *structural* changes +/// (elements appeared, disappeared, or moved noticeably), not cosmetic +/// noise. 
+fn compute_interactive_view_digest( + elements: &[bitfun_core::agentic::tools::computer_use_host::InteractiveElement], +) -> String { + use sha1::{Digest, Sha1}; + const BUCKET: f64 = 8.0; + let mut hasher = Sha1::new(); + for e in elements { + let subrole = e.subrole.as_deref().unwrap_or(""); + let (x, y, w, h) = e.frame_global.unwrap_or((0.0, 0.0, 0.0, 0.0)); + let xb = (x / BUCKET).floor() as i64; + let yb = (y / BUCKET).floor() as i64; + let wb = (w / BUCKET).round().max(1.0) as i64; + let hb = (h / BUCKET).round().max(1.0) as i64; + let line = format!( + "{}|{}|{}|{}|{},{},{},{}\n", + e.i, e.node_idx, e.role, subrole, xb, yb, wb, hb, + ); + hasher.update(line.as_bytes()); + } + let bytes = hasher.finalize(); + let mut out = String::with_capacity(bytes.len() * 2); + for b in bytes.iter() { + out.push_str(&format!("{:02x}", b)); + } + out +} + +fn compute_visual_mark_view_digest(marks: &[VisualMark], screenshot_id: Option<&str>) -> String { + use sha1::{Digest, Sha1}; + let mut hasher = Sha1::new(); + hasher.update(screenshot_id.unwrap_or("").as_bytes()); + hasher.update(b"\n"); + for mark in marks { + let frame = mark.frame_image.unwrap_or((0, 0, 0, 0)); + let line = format!( + "{}|{}|{}|{},{},{},{}\n", + mark.i, mark.x, mark.y, frame.0, frame.1, frame.2, frame.3 + ); + hasher.update(line.as_bytes()); + } + let bytes = hasher.finalize(); + let mut out = String::with_capacity(bytes.len() * 2); + for b in bytes.iter() { + out.push_str(&format!("{:02x}", b)); + } + out +} + +fn build_regular_visual_marks( + shot: &ComputerScreenshot, + opts: &VisualMarkViewOpts, +) -> BitFunResult> { + if !opts.include_grid { + return Ok(Vec::new()); + } + + let image_w = shot.image_width.max(1); + let image_h = shot.image_height.max(1); + let (mut x0, mut y0, mut width, mut height) = if let Some(region) = opts.region.as_ref() { + (region.x0, region.y0, region.width, region.height) + } else if let Some(rect) = shot.image_content_rect.as_ref() { + (rect.left, rect.top, rect.width, 
rect.height) + } else { + (0, 0, image_w, image_h) + }; + + x0 = x0.min(image_w.saturating_sub(1)); + y0 = y0.min(image_h.saturating_sub(1)); + width = width.min(image_w.saturating_sub(x0)).max(1); + height = height.min(image_h.saturating_sub(y0)).max(1); + + let max_points = opts.max_points.unwrap_or(64).clamp(4, 196); + let aspect = (width as f64 / height.max(1) as f64).clamp(0.25, 4.0); + let mut cols = ((max_points as f64 * aspect).sqrt().ceil() as u32).clamp(2, max_points); + let mut rows = ((max_points as f64) / cols as f64).ceil() as u32; + rows = rows.max(2); + while rows.saturating_mul(cols) > max_points && rows > 2 { + rows -= 1; + } + while rows.saturating_mul(cols) > max_points && cols > 2 { + cols -= 1; + } + + let mut marks = Vec::with_capacity(rows.saturating_mul(cols) as usize); + for row in 0..rows { + for col in 0..cols { + if marks.len() >= max_points as usize { + break; + } + let x = x0 as f64 + ((col as f64 + 0.5) * width as f64 / cols as f64); + let y = y0 as f64 + ((row as f64 + 0.5) * height as f64 / rows as f64); + let x = x.round().clamp(0.0, image_w.saturating_sub(1) as f64) as i32; + let y = y.round().clamp(0.0, image_h.saturating_sub(1) as f64) as i32; + let box_size_i32 = if width.min(height) < 180 { 18 } else { 24 }; + let half = box_size_i32 / 2; + let fx = (x - half).max(0) as u32; + let fy = (y - half).max(0) as u32; + let box_size = box_size_i32 as u32; + let fw = box_size.min(image_w.saturating_sub(fx)).max(1); + let fh = box_size.min(image_h.saturating_sub(fy)).max(1); + marks.push(VisualMark { + i: marks.len() as u32, + x, + y, + frame_image: Some((fx, fy, fw, fh)), + label: None, + }); + } + } + + if marks.is_empty() { + return Err(BitFunError::tool( + "build_visual_mark_view: no visual marks generated for the requested region" + .to_string(), + )); + } + Ok(marks) +} + +fn visual_marks_to_overlay_elements( + marks: &[VisualMark], +) -> Vec { + marks + .iter() + .map( + |mark| 
bitfun_core::agentic::tools::computer_use_host::InteractiveElement { + i: mark.i, + node_idx: mark.i, + role: "VisualMark".to_string(), + subrole: None, + label: mark.label.clone(), + frame_image: mark.frame_image, + frame_global: None, + enabled: true, + focused: false, + ax_actionable: false, + }, + ) + .collect() +} + +fn detect_regular_grid_rect_from_screenshot( + shot: &ComputerScreenshot, + rows: u32, + cols: u32, +) -> BitFunResult<(i32, i32, u32, u32)> { + if rows < 2 || cols < 2 { + return Err(BitFunError::tool( + "visual_grid requires rows and cols >= 2".to_string(), + )); + } + + let img = image::load_from_memory(&shot.bytes) + .map_err(|e| BitFunError::tool(format!("visual_grid: decode screenshot failed: {e}")))? + .to_rgb8(); + let (image_w, image_h) = img.dimensions(); + let (left, top, width, height) = shot + .image_content_rect + .as_ref() + .map(|r| (r.left, r.top, r.width, r.height)) + .unwrap_or((0, 0, image_w, image_h)); + let right = left.saturating_add(width).min(image_w); + let bottom = top.saturating_add(height).min(image_h); + if right <= left + 8 || bottom <= top + 8 { + return Err(BitFunError::tool( + "visual_grid: screenshot content rect is too small".to_string(), + )); + } + + let vertical = projection_darkness(&img, left, top, right, bottom, true); + let horizontal = projection_darkness(&img, left, top, right, bottom, false); + let x_seq = detect_regular_line_sequence(&vertical, cols, left)?; + let y_seq = detect_regular_line_sequence(&horizontal, rows, top)?; + let x0 = *x_seq.first().unwrap_or(&left); + let x1 = *x_seq.last().unwrap_or(&right.saturating_sub(1)); + let y0 = *y_seq.first().unwrap_or(&top); + let y1 = *y_seq.last().unwrap_or(&bottom.saturating_sub(1)); + let w = x1.saturating_sub(x0).saturating_add(1).max(2); + let h = y1.saturating_sub(y0).saturating_add(1).max(2); + + let aspect = w as f64 / h.max(1) as f64; + if !(0.5..=2.0).contains(&aspect) { + return Err(BitFunError::tool(format!( + "visual_grid: detected grid is 
implausibly non-square (x0={}, y0={}, width={}, height={}, aspect={:.2}); pass image_grid with an explicit rectangle", + x0, y0, w, h, aspect + ))); + } + + Ok((x0 as i32, y0 as i32, w, h)) +} + +fn projection_darkness( + img: &image::RgbImage, + left: u32, + top: u32, + right: u32, + bottom: u32, + vertical: bool, +) -> Vec { + let len = (if vertical { right - left } else { bottom - top }) as usize; + let mut out = vec![0.0; len]; + if vertical { + for x in left..right { + let mut sum = 0.0; + for y in top..bottom { + let p = img.get_pixel(x, y).0; + let gray = 0.299 * p[0] as f64 + 0.587 * p[1] as f64 + 0.114 * p[2] as f64; + sum += (255.0 - gray).max(0.0); + } + out[(x - left) as usize] = sum / (bottom - top).max(1) as f64; + } + } else { + for y in top..bottom { + let mut sum = 0.0; + for x in left..right { + let p = img.get_pixel(x, y).0; + let gray = 0.299 * p[0] as f64 + 0.587 * p[1] as f64 + 0.114 * p[2] as f64; + sum += (255.0 - gray).max(0.0); + } + out[(y - top) as usize] = sum / (right - left).max(1) as f64; + } + } + smooth_projection(&out, 2) +} + +fn smooth_projection(values: &[f64], radius: usize) -> Vec { + if values.is_empty() { + return Vec::new(); + } + let mut out = Vec::with_capacity(values.len()); + for i in 0..values.len() { + let start = i.saturating_sub(radius); + let end = (i + radius + 1).min(values.len()); + let sum: f64 = values[start..end].iter().sum(); + out.push(sum / (end - start).max(1) as f64); + } + out +} + +fn detect_regular_line_sequence( + projection: &[f64], + count: u32, + offset: u32, +) -> BitFunResult> { + if projection.len() < count as usize { + return Err(BitFunError::tool( + "visual_grid: projection is smaller than requested grid count".to_string(), + )); + } + let mut sorted = projection.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let baseline = sorted[sorted.len() / 2]; + let adjusted: Vec = projection + .iter() + .map(|v| (*v - baseline).max(0.0)) + .collect(); + let 
mut adjusted_sorted = adjusted.clone(); + adjusted_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let threshold = adjusted_sorted + [(adjusted_sorted.len() * 95 / 100).min(adjusted_sorted.len().saturating_sub(1))] + .max(1.0); + let mut peaks: Vec = Vec::new(); + let min_gap = ((projection.len() as f64 / count.max(1) as f64) * 0.35).round() as usize; + let mut i = 0usize; + while i < projection.len() { + if adjusted[i] < threshold { + i += 1; + continue; + } + let start = i; + let mut best = i; + let mut best_score = adjusted[i]; + while i < adjusted.len() && adjusted[i] >= threshold { + if adjusted[i] > best_score { + best = i; + best_score = adjusted[i]; + } + i += 1; + } + let end = i.saturating_sub(1); + let center = if best_score <= threshold { + (start + end) / 2 + } else { + best + }; + if let Some(last) = peaks.last_mut() { + if center.saturating_sub(*last) < min_gap.max(2) { + if adjusted[center] > adjusted[*last] { + *last = center; + } + continue; + } + } + peaks.push(center); + } + if peaks.len() < 2 { + if let Some(fallback) = top_regular_positions(&adjusted, count, offset, min_gap.max(2)) { + return Ok(fallback); + } + return Err(BitFunError::tool( + "visual_grid: could not find enough line peaks".to_string(), + )); + } + + let mut best: Option<(f64, Vec)> = None; + let desired = count as usize; + for a_idx in 0..peaks.len() { + for b_idx in (a_idx + 1)..peaks.len() { + let first = peaks[a_idx] as f64; + let last = peaks[b_idx] as f64; + let span = last - first; + if span < desired.saturating_sub(1).max(1) as f64 * 4.0 { + continue; + } + let step = span / desired.saturating_sub(1).max(1) as f64; + let tolerance = (step * 0.18).max(3.0); + let mut positions = Vec::with_capacity(desired); + let mut score = 0.0; + let mut matched = 0usize; + for k in 0..desired { + let expected = first + k as f64 * step; + let nearest = peaks + .iter() + .min_by(|a, b| { + ((**a as f64 - expected).abs()) + .partial_cmp(&((**b as f64 - 
expected).abs())) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .copied(); + let pos = if let Some(p) = nearest { + if (p as f64 - expected).abs() <= tolerance { + matched += 1; + p as f64 + } else { + expected + } + } else { + expected + }; + let idx = pos.round().clamp(0.0, projection.len().saturating_sub(1) as f64) + as usize; + score += adjusted[idx]; + positions.push(offset + idx as u32); + } + if matched < (desired * 2 / 3).max(2) { + continue; + } + score += matched as f64 * threshold; + score += span * 0.02; + if best.as_ref().map(|(s, _)| score > *s).unwrap_or(true) { + best = Some((score, positions)); + } + } + } + + best.map(|(_, positions)| positions) + .or_else(|| top_regular_positions(&adjusted, count, offset, min_gap.max(2))) + .ok_or_else(|| { + BitFunError::tool( + "visual_grid: no regular grid sequence detected; pass image_grid with an explicit rectangle or build_visual_mark_view to choose a point" + .to_string(), + ) + }) +} + +fn top_regular_positions( + scores: &[f64], + count: u32, + offset: u32, + min_gap: usize, +) -> Option> { + let desired = count as usize; + let mut ranked: Vec = (0..scores.len()).collect(); + ranked.sort_by(|a, b| { + scores[*b] + .partial_cmp(&scores[*a]) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let mut selected: Vec = Vec::with_capacity(desired); + for idx in ranked { + if scores[idx] <= 0.0 { + break; + } + if selected + .iter() + .any(|s| idx.abs_diff(*s) < min_gap.max(2)) + { + continue; + } + selected.push(idx); + if selected.len() == desired { + break; + } + } + if selected.len() < desired { + return None; + } + selected.sort_unstable(); + Some(selected.into_iter().map(|idx| offset + idx as u32).collect()) +} + +/// Returns `true` if the error reported by `resolve_interactive_index` +/// is the recoverable `STALE_INTERACTIVE_VIEW` variant. 
We match on the +/// error text rather than introducing a typed error enum because every +/// `BitFunError::tool` is already string-based throughout the host +/// surface; adding a new variant would ripple through ~40 callers. +fn is_stale_interactive_view_error(err: &BitFunError) -> bool { + err.to_string().contains("STALE_INTERACTIVE_VIEW") +} + +fn is_stale_visual_mark_view_error(err: &BitFunError) -> bool { + err.to_string().contains("STALE_VISUAL_MARK_VIEW") +} + +impl DesktopComputerUseHost { + /// Return the image-pixel center `(x, y)` of the cached interactive + /// element with the given `i`, when its `frame_image` is known. Used + /// as a pointer-click fallback in `interactive_click` when AXPress + /// fails (Electron / Canvas / custom-drawn surfaces). + #[cfg(target_os = "macos")] + async fn cached_interactive_image_center( + &self, + app: &AppSelector, + i: u32, + ) -> Option<(i32, i32)> { + let pid = resolve_pid_macos(self, app).await.ok()?; + let s = self.state.lock().ok()?; + let cached = s.interactive_view_cache.get(&pid)?; + let el = cached.elements.iter().find(|e| e.i == i)?; + let (ix, iy, iw, ih) = el.frame_image?; + Some(( + (ix as i64 + (iw as i64) / 2) as i32, + (iy as i64 + (ih as i64) / 2) as i32, + )) + } + + /// Resolve an `interactive_*` `i` index into the underlying AX `node_idx` + /// using the per-pid cache populated by `build_interactive_view`. Returns + /// a `STALE_INTERACTIVE_VIEW` tool error when the digest no longer matches + /// (i.e. the UI changed between view + action) so the caller can re-build + /// the interactive view before retrying. 
+ #[cfg(target_os = "macos")] + async fn resolve_interactive_index( + &self, + app: &AppSelector, + i: u32, + before_digest: Option<&str>, + ) -> BitFunResult { + let pid = resolve_pid_macos(self, app).await?; + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + let cached = s.interactive_view_cache.get(&pid).ok_or_else(|| { + BitFunError::tool( + "INTERACTIVE_VIEW_MISSING: call `build_interactive_view` before `interactive_*` actions" + .to_string(), + ) + })?; + if let Some(want) = before_digest { + let want = want.trim(); + if !want.is_empty() { + let matches = if want.len() >= 8 && want.len() <= cached.digest.len() { + cached.digest.starts_with(want) + } else { + want == cached.digest + }; + if !matches { + return Err(BitFunError::tool(format!( + "STALE_INTERACTIVE_VIEW: before_view_digest={} but current cached digest={}; re-call `build_interactive_view` and reuse the new digest (full or >=8-char prefix)", + want, cached.digest + ))); + } + } + } + let el = cached.elements.iter().find(|e| e.i == i).ok_or_else(|| { + BitFunError::tool(format!( + "INTERACTIVE_INDEX_OUT_OF_RANGE: i={} not in cached view (len={})", + i, + cached.elements.len() + )) + })?; + Ok(el.node_idx) + } + + #[cfg(target_os = "macos")] + async fn resolve_visual_mark( + &self, + app: &AppSelector, + i: u32, + before_digest: Option<&str>, + ) -> BitFunResult { + let pid = resolve_pid_macos(self, app).await?; + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + let cached = s.visual_mark_cache.get(&pid).ok_or_else(|| { + BitFunError::tool( + "VISUAL_MARK_VIEW_MISSING: call `build_visual_mark_view` before `visual_click`" + .to_string(), + ) + })?; + if let Some(want) = before_digest { + let want = want.trim(); + if !want.is_empty() { + let matches = if want.len() >= 8 && want.len() <= cached.digest.len() { + cached.digest.starts_with(want) + } else { + want == cached.digest + }; + if !matches { + return 
Err(BitFunError::tool(format!( + "STALE_VISUAL_MARK_VIEW: before_view_digest={} but current cached digest={}; re-call `build_visual_mark_view` and reuse the new digest (full or >=8-char prefix)", + want, cached.digest + ))); + } + } + } + cached + .marks + .iter() + .find(|mark| mark.i == i) + .cloned() + .ok_or_else(|| { + BitFunError::tool(format!( + "VISUAL_INDEX_OUT_OF_RANGE: i={} not in cached visual mark view (len={})", + i, + cached.marks.len() + )) + }) + } } diff --git a/src/apps/desktop/src/computer_use/interactive_filter.rs b/src/apps/desktop/src/computer_use/interactive_filter.rs new file mode 100644 index 000000000..cdab7f3e3 --- /dev/null +++ b/src/apps/desktop/src/computer_use/interactive_filter.rs @@ -0,0 +1,533 @@ +//! Filter a Codex-style [`AxNode`] tree into a Set-of-Mark +//! [`InteractiveElement`] list (TuriX-CUA inspired). +//! +//! The model's job is "pick a number" — to make that work we need: +//! 1. Drop non-interactive containers (groups, scroll areas, generic AXGroup). +//! 2. Drop nodes with zero / off-screen frames. +//! 3. Sort deterministically so the same UI always yields the same `i`. +//! 4. Assign dense `i` indices (0, 1, 2, …). +//! 5. Project each global frame to JPEG image pixel coordinates so the +//! overlay renderer knows where to paint the numbered box. +//! +//! Image projection uses [`ComputerScreenshot::image_global_bounds`] when +//! present (the host fills it for both full-display and crop-around-window +//! captures), falling back to a conservative "skip the box" when bounds +//! are unknown — better to omit a label than to paint it on the wrong +//! widget. + +#![allow(dead_code)] + +use bitfun_core::agentic::tools::computer_use_host::{ + AxNode, ComputerScreenshot, InteractiveElement, +}; + +/// Per-host filter knobs. +#[derive(Debug, Clone)] +pub(crate) struct FilterOpts { + /// Hard cap on emitted elements. The filter keeps the largest-area + /// elements when exceeded so the overlay stays legible. 
+ pub max_elements: usize, + /// When `true`, only elements whose frame intersects the focused + /// window's image rectangle are kept. The host passes the rectangle + /// via `image_global_bounds`; when bounds are missing we keep + /// everything. + pub clip_to_image_bounds: bool, +} + +impl Default for FilterOpts { + fn default() -> Self { + Self { + max_elements: 80, + clip_to_image_bounds: true, + } + } +} + +/// Build the SoM element list from a raw AX dump + the focused-window +/// screenshot the host already captured. The returned vector is sorted +/// deterministically and densely indexed (`elements[k].i == k as u32`). +pub(crate) fn build_interactive_elements( + nodes: &[AxNode], + screenshot: Option<&ComputerScreenshot>, + opts: &FilterOpts, +) -> Vec { + let mut staged: Vec = Vec::with_capacity(nodes.len() / 4); + + for n in nodes { + if !is_interactive(n) { + continue; + } + let Some(frame) = n.frame_global else { + continue; + }; + let (gx, gy, gw, gh) = frame; + if gw < 4.0 || gh < 4.0 { + continue; + } + + let frame_image = screenshot + .and_then(|s| project_global_to_image(s, gx, gy, gw, gh, opts.clip_to_image_bounds)); + + // When clipping is requested and the host provided bounds, drop + // anything that falls entirely outside the captured rectangle. 
+ if opts.clip_to_image_bounds { + if let Some(s) = screenshot { + if s.image_global_bounds.is_some() && frame_image.is_none() { + continue; + } + } + } + + staged.push(Staged { + node_idx: n.idx, + role: n.role.clone(), + subrole: n.subrole.clone(), + label: best_label(n), + frame_global: frame, + frame_image, + enabled: n.enabled, + focused: n.focused, + ax_actionable: n.actions.iter().any(|a| { + matches!( + a.as_str(), + "AXPress" | "AXConfirm" | "AXOpen" | "AXShowMenu" | "AXPick" + ) + }), + area: (gw * gh) as f64, + }); + } + + // Card-merge heuristic: when an actionable container (AXCell / AXRow / + // AXButton / AXLink / AXGroup-with-AXPress) geometrically contains + // smaller actionable children that are themselves actionable, drop + // the children. Without this the SoM overlay shows 3-5 stacked + // numbers on a single card (icon + label + cell) and the model has + // to guess which one actually fires the navigation. Keep the card. + // + // Containment rule: parent area is at least 1.5x the child, and the + // child rectangle is fully (with 2pt slop) inside the parent. + if staged.len() > 1 { + let originals = staged.clone(); + staged.retain(|child| { + let (cx, cy, cw, ch) = child.frame_global; + !originals.iter().any(|parent| { + if parent.node_idx == child.node_idx { + return false; + } + if !is_card_container(&parent.role) { + return false; + } + if parent.area < child.area * 1.5 { + return false; + } + let (px, py, pw, ph) = parent.frame_global; + cx + 2.0 >= px + && cy + 2.0 >= py + && cx + cw <= px + pw + 2.0 + && cy + ch <= py + ph + 2.0 + }) + }); + } + + // Stable deterministic sort: top-to-bottom, then left-to-right. + // Buckets of 16pt eliminate jitter from baseline differences between + // controls on the same row. 
+ staged.sort_by(|a, b| { + let (ax, ay, _, _) = a.frame_global; + let (bx, by, _, _) = b.frame_global; + let ay_b = (ay / 16.0).floor() as i64; + let by_b = (by / 16.0).floor() as i64; + ay_b.cmp(&by_b) + .then_with(|| ax.partial_cmp(&bx).unwrap_or(std::cmp::Ordering::Equal)) + .then_with(|| a.node_idx.cmp(&b.node_idx)) + }); + + if staged.len() > opts.max_elements { + // Keep the largest-area elements so the overlay stays readable on + // dense pages. We still preserve the deterministic display order + // afterwards by re-sorting the kept slice. + let mut by_area = staged; + by_area.sort_by(|a, b| b.area.partial_cmp(&a.area).unwrap_or(std::cmp::Ordering::Equal)); + by_area.truncate(opts.max_elements); + by_area.sort_by(|a, b| { + let (ax, ay, _, _) = a.frame_global; + let (bx, by, _, _) = b.frame_global; + let ay_b = (ay / 16.0).floor() as i64; + let by_b = (by / 16.0).floor() as i64; + ay_b.cmp(&by_b) + .then_with(|| ax.partial_cmp(&bx).unwrap_or(std::cmp::Ordering::Equal)) + .then_with(|| a.node_idx.cmp(&b.node_idx)) + }); + staged = by_area; + } + + staged + .into_iter() + .enumerate() + .map(|(i, s)| InteractiveElement { + i: i as u32, + node_idx: s.node_idx, + role: s.role, + subrole: s.subrole, + label: s.label, + frame_image: s.frame_image, + frame_global: Some(s.frame_global), + enabled: s.enabled, + focused: s.focused, + ax_actionable: s.ax_actionable, + }) + .collect() +} + +/// Render a compact one-line-per-element text rendering used in the model +/// prompt alongside the annotated screenshot. 
+pub(crate) fn render_element_tree_text(elements: &[InteractiveElement]) -> String { + let mut out = String::with_capacity(elements.len() * 64); + for e in elements { + let label = e.label.as_deref().unwrap_or(""); + let role = display_role(&e.role, e.subrole.as_deref()); + let mut line = format!("[{}] {} \"{}\"", e.i, role, label); + if e.focused { + line.push_str(" [focused]"); + } + if !e.enabled { + line.push_str(" [disabled]"); + } + if !e.ax_actionable { + line.push_str(" [pointer-only]"); + } + out.push_str(&line); + out.push('\n'); + } + out +} + +/// Roles eligible to "absorb" smaller actionable descendants in the SoM +/// overlay. Anything else (text fields, sliders, menu items …) keeps its +/// children visible — those tend to need direct interaction at the leaf. +fn is_card_container(role: &str) -> bool { + matches!( + role, + "AXCell" + | "AXRow" + | "AXOutlineRow" + | "AXButton" + | "AXMenuButton" + | "AXPopUpButton" + | "AXLink" + | "AXGroup" + ) +} + +#[derive(Clone)] +struct Staged { + node_idx: u32, + role: String, + subrole: Option, + label: Option, + frame_global: (f64, f64, f64, f64), + frame_image: Option<(u32, u32, u32, u32)>, + enabled: bool, + focused: bool, + ax_actionable: bool, + area: f64, +} + +/// Heuristic — keep elements a sighted user would consider "clickable" / +/// "fillable" / "selectable", and explicit text containers that are large +/// enough to be primary targets (so the model can disambiguate "the +/// button labelled X" from "the row labelled X" when both exist). +fn is_interactive(n: &AxNode) -> bool { + if !n.enabled { + return false; + } + let role = n.role.as_str(); + + // Always interactive roles. 
+ matches!( + role, + "AXButton" + | "AXMenuButton" + | "AXPopUpButton" + | "AXCheckBox" + | "AXRadioButton" + | "AXSwitch" + | "AXToggle" + | "AXTextField" + | "AXSecureTextField" + | "AXSearchField" + | "AXTextArea" + | "AXComboBox" + | "AXLink" + | "AXTab" + | "AXTabGroup" + | "AXSlider" + | "AXIncrementor" + | "AXStepper" + | "AXMenu" + | "AXMenuItem" + | "AXMenuBarItem" + | "AXDisclosureTriangle" + | "AXRow" + | "AXOutlineRow" + | "AXCell" + ) || + // Or: any node that exposes an actionable AX action. + n.actions.iter().any(|a| { + matches!( + a.as_str(), + "AXPress" | "AXConfirm" | "AXOpen" | "AXShowMenu" | "AXPick" | "AXIncrement" | "AXDecrement" + ) + }) +} + +fn best_label(n: &AxNode) -> Option { + for cand in [&n.title, &n.description, &n.help, &n.value, &n.identifier] { + if let Some(s) = cand { + let trimmed = s.trim(); + if !trimmed.is_empty() { + return Some(clip(trimmed, 80)); + } + } + } + None +} + +fn clip(s: &str, max_chars: usize) -> String { + let mut out: String = s.chars().take(max_chars).collect(); + if s.chars().count() > max_chars { + out.push('…'); + } + out +} + +fn display_role(role: &str, subrole: Option<&str>) -> String { + let stripped = role.strip_prefix("AX").unwrap_or(role); + match subrole { + Some(sr) if !sr.is_empty() => { + let sr_stripped = sr.strip_prefix("AX").unwrap_or(sr); + format!("{}({})", stripped, sr_stripped) + } + _ => stripped.to_string(), + } +} + +/// Project a global pointer-space rectangle onto the JPEG image pixel +/// grid. Returns `None` when the screenshot has no `image_global_bounds` +/// (host could not resolve the mapping), or the rectangle falls entirely +/// outside the captured area. 
+fn project_global_to_image( + shot: &ComputerScreenshot, + gx: f64, + gy: f64, + gw: f64, + gh: f64, + require_intersection: bool, +) -> Option<(u32, u32, u32, u32)> { + let bounds = shot.image_global_bounds.as_ref()?; + if bounds.width <= 0.0 || bounds.height <= 0.0 { + return None; + } + + let scale_x = shot.image_width as f64 / bounds.width; + let scale_y = shot.image_height as f64 / bounds.height; + + // Clip the global rectangle to the image rectangle. + let lx = gx.max(bounds.left); + let ty = gy.max(bounds.top); + let rx = (gx + gw).min(bounds.left + bounds.width); + let by = (gy + gh).min(bounds.top + bounds.height); + if rx <= lx || by <= ty { + if require_intersection { + return None; + } + // No intersection but caller wants a best-effort projection — fall + // through using the unclipped rectangle so the overlay can decide + // whether to draw a clipped marker. + let ix = ((gx - bounds.left) * scale_x).round(); + let iy = ((gy - bounds.top) * scale_y).round(); + let iw = (gw * scale_x).round().max(1.0); + let ih = (gh * scale_y).round().max(1.0); + return Some(( + ix.max(0.0) as u32, + iy.max(0.0) as u32, + iw as u32, + ih as u32, + )); + } + + let ix = ((lx - bounds.left) * scale_x).round(); + let iy = ((ty - bounds.top) * scale_y).round(); + let iw = ((rx - lx) * scale_x).round().max(1.0); + let ih = ((by - ty) * scale_y).round().max(1.0); + + let max_x = shot.image_width.saturating_sub(1) as f64; + let max_y = shot.image_height.saturating_sub(1) as f64; + Some(( + ix.max(0.0).min(max_x) as u32, + iy.max(0.0).min(max_y) as u32, + iw as u32, + ih as u32, + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use bitfun_core::agentic::tools::computer_use_host::ComputerUseImageGlobalBounds; + + fn node(idx: u32, role: &str, frame: Option<(f64, f64, f64, f64)>) -> AxNode { + AxNode { + idx, + parent_idx: None, + role: role.to_string(), + title: Some(format!("label-{idx}")), + value: None, + description: None, + identifier: None, + enabled: true, + 
focused: false, + selected: None, + frame_global: frame, + actions: vec!["AXPress".into()], + role_description: None, + subrole: None, + help: None, + url: None, + expanded: None, + } + } + + fn screenshot() -> ComputerScreenshot { + ComputerScreenshot { + screenshot_id: Some("test-shot".to_string()), + bytes: vec![], + mime_type: "image/jpeg".to_string(), + image_width: 1000, + image_height: 800, + native_width: 2000, + native_height: 1600, + display_origin_x: 0, + display_origin_y: 0, + vision_scale: 0.5, + pointer_image_x: None, + pointer_image_y: None, + screenshot_crop_center: None, + point_crop_half_extent_native: None, + navigation_native_rect: None, + quadrant_navigation_click_ready: false, + image_content_rect: None, + image_global_bounds: Some(ComputerUseImageGlobalBounds { + left: 0.0, + top: 0.0, + width: 500.0, + height: 400.0, + }), + ui_tree_text: None, + implicit_confirmation_crop_applied: false, + } + } + + #[test] + fn drops_non_interactive_and_off_screen_nodes() { + let mut group = node(0, "AXGroup", Some((0.0, 0.0, 100.0, 100.0))); + group.actions.clear(); + let nodes = vec![ + group, + node(1, "AXButton", Some((10.0, 10.0, 50.0, 30.0))), + node(2, "AXButton", None), + node(3, "AXButton", Some((1.0, 1.0, 2.0, 2.0))), + ]; + let opts = FilterOpts::default(); + let out = build_interactive_elements(&nodes, Some(&screenshot()), &opts); + assert_eq!(out.len(), 1); + assert_eq!(out[0].i, 0); + assert_eq!(out[0].node_idx, 1); + } + + #[test] + fn projects_frame_to_image_pixels_with_scale() { + let nodes = vec![node(0, "AXButton", Some((100.0, 80.0, 50.0, 40.0)))]; + let out = build_interactive_elements(&nodes, Some(&screenshot()), &FilterOpts::default()); + let (ix, iy, iw, ih) = out[0].frame_image.expect("frame_image present"); + // bounds 500x400 → image 1000x800 → 2x scale on both axes. 
+ assert_eq!(ix, 200); + assert_eq!(iy, 160); + assert_eq!(iw, 100); + assert_eq!(ih, 80); + } + + #[test] + fn dense_indices_in_top_to_bottom_order() { + let nodes = vec![ + node(0, "AXButton", Some((400.0, 200.0, 30.0, 20.0))), + node(1, "AXButton", Some((100.0, 100.0, 30.0, 20.0))), + node(2, "AXButton", Some((50.0, 200.0, 30.0, 20.0))), + ]; + let out = build_interactive_elements(&nodes, Some(&screenshot()), &FilterOpts::default()); + assert_eq!(out.len(), 3); + assert_eq!(out[0].node_idx, 1); // top row + assert_eq!(out[1].node_idx, 2); // bottom-left + assert_eq!(out[2].node_idx, 0); // bottom-right + for (k, e) in out.iter().enumerate() { + assert_eq!(e.i, k as u32); + } + } + + #[test] + fn caps_at_max_elements() { + let nodes: Vec<_> = (0..10) + .map(|k| node(k, "AXButton", Some((k as f64 * 50.0, 10.0, 30.0, 20.0)))) + .collect(); + let opts = FilterOpts { + max_elements: 4, + ..FilterOpts::default() + }; + let out = build_interactive_elements(&nodes, Some(&screenshot()), &opts); + assert_eq!(out.len(), 4); + } + + #[test] + fn card_container_absorbs_contained_actionable_children() { + // Outer cell (large) + inner button + inner static-text-as-button, + // all actionable. Card-merge should keep the cell only. + let cell = node(10, "AXCell", Some((0.0, 0.0, 300.0, 80.0))); + let inner_btn = node(11, "AXButton", Some((10.0, 10.0, 60.0, 60.0))); + let inner_btn2 = node(12, "AXButton", Some((100.0, 20.0, 100.0, 30.0))); + // Sibling button outside the cell stays. 
+ let outside = node(13, "AXButton", Some((400.0, 0.0, 50.0, 30.0))); + let nodes = vec![cell, inner_btn, inner_btn2, outside]; + let out = build_interactive_elements(&nodes, Some(&screenshot()), &FilterOpts::default()); + let kept_idx: Vec = out.iter().map(|e| e.node_idx).collect(); + assert!(kept_idx.contains(&10), "cell must survive: {:?}", kept_idx); + assert!( + kept_idx.contains(&13), + "outside btn must survive: {:?}", + kept_idx + ); + assert!( + !kept_idx.contains(&11), + "inner btn 11 must be absorbed: {:?}", + kept_idx + ); + assert!( + !kept_idx.contains(&12), + "inner btn 12 must be absorbed: {:?}", + kept_idx + ); + } + + #[test] + fn render_text_lists_one_per_line() { + let nodes = vec![ + node(0, "AXButton", Some((10.0, 10.0, 30.0, 20.0))), + node(1, "AXTextField", Some((10.0, 50.0, 100.0, 20.0))), + ]; + let elements = + build_interactive_elements(&nodes, Some(&screenshot()), &FilterOpts::default()); + let text = render_element_tree_text(&elements); + let mut lines = text.lines(); + assert_eq!(lines.next(), Some("[0] Button \"label-0\"")); + assert_eq!(lines.next(), Some("[1] TextField \"label-1\"")); + } +} diff --git a/src/apps/desktop/src/computer_use/linux_ax_ui.rs b/src/apps/desktop/src/computer_use/linux_ax_ui.rs index f23e6e270..2e931c5f7 100644 --- a/src/apps/desktop/src/computer_use/linux_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/linux_ax_ui.rs @@ -33,6 +33,15 @@ pub async fn locate_ui_element_center( query: UiElementLocateQuery, ) -> BitFunResult { ui_locate_common::validate_query(&query)?; + + if query.node_idx.is_some() { + return Err(BitFunError::tool( + "[AX_IDX_NOT_SUPPORTED] node_idx lookup is only implemented on macOS. \ + Fall back to `text_contains` / `title_contains` + `role_substring` on this host." 
+ .to_string(), + )); + } + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); let max_nodes = 12_000usize; @@ -76,13 +85,22 @@ pub async fn locate_ui_element_center( let name = acc.name().await.unwrap_or_default(); let ident = acc.accessible_id().await.unwrap_or_default(); let role = role_match_string(&acc).await; + let description = acc.description().await.unwrap_or_default(); - let matched = ui_locate_common::matches_filters( - &query, - Some(role.as_str()), - Some(name.as_str()), - Some(ident.as_str()), - ); + let attrs = ui_locate_common::NodeAttrs { + role: Some(role.as_str()), + subrole: None, + title: Some(name.as_str()), + value: None, + description: if description.is_empty() { + None + } else { + Some(description.as_str()) + }, + identifier: Some(ident.as_str()), + help: None, + }; + let matched = ui_locate_common::matches_filters_attrs(&query, &attrs); if matched { if let Some((x, y, w, h)) = component_extents_screen(&acc).await { if w > 0 && h > 0 { diff --git a/src/apps/desktop/src/computer_use/macos_ax_dump.rs b/src/apps/desktop/src/computer_use/macos_ax_dump.rs new file mode 100644 index 000000000..9cb11d6a4 --- /dev/null +++ b/src/apps/desktop/src/computer_use/macos_ax_dump.rs @@ -0,0 +1,794 @@ +//! Codex-style macOS Accessibility (AX) tree dump. +//! +//! Walks an application's full AX tree (BFS) starting from a `pid`, emits: +//! * a human-readable indented `tree_text` (Codex parity), +//! * a structured `Vec` with stable, monotonic `idx` values, +//! * a sha1 `digest` over the structural fingerprint so callers can detect +//! "did anything change?" cheaply, +//! * a per-pid cache mapping `idx → AXUIElementRef` so subsequent +//! `app_click` / `app_type_text` / ... actions can resolve a numeric idx +//! back to a live AX element without re-walking. +//! +//! All AX refs returned in the cache are `CFRetain`-ed and released when +//! the snapshot for that pid is replaced. 
+ +// Symbols here are wired up by the ControlHub `desktop.*` dispatch layer in a +// follow-up step (`controlhub-actions`). Until then, suppress dead-code lints +// without weakening real warnings elsewhere. +#![allow(dead_code)] + +use bitfun_core::agentic::tools::computer_use_host::{AppStateSnapshot, AxNode}; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use core_foundation::array::{CFArray, CFArrayRef}; +use core_foundation::base::{CFGetTypeID, CFTypeRef, TCFType}; +use core_foundation::boolean::{CFBooleanGetTypeID, CFBooleanRef}; +use core_foundation::string::{CFString, CFStringRef}; +use core_graphics::geometry::{CGPoint, CGSize}; +use sha1::{Digest, Sha1}; +use std::collections::{HashMap, VecDeque}; +use std::ffi::c_void; +use std::sync::{Mutex, OnceLock}; +use std::time::{SystemTime, UNIX_EPOCH}; + +type CFNumberRef = *const c_void; +type CFTypeID = usize; +const K_CF_NUMBER_DOUBLE_TYPE: i32 = 13; +const K_CF_NUMBER_LONG_LONG_TYPE: i32 = 11; + +type AXUIElementRef = *const c_void; +type AXValueRef = *const c_void; + +#[link(name = "ApplicationServices", kind = "framework")] +unsafe extern "C" { + fn AXUIElementCreateApplication(pid: i32) -> AXUIElementRef; + fn AXUIElementCopyAttributeValue( + element: AXUIElementRef, + attribute: CFStringRef, + value: *mut CFTypeRef, + ) -> i32; + fn AXUIElementCopyActionNames(element: AXUIElementRef, names: *mut CFArrayRef) -> i32; + fn AXValueGetType(value: AXValueRef) -> u32; + fn AXValueGetValue(value: AXValueRef, the_type: u32, ptr: *mut c_void) -> bool; +} + +#[link(name = "CoreFoundation", kind = "framework")] +unsafe extern "C" { + fn CFRetain(cf: CFTypeRef) -> CFTypeRef; + fn CFBooleanGetValue(boolean: CFBooleanRef) -> u8; + fn CFStringGetTypeID() -> CFTypeID; + fn CFNumberGetTypeID() -> CFTypeID; + fn CFNumberIsFloatType(number: CFNumberRef) -> u8; + fn CFNumberGetValue(number: CFNumberRef, the_type: i32, value_ptr: *mut c_void) -> u8; +} + +const K_AX_VALUE_CGPOINT: u32 = 1; +const 
K_AX_VALUE_CGSIZE: u32 = 2; + +// ── Wrappers around raw pointers so we can stash them in `Send`-able caches ─ + +/// Newtype wrapping `AXUIElementRef`. Manually implements `Send + Sync` — +/// AX refs are CF objects, safe to share across threads as long as we only +/// drop them with `CFRelease`. The cache is internally locked. +#[derive(Copy, Clone)] +pub(crate) struct AxRef(pub AXUIElementRef); +unsafe impl Send for AxRef {} +unsafe impl Sync for AxRef {} + +impl AxRef { + fn release(self) { + if !self.0.is_null() { + unsafe { core_foundation::base::CFRelease(self.0 as CFTypeRef) }; + } + } +} + +// ── Per-pid cache: snapshot id → idx → retained AXUIElementRef ───────────── +// +// We keep the most recent snapshot per pid only; resolving a stale `idx` +// against an old snapshot returns `None`, which the dispatch layer maps to +// `AX_NODE_STALE`. + +struct CachedSnapshot { + digest: String, + refs: Vec, +} + +impl Drop for CachedSnapshot { + fn drop(&mut self) { + for r in self.refs.drain(..) { + r.release(); + } + } +} + +static SNAPSHOT_CACHE: OnceLock>> = OnceLock::new(); + +fn snapshot_cache() -> &'static Mutex> { + SNAPSHOT_CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +/// Resolve `(pid, idx)` to a live AX ref. Caller must NOT release it; the +/// cache owns the retain. Returns `None` if the snapshot has been replaced +/// (i.e. the digest no longer matches) or the idx is out of range. +pub(crate) fn cached_ref(pid: i32, expected_digest: Option<&str>, idx: u32) -> Option { + let cache = snapshot_cache().lock().ok()?; + let snap = cache.get(&pid)?; + if let Some(want) = expected_digest { + if snap.digest != want { + return None; + } + } + snap.refs.get(idx as usize).copied() +} + +/// Like `cached_ref` but does not require a digest match. Used for +/// best-effort follow-up actions where the caller did not have a chance to +/// re-snapshot (e.g. `app_wait_for` polling). 
+pub(crate) fn cached_ref_loose(pid: i32, idx: u32) -> Option { + cached_ref(pid, None, idx) +} + +// ── Low-level CF / AX helpers (intentionally separate from macos_ax_ui.rs +// to keep the older locate path self-contained and untouched) ────────── + +unsafe fn ax_release(v: CFTypeRef) { + if !v.is_null() { + core_foundation::base::CFRelease(v); + } +} + +unsafe fn ax_copy_attr(elem: AXUIElementRef, key: &str) -> Option { + let mut val: CFTypeRef = std::ptr::null(); + let k = CFString::new(key); + let st = AXUIElementCopyAttributeValue(elem, k.as_concrete_TypeRef(), &mut val); + if st != 0 || val.is_null() { + if !val.is_null() { + ax_release(val); + } + return None; + } + Some(val) +} + +/// Safely convert a CF object to a Rust `String`. **MUST type-check first**: +/// blindly wrapping a non-CFString as `CFStringRef` and calling `.to_string()` +/// dispatches `_fastCStringContents:` to whatever class the object actually +/// is, raising an Objective-C `NSException` (`unrecognized selector …`) that +/// unwinds across the FFI boundary and either aborts the process or, if +/// caught, simply blanks out the entire AX snapshot. +/// +/// This is the canonical foot-gun on Tauri / Electron / WebKit-hosted apps, +/// where `AXValue` on tabs is the selected child *element*, on toggles is a +/// `CFNumber`, on bool attributes is a `CFBoolean`, and on geometric +/// attributes is an opaque `AXValueRef` — none of which are strings. +unsafe fn cfstring_to_string(cf: CFTypeRef) -> Option { + if cf.is_null() { + return None; + } + if CFGetTypeID(cf) != CFStringGetTypeID() { + return None; + } + let s = CFString::wrap_under_get_rule(cf as CFStringRef); + Some(s.to_string()) +} + +/// Best-effort: read an attribute and coerce *whatever* CF type comes back +/// into a printable string — strings stay verbatim, booleans become +/// `"true"`/`"false"`, numbers become decimal, AX value refs (CGPoint / +/// CGSize / CGRect) become `(x, y)` / `(w x h)` / `(x, y, w, h)`. 
Anything +/// else (e.g. an AXUIElementRef returned for `AXValue` on a tab group) +/// becomes `None` rather than blowing up. +unsafe fn cf_to_display_string(cf: CFTypeRef) -> Option { + if cf.is_null() { + return None; + } + let tid = CFGetTypeID(cf); + if tid == CFStringGetTypeID() { + let s = CFString::wrap_under_get_rule(cf as CFStringRef); + return Some(s.to_string()); + } + if tid == CFBooleanGetTypeID() { + return Some(if CFBooleanGetValue(cf as CFBooleanRef) != 0 { + "true".to_string() + } else { + "false".to_string() + }); + } + if tid == CFNumberGetTypeID() { + let nref = cf as CFNumberRef; + if CFNumberIsFloatType(nref) != 0 { + let mut d: f64 = 0.0; + if CFNumberGetValue( + nref, + K_CF_NUMBER_DOUBLE_TYPE, + &mut d as *mut _ as *mut c_void, + ) != 0 + { + // Trim trailing zeros for cleaner display (1.0 → "1"). + let s = format!("{}", d); + return Some(s); + } + return None; + } else { + let mut i: i64 = 0; + if CFNumberGetValue( + nref, + K_CF_NUMBER_LONG_LONG_TYPE, + &mut i as *mut _ as *mut c_void, + ) != 0 + { + return Some(i.to_string()); + } + return None; + } + } + // CGPoint / CGSize / CGRect / CFRange via AXValueRef. + if let Some(p) = ax_value_to_point(cf) { + return Some(format!("({}, {})", p.x, p.y)); + } + if let Some(s) = ax_value_to_size(cf) { + return Some(format!("({} x {})", s.width, s.height)); + } + None +} + +unsafe fn read_cf_string_attr(elem: AXUIElementRef, key: &str) -> Option { + let v = ax_copy_attr(elem, key)?; + let s = cfstring_to_string(v); + ax_release(v); + s +} + +/// Like `read_cf_string_attr` but accepts numbers / booleans / AXValues too +/// (used for `AXValue`, which on macOS can be almost anything depending on +/// the role). 
+unsafe fn read_cf_value_attr(elem: AXUIElementRef, key: &str) -> Option { + let v = ax_copy_attr(elem, key)?; + let s = cf_to_display_string(v); + ax_release(v); + s +} + +unsafe fn read_cf_bool_attr(elem: AXUIElementRef, key: &str) -> Option { + let v = ax_copy_attr(elem, key)?; + let mut out = None; + if CFGetTypeID(v) == CFBooleanGetTypeID() { + out = Some(CFBooleanGetValue(v as CFBooleanRef) != 0); + } + ax_release(v); + out +} + +/// Returns `Some(point)` only if `v` is a non-null AXValueRef encoding a +/// CGPoint. Safe to call on any CFTypeRef — non-AXValue inputs return `None`. +unsafe fn ax_value_to_point(v: CFTypeRef) -> Option { + if v.is_null() { + return None; + } + let av = v as AXValueRef; + if AXValueGetType(av) != K_AX_VALUE_CGPOINT { + return None; + } + let mut pt = CGPoint { x: 0.0, y: 0.0 }; + if !AXValueGetValue(av, K_AX_VALUE_CGPOINT, &mut pt as *mut _ as *mut c_void) { + return None; + } + Some(pt) +} + +unsafe fn ax_value_to_size(v: CFTypeRef) -> Option { + if v.is_null() { + return None; + } + let av = v as AXValueRef; + if AXValueGetType(av) != K_AX_VALUE_CGSIZE { + return None; + } + let mut sz = CGSize { + width: 0.0, + height: 0.0, + }; + if !AXValueGetValue(av, K_AX_VALUE_CGSIZE, &mut sz as *mut _ as *mut c_void) { + return None; + } + Some(sz) +} + +unsafe fn read_global_frame(elem: AXUIElementRef) -> Option<(f64, f64, f64, f64)> { + let pos = ax_copy_attr(elem, "AXPosition")?; + let size = ax_copy_attr(elem, "AXSize")?; + let pt = ax_value_to_point(pos); + let sz = ax_value_to_size(size); + ax_release(pos); + ax_release(size); + let pt = pt?; + let sz = sz?; + Some((pt.x, pt.y, sz.width, sz.height)) +} + +unsafe fn read_action_names(elem: AXUIElementRef) -> Vec { + let mut names: CFArrayRef = std::ptr::null(); + let st = AXUIElementCopyActionNames(elem, &mut names); + if st != 0 || names.is_null() { + return vec![]; + } + let arr = CFArray::<*const c_void>::wrap_under_create_rule(names); + let mut out = Vec::with_capacity(arr.len() 
as usize); + for i in 0..arr.len() { + if let Some(s) = arr.get(i) { + let p = *s; + if !p.is_null() { + out.push(CFString::wrap_under_get_rule(p as CFStringRef).to_string()); + } + } + } + out +} + +// ── BFS walker ──────────────────────────────────────────────────────────── + +struct Queued { + elem: AXUIElementRef, + parent_idx: Option, + depth: u32, +} + +/// Configurable knobs for the dump. Defaults mirror what the dispatch layer +/// will call with: depth 32, focus_window_only false, capped at 4000 nodes. +pub struct DumpOpts { + pub max_depth: u32, + pub max_nodes: usize, + pub focus_window_only: bool, +} + +impl Default for DumpOpts { + fn default() -> Self { + Self { + max_depth: 32, + max_nodes: 4_000, + focus_window_only: false, + } + } +} + +pub fn dump_app_ax(pid: i32, opts: DumpOpts) -> BitFunResult { + let app = unsafe { AXUIElementCreateApplication(pid) }; + if app.is_null() { + return Err(BitFunError::tool(format!( + "AXUIElementCreateApplication returned null for pid={}", + pid + ))); + } + + // Pick the root we'll walk. + let root = if opts.focus_window_only { + unsafe { + try_focused_window(app).unwrap_or_else(|| { + // Retain the app element so we can drop both consistently. + CFRetain(app as CFTypeRef) as AXUIElementRef + }) + } + } else { + unsafe { CFRetain(app as CFTypeRef) as AXUIElementRef } + }; + + let window_title = unsafe { try_focused_window(app) }.and_then(|w| { + let t = unsafe { read_cf_string_attr(w, "AXTitle") }; + unsafe { ax_release(w as CFTypeRef) }; + t + }); + + // We're done with the app handle for now (root is independently retained). 
+ unsafe { ax_release(app as CFTypeRef) }; + + let mut nodes: Vec = Vec::new(); + let mut refs: Vec = Vec::new(); + let mut queue: VecDeque = VecDeque::new(); + queue.push_back(Queued { + elem: root, + parent_idx: None, + depth: 0, + }); + let mut visited: usize = 0; + + while let Some(cur) = queue.pop_front() { + if cur.depth > opts.max_depth || visited >= opts.max_nodes { + unsafe { ax_release(cur.elem as CFTypeRef) }; + continue; + } + visited += 1; + + let idx = nodes.len() as u32; + let role = unsafe { read_cf_string_attr(cur.elem, "AXRole") }; + let role_description = unsafe { read_cf_string_attr(cur.elem, "AXRoleDescription") }; + let subrole = unsafe { read_cf_string_attr(cur.elem, "AXSubrole") }; + let title = unsafe { read_cf_string_attr(cur.elem, "AXTitle") }; + // AXValue is the canonical foot-gun: on a slider it's a CFNumber, on + // a toggle it's a CFBoolean, on a tab group it's an AXUIElementRef + // pointing at the selected child. Use the type-tolerant reader. + let value = unsafe { read_cf_value_attr(cur.elem, "AXValue") }; + let description = unsafe { read_cf_string_attr(cur.elem, "AXDescription") }; + let help = unsafe { read_cf_string_attr(cur.elem, "AXHelp") }; + let identifier = unsafe { read_cf_string_attr(cur.elem, "AXIdentifier") }; + let url = unsafe { read_cf_string_attr(cur.elem, "AXURL") }; + let enabled = unsafe { read_cf_bool_attr(cur.elem, "AXEnabled") }; + let focused = unsafe { read_cf_bool_attr(cur.elem, "AXFocused") }; + let selected = unsafe { read_cf_bool_attr(cur.elem, "AXSelected") }; + let expanded = unsafe { read_cf_bool_attr(cur.elem, "AXExpanded") }; + let frame = unsafe { read_global_frame(cur.elem) }; + let actions = unsafe { read_action_names(cur.elem) }; + + nodes.push(AxNode { + idx, + parent_idx: cur.parent_idx, + role: role.unwrap_or_default(), + title, + value, + description, + identifier, + enabled: enabled.unwrap_or(true), + focused: focused.unwrap_or(false), + selected, + frame_global: frame, + actions, + 
role_description, + subrole, + help, + url, + expanded, + }); + // Cache the retained ref so future actions can look it up. + refs.push(AxRef(cur.elem)); + + // Enqueue children — but DO NOT release `cur.elem`; the cache owns it. + let children_ref = unsafe { ax_copy_attr(cur.elem, "AXChildren") }; + let next_depth = cur.depth + 1; + let Some(ch) = children_ref else { continue }; + unsafe { + let arr = CFArray::<*const c_void>::wrap_under_create_rule(ch as CFArrayRef); + for i in 0..arr.len() { + let Some(slot) = arr.get(i) else { continue }; + let child = *slot; + if child.is_null() { + continue; + } + let retained = CFRetain(child as CFTypeRef) as AXUIElementRef; + if !retained.is_null() { + queue.push_back(Queued { + elem: retained, + parent_idx: Some(idx), + depth: next_depth, + }); + } + } + } + } + // Drain anything we didn't walk (depth-cap or node-cap overflow). + while let Some(q) = queue.pop_front() { + unsafe { ax_release(q.elem as CFTypeRef) }; + } + + let tree_text = render_tree_text(&nodes); + let digest = compute_digest(&nodes); + let captured_at_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + + // Install in cache, replacing any previous snapshot for this pid. + { + let mut cache = snapshot_cache() + .lock() + .map_err(|_| BitFunError::tool("AX snapshot cache poisoned".to_string()))?; + cache.insert( + pid, + CachedSnapshot { + digest: digest.clone(), + refs, + }, + ); + } + + Ok(AppStateSnapshot { + app: bitfun_core::agentic::tools::computer_use_host::AppInfo { + name: window_title.clone().unwrap_or_default(), + bundle_id: None, + pid: Some(pid), + running: true, + last_used_ms: None, + launch_count: 0, + }, + window_title, + tree_text, + nodes, + digest, + captured_at_ms, + screenshot: None, + loop_warning: None, + }) +} + +/// Best-effort: prefer `AXFocusedWindow`, then `AXMainWindow`. Returns a +/// retained ref the caller must release (or hand to the cache). 
+unsafe fn try_focused_window(app: AXUIElementRef) -> Option { + for key in ["AXFocusedWindow", "AXMainWindow"] { + if let Some(v) = ax_copy_attr(app, key) { + let elem = v as AXUIElementRef; + if !elem.is_null() { + return Some(elem); + } + ax_release(v); + } + } + None +} + +/// Render a Codex-style indented tree. +/// +/// Layout per node (one line): +/// +/// ```text +/// {indent}[{idx}] {label} title="…" value="…" id="…" desc="…" help="…" \ +/// url="…" frame=(x,y,wxh) {flags…} actions=[AXPress,AXShowMenu] +/// ``` +/// +/// `{label}` prefers `role_description` (humanised) over `role`+`subrole` +/// because that's what a sighted user calls the element. Numeric `idx` is +/// always shown so the model can address nodes deterministically. +fn render_tree_text(nodes: &[AxNode]) -> String { + let mut children: Vec> = vec![Vec::new(); nodes.len()]; + let mut roots: Vec = Vec::new(); + for n in nodes { + match n.parent_idx { + Some(p) => { + if let Some(slot) = children.get_mut(p as usize) { + slot.push(n.idx); + } + } + None => roots.push(n.idx), + } + } + let mut out = String::new(); + let mut stack: Vec<(u32, u32)> = roots.iter().rev().map(|&r| (r, 0u32)).collect(); + while let Some((idx, depth)) = stack.pop() { + let n = &nodes[idx as usize]; + for _ in 0..depth { + out.push_str(" "); + } + out.push_str(&format!("[{}] {}", n.idx, format_label(n))); + if let Some(t) = &n.title { + if !t.is_empty() { + out.push_str(&format!(" title={}", quote_clip(t, 120))); + } + } + if let Some(v) = &n.value { + if !v.is_empty() { + out.push_str(&format!(" value={}", quote_clip(v, 120))); + } + } + if let Some(id) = &n.identifier { + if !id.is_empty() { + out.push_str(&format!(" id={}", quote_clip(id, 80))); + } + } + if let Some(d) = &n.description { + if !d.is_empty() { + out.push_str(&format!(" desc={}", quote_clip(d, 120))); + } + } + if let Some(h) = &n.help { + if !h.is_empty() { + out.push_str(&format!(" help={}", quote_clip(h, 120))); + } + } + if let Some(u) = &n.url { + 
if !u.is_empty() { + out.push_str(&format!(" url={}", quote_clip(u, 200))); + } + } + if let Some((x, y, w, h)) = n.frame_global { + out.push_str(&format!(" frame=({:.0},{:.0},{:.0}x{:.0})", x, y, w, h)); + } + if !n.enabled { + out.push_str(" [disabled]"); + } + if n.focused { + out.push_str(" [focused]"); + } + if let Some(true) = n.selected { + out.push_str(" [selected]"); + } + match n.expanded { + Some(true) => out.push_str(" [expanded]"), + Some(false) => out.push_str(" [collapsed]"), + None => {} + } + // Surface non-trivial AX actions inline so the model can pick + // AXShowMenu / AXIncrement / AXDecrement etc. without re-querying. + let extra: Vec<&str> = n + .actions + .iter() + .map(String::as_str) + .filter(|a| !matches!(*a, "AXPress" | "AXShowAlternateUI" | "AXShowDefaultUI")) + .collect(); + if !extra.is_empty() { + out.push_str(&format!(" actions=[{}]", extra.join(","))); + } + out.push('\n'); + if let Some(kids) = children.get(idx as usize) { + for &c in kids.iter().rev() { + stack.push((c, depth + 1)); + } + } + } + out +} + +/// Compose a Codex-style label: prefer humanised role description, fall +/// back to `role + (subrole)`. +fn format_label(n: &AxNode) -> String { + if let Some(rd) = &n.role_description { + if !rd.is_empty() { + return rd.clone(); + } + } + match &n.subrole { + Some(s) if !s.is_empty() => format!("{}({})", n.role, s), + _ => n.role.clone(), + } +} + +/// Quote a value, clipping at `max` chars (counted in bytes for safety on +/// arbitrary UTF-8 — we cut on a char boundary so we never split a code +/// point). 
+fn quote_clip(s: &str, max: usize) -> String { + let trimmed: String = s.chars().take(max).collect(); + let escaped = trimmed.replace('\\', "\\\\").replace('"', "\\\""); + if s.chars().count() > max { + format!("\"{}…\"", escaped) + } else { + format!("\"{}\"", escaped) + } +} + +fn compute_digest(nodes: &[AxNode]) -> String { + let mut h = Sha1::new(); + for n in nodes { + h.update(n.idx.to_le_bytes()); + h.update(n.parent_idx.unwrap_or(u32::MAX).to_le_bytes()); + h.update(n.role.as_bytes()); + h.update(b"\x1f"); + h.update(n.subrole.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.title.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.identifier.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.description.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.help.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.value.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(n.url.as_deref().unwrap_or("").as_bytes()); + h.update(b"\x1f"); + h.update(match n.expanded { + Some(true) => b"E"[..].to_vec(), + Some(false) => b"C"[..].to_vec(), + None => Vec::new(), + }); + h.update(b"\x1f"); + for a in &n.actions { + h.update(a.as_bytes()); + h.update(b","); + } + h.update(b"\x1e"); + } + let bytes = h.finalize(); + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + s.push_str(&format!("{:02x}", b)); + } + s +} + +#[cfg(test)] +mod tests { + use super::*; + use bitfun_core::agentic::tools::computer_use_host::AxNode; + + fn n(idx: u32, parent: Option, role: &str, title: Option<&str>) -> AxNode { + AxNode { + idx, + parent_idx: parent, + role: role.to_string(), + title: title.map(str::to_string), + value: None, + description: None, + identifier: None, + enabled: true, + focused: false, + selected: None, + frame_global: None, + actions: vec![], + role_description: None, + subrole: None, + help: None, + url: None, + expanded: None, 
+ } + } + + #[test] + fn render_tree_text_indents_by_depth_and_orders_siblings() { + let nodes = vec![ + n(0, None, "AXApplication", Some("Cursor")), + n(1, Some(0), "AXWindow", Some("main")), + n(2, Some(1), "AXButton", Some("Save")), + n(3, Some(1), "AXButton", Some("Close")), + ]; + let out = render_tree_text(&nodes); + let expected = + "[0] AXApplication title=\"Cursor\"\n [1] AXWindow title=\"main\"\n [2] AXButton title=\"Save\"\n [3] AXButton title=\"Close\"\n"; + assert_eq!(out, expected); + } + + #[test] + fn render_tree_text_uses_role_description_and_inline_flags() { + let mut a = n(0, None, "AXButton", Some("Close")); + a.role_description = Some("close button".to_string()); + a.help = Some("Close window".to_string()); + a.subrole = Some("AXCloseButton".to_string()); + a.frame_global = Some((10.0, 20.0, 30.0, 30.0)); + a.actions = vec!["AXPress".into(), "AXShowMenu".into()]; + a.focused = true; + let out = render_tree_text(&[a]); + // role_description wins over role/subrole; AXPress is filtered out + // but AXShowMenu shows up as a secondary action. + assert!(out.contains("[0] close button")); + assert!(out.contains("title=\"Close\"")); + assert!(out.contains("help=\"Close window\"")); + assert!(out.contains("frame=(10,20,30x30)")); + assert!(out.contains("[focused]")); + assert!(out.contains("actions=[AXShowMenu]")); + } + + #[test] + fn quote_clip_truncates_on_char_boundary() { + let s = "中文字符测试abcdef"; + let q = quote_clip(s, 4); + assert_eq!(q, "\"中文字符…\""); + } + + #[test] + fn digest_changes_when_a_title_changes() { + let mut a = vec![n(0, None, "AXButton", Some("Save"))]; + let d1 = compute_digest(&a); + a[0].title = Some("Saved".to_string()); + let d2 = compute_digest(&a); + assert_ne!(d1, d2); + } + + /// Smoke test: dump the AX tree of *this* test process. The test process + /// usually has no AX windows of its own, so we only assert the call + /// returns *something* (possibly an empty tree) without panicking and + /// produces a stable digest. 
Marked `#[ignore]` because it requires + /// Accessibility permission for `cargo test` on macOS. + #[test] + #[ignore] + fn dump_self_pid_returns_snapshot() { + let pid = std::process::id() as i32; + let snap = dump_app_ax(pid, DumpOpts::default()).expect("dump_app_ax should succeed"); + assert!(!snap.digest.is_empty(), "digest must be non-empty"); + assert_eq!(snap.app.pid, Some(pid)); + } + + #[test] + fn digest_is_stable_for_same_input() { + let nodes = vec![ + n(0, None, "AXWindow", Some("X")), + n(1, Some(0), "AXButton", Some("Y")), + ]; + assert_eq!(compute_digest(&nodes), compute_digest(&nodes)); + } +} diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs index b6e51a0e5..47a43028c 100644 --- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -8,7 +8,7 @@ use bitfun_core::agentic::tools::computer_use_host::{ }; use bitfun_core::util::errors::{BitFunError, BitFunResult}; use core_foundation::array::{CFArray, CFArrayRef}; -use core_foundation::base::{CFTypeRef, TCFType}; +use core_foundation::base::{CFGetTypeID, CFTypeRef, TCFType}; use core_foundation::string::{CFString, CFStringRef}; use core_graphics::geometry::{CGPoint, CGSize}; use std::collections::VecDeque; @@ -37,9 +37,12 @@ unsafe extern "C" { fn AXValueGetValue(value: AXValueRef, the_type: u32, ptr: *mut c_void) -> bool; } +type CFTypeID = usize; + #[link(name = "CoreFoundation", kind = "framework")] unsafe extern "C" { fn CFRetain(cf: CFTypeRef) -> CFTypeRef; + fn CFStringGetTypeID() -> CFTypeID; } const K_AX_VALUE_CGPOINT: u32 = 1; @@ -84,10 +87,21 @@ unsafe fn ax_copy_attr(elem: AXUIElementRef, key: &str) -> Option { Some(val) } +/// Safely convert a CF object to `String`. +/// +/// **Critical**: AX attributes like `AXValue` are polymorphic — on toggles they're a +/// `CFNumber`, on tabs an `AXUIElement`, on geometric attrs an opaque `AXValueRef`. 
Wrapping +/// any of those as `CFStringRef` and calling `.to_string()` dispatches `_fastCStringContents:` +/// to the wrong class, which raises an Objective-C `NSException` that unwinds across the FFI +/// boundary — Rust then aborts with `fatal runtime error: Rust cannot catch foreign exceptions`. +/// Always type-check first. unsafe fn cfstring_to_string(cf: CFTypeRef) -> Option { if cf.is_null() { return None; } + if CFGetTypeID(cf) != CFStringGetTypeID() { + return None; + } let s = CFString::wrap_under_get_rule(cf as CFStringRef); Some(s.to_string()) } @@ -155,41 +169,57 @@ unsafe fn is_ax_enabled(elem: AXUIElementRef) -> bool { enabled } -unsafe fn read_value_desc(elem: AXUIElementRef) -> (Option, Option) { - let value = ax_copy_attr(elem, "AXValue").and_then(|v| { - let s = cfstring_to_string(v); - ax_release(v); - s - }); - let desc = ax_copy_attr(elem, "AXDescription").and_then(|v| { +/// All text-bearing AX attributes a single element exposes — read in one pass so the BFS +/// body never has to choose between "fast (3 attrs)" and "complete (5 attrs)" paths. 
+#[derive(Debug, Default, Clone)] +pub(crate) struct NodeText { + pub role: Option, + pub subrole: Option, + pub title: Option, + pub value: Option, + pub description: Option, + pub identifier: Option, + pub help: Option, +} + +unsafe fn ax_copy_string_attr(elem: AXUIElementRef, key: &str) -> Option { + ax_copy_attr(elem, key).and_then(|v| { let s = cfstring_to_string(v); ax_release(v); s - }); - (value, desc) + }) } +pub(crate) unsafe fn read_node_text(elem: AXUIElementRef) -> NodeText { + NodeText { + role: ax_copy_string_attr(elem, "AXRole"), + subrole: ax_copy_string_attr(elem, "AXSubrole"), + title: ax_copy_string_attr(elem, "AXTitle"), + value: ax_copy_string_attr(elem, "AXValue"), + description: ax_copy_string_attr(elem, "AXDescription"), + identifier: ax_copy_string_attr(elem, "AXIdentifier"), + help: ax_copy_string_attr(elem, "AXHelp"), + } +} + +/// Legacy three-field shim used by `enumerate_ui_tree_text` and parent-context helpers; see +/// [`read_node_text`] for the full reader. unsafe fn read_role_title_id( elem: AXUIElementRef, ) -> (Option, Option, Option) { - let role = ax_copy_attr(elem, "AXRole").and_then(|v| { - let s = cfstring_to_string(v); - ax_release(v); - s - }); - let title = ax_copy_attr(elem, "AXTitle").and_then(|v| { - let s = cfstring_to_string(v); - ax_release(v); - s - }); - let ident = ax_copy_attr(elem, "AXIdentifier").and_then(|v| { - let s = cfstring_to_string(v); - ax_release(v); - s - }); + let role = ax_copy_string_attr(elem, "AXRole"); + let title = ax_copy_string_attr(elem, "AXTitle"); + let ident = ax_copy_string_attr(elem, "AXIdentifier"); (role, title, ident) } +/// Legacy two-field reader used by `enumerate_ui_tree_text`. Prefer [`read_node_text`]. +unsafe fn read_value_desc(elem: AXUIElementRef) -> (Option, Option) { + let value = ax_copy_string_attr(elem, "AXValue"); + let desc = ax_copy_string_attr(elem, "AXDescription"); + (value, desc) +} + /// Global center and axis-aligned bounds from `AXPosition` + `AXSize`. 
unsafe fn element_frame_global(elem: AXUIElementRef) -> Option<(f64, f64, f64, f64, f64, f64)> { let pos = ax_copy_attr(elem, "AXPosition")?; @@ -224,17 +254,24 @@ struct CandidateMatch { bounds_width: f64, bounds_height: f64, role: String, + subrole: Option, title: Option, + value: Option, + description: Option, + help: Option, identifier: Option, parent_desc: Option, depth: u32, /// Whether AXHidden is explicitly false / absent (visible). is_visible: bool, + /// Retained pointer to the matched AX node, used by climb-up to walk to a clickable ancestor. + /// Released by [`release_candidate_refs`] once ranking is done. + ax_ref: AXUIElementRef, } impl CandidateMatch { /// Higher = better. Prefer visible, reasonably-sized, shallower, on-screen elements. - fn rank_score(&self) -> i64 { + fn rank_score(&self, query: &UiElementLocateQuery) -> i64 { let mut score: i64 = 0; // Visibility is critical @@ -295,16 +332,97 @@ impl CandidateMatch { score += ((self.gy / 8.0) as i64).clamp(0, 400); } + // ── Batch 4: actionable role bias ──────────────────────────────────────────────── + // Strongly prefer truly clickable / interactive roles over pure containers. This + // is what fixes the "matched the AXStaticText inside the card, not the card + // button itself" case (the climb-up step then promotes any remaining static-text + // match to its clickable ancestor). 
+ const ACTIONABLE_ROLES: &[&str] = &[ + "AXButton", + "AXMenuItem", + "AXMenuButton", + "AXLink", + "AXCheckBox", + "AXRadioButton", + "AXTextField", + "AXTextArea", + "AXSearchField", + "AXCell", + "AXRow", + "AXTab", + "AXPopUpButton", + "AXDisclosureTriangle", + ]; + if ACTIONABLE_ROLES.contains(&self.role.as_str()) { + score += 300; + } + const CONTAINER_ROLES: &[&str] = &[ + "AXGroup", + "AXSplitter", + "AXSplitGroup", + "AXScrollArea", + "AXLayoutArea", + "AXLayoutItem", + "AXUnknown", + "AXGenericElement", + ]; + if CONTAINER_ROLES.contains(&self.role.as_str()) { + score -= 200; + } + + // ── Batch 4: text-quality bias ─────────────────────────────────────────────────── + // When the caller used `text_contains`, exact (case-insensitive) whole-string + // matches against any text-bearing field beat substring-only matches. This is + // what lets "五子棋" prefer the card title over a paragraph that *contains* + // "五子棋" in body copy. + if let Some(ref needle) = query.text_contains { + let n = needle.trim().to_lowercase(); + if !n.is_empty() { + let fields: [&Option; 4] = + [&self.title, &self.value, &self.description, &self.help]; + let mut exact = false; + let mut substring = false; + for f in fields { + if let Some(s) = f { + let sl = s.trim().to_lowercase(); + if sl == n { + exact = true; + break; + } + if sl.contains(&n) { + substring = true; + } + } + } + if exact { + score += 150; + } else if substring { + score += 50; + } + } + } + score } fn short_description(&self) -> String { let title_str = self.title.as_deref().unwrap_or(""); let parent_str = self.parent_desc.as_deref().unwrap_or("?"); + let mut extras = String::new(); + if let Some(v) = self.value.as_deref().filter(|s| !s.is_empty()) { + extras.push_str(&format!(" value={:?}", v)); + } + if let Some(d) = self.description.as_deref().filter(|s| !s.is_empty()) { + extras.push_str(&format!(" desc={:?}", d)); + } + if let Some(sr) = self.subrole.as_deref().filter(|s| !s.is_empty()) { + 
extras.push_str(&format!(" subrole={}", sr)); + } format!( - "role={} title={:?} at ({:.0},{:.0}) size={:.0}x{:.0} parent=[{}]", + "role={} title={:?}{} at ({:.0},{:.0}) size={:.0}x{:.0} parent=[{}]", self.role, title_str, + extras, self.gx, self.gy, self.bounds_width, @@ -314,6 +432,76 @@ impl CandidateMatch { } } +/// Release any retained AX refs held by candidate matches (call exactly once after ranking). +fn release_candidate_refs(candidates: &mut [CandidateMatch]) { + for c in candidates.iter_mut() { + if !c.ax_ref.is_null() { + unsafe { ax_release(c.ax_ref as CFTypeRef) }; + c.ax_ref = std::ptr::null(); + } + } +} + +/// Roles that are clickable/actionable enough to be a click target. Used by climb-up. +fn is_clickable_role(role: &str) -> bool { + matches!( + role, + "AXButton" + | "AXMenuItem" + | "AXMenuButton" + | "AXLink" + | "AXCheckBox" + | "AXRadioButton" + | "AXCell" + | "AXRow" + | "AXTab" + | "AXPopUpButton" + | "AXDisclosureTriangle" + ) +} + +/// Walk up `AXParent` from `start` (retained) up to `max_steps`, returning the first ancestor +/// whose role is "clickable" (button-like / cell). Returns the retained ancestor on success. +unsafe fn climb_to_clickable_ancestor( + start: AXUIElementRef, + max_steps: u32, +) -> Option<(AXUIElementRef, NodeText, (f64, f64, f64, f64, f64, f64))> { + let mut cur = start; + let mut owns_cur = false; + for _ in 0..max_steps { + let parent_val = ax_copy_attr(cur, "AXParent"); + if owns_cur { + ax_release(cur as CFTypeRef); + } + let Some(parent_val) = parent_val else { + return None; + }; + let parent = parent_val as AXUIElementRef; + if parent.is_null() { + ax_release(parent_val); + return None; + } + // We now own `parent_val`; treat it as our retained ref. 
+ cur = parent; + owns_cur = true; + + let nt = read_node_text(cur); + if let Some(role) = nt.role.as_deref() { + if is_clickable_role(role) { + if let Some(frame) = element_frame_global(cur) { + if frame.4 > 0.0 && frame.5 > 0.0 { + return Some((cur, nt, frame)); + } + } + } + } + } + if owns_cur { + ax_release(cur as CFTypeRef); + } + None +} + /// Check if an AX element has `AXHidden` set to true. unsafe fn is_ax_hidden(elem: AXUIElementRef) -> bool { let Some(val) = ax_copy_attr(elem, "AXHidden") else { @@ -342,6 +530,60 @@ pub fn locate_ui_element_center( query: &UiElementLocateQuery, ) -> BitFunResult { ui_locate_common::validate_query(query)?; + + // ── Batch 5: node_idx fast path ────────────────────────────────────────── + // If the caller already grabbed an `app_state` snapshot, they can pass the + // exact `node_idx` of the element they want. We resolve it via the per-pid + // cache and skip BFS entirely. `app_state_digest` (when supplied) guards + // against stale snapshots; without it we fall back to a loose lookup. + if let Some(idx) = query.node_idx { + let pid = frontmost_pid()?; + let cached = match query.app_state_digest.as_deref() { + Some(digest) => crate::computer_use::macos_ax_dump::cached_ref(pid, Some(digest), idx), + None => crate::computer_use::macos_ax_dump::cached_ref_loose(pid, idx), + }; + let ax = match cached { + Some(r) => r, + None => { + return Err(BitFunError::tool(format!( + "[AX_IDX_STALE] node_idx={} no longer present in cached app state for pid={}. \ + Re-call `desktop.get_app_state` and reuse the freshly returned idx.", + idx, pid + ))); + } + }; + let nt = unsafe { read_node_text(ax.0) }; + let frame = unsafe { element_frame_global(ax.0) }.ok_or_else(|| { + BitFunError::tool(format!( + "[AX_IDX_STALE] node_idx={} resolved but has no AXFrame (off-screen / minimised). 
\ + Re-call `desktop.get_app_state`.", + idx + )) + })?; + let parent_context = Some(format!( + "node_idx={} role={} title={:?}", + idx, + nt.role.as_deref().unwrap_or(""), + nt.title.as_deref().unwrap_or(""), + )); + return ui_locate_common::ok_result_with_context_full( + frame.0, + frame.1, + frame.2, + frame.3, + frame.4, + frame.5, + nt.role.unwrap_or_default(), + nt.title, + nt.identifier, + parent_context, + 1, + Vec::new(), + Some(idx), + Some("node_idx".to_string()), + ); + } + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); let pid = frontmost_pid()?; let root = unsafe { AXUIElementCreateApplication(pid) }; @@ -381,15 +623,26 @@ pub fn locate_ui_element_center( break; } - let (role_s, title_s, id_s) = unsafe { read_role_title_id(cur.ax) }; - let role_ref = role_s.as_deref(); - let title_ref = title_s.as_deref(); - let id_ref = id_s.as_deref(); + let nt = unsafe { read_node_text(cur.ax) }; + let attrs = ui_locate_common::NodeAttrs { + role: nt.role.as_deref(), + subrole: nt.subrole.as_deref(), + title: nt.title.as_deref(), + value: nt.value.as_deref(), + description: nt.description.as_deref(), + identifier: nt.identifier.as_deref(), + help: nt.help.as_deref(), + }; - let matched = ui_locate_common::matches_filters(query, role_ref, title_ref, id_ref); + let matched = ui_locate_common::matches_filters_attrs(query, &attrs); + let mut consumed_ref = false; if matched { if let Some((gx, gy, bl, bt, bw, bh)) = unsafe { element_frame_global(cur.ax) } { let is_visible = !unsafe { is_ax_hidden(cur.ax) }; + // Retain a fresh ref for the candidate so the climb-up step can walk parents + // even after we've released our BFS-owned ref below. 
+ let retained = unsafe { CFRetain(cur.ax as CFTypeRef) as AXUIElementRef }; + consumed_ref = !retained.is_null(); candidates.push(CandidateMatch { gx, gy, @@ -397,12 +650,21 @@ pub fn locate_ui_element_center( bounds_top: bt, bounds_width: bw, bounds_height: bh, - role: role_s.clone().unwrap_or_default(), - title: title_s.clone(), - identifier: id_s.clone(), + role: nt.role.clone().unwrap_or_default(), + subrole: nt.subrole.clone(), + title: nt.title.clone(), + value: nt.value.clone(), + description: nt.description.clone(), + help: nt.help.clone(), + identifier: nt.identifier.clone(), parent_desc: cur.parent_desc.clone(), depth: cur.depth, is_visible, + ax_ref: if consumed_ref { + retained + } else { + std::ptr::null() + }, }); // Stop collecting after MAX_CANDIDATES to avoid excessive work if candidates.len() >= MAX_CANDIDATES { @@ -418,9 +680,10 @@ pub fn locate_ui_element_center( } } } + let _ = consumed_ref; // Build description for this node to pass as parent context to children - let this_desc = element_short_desc(role_ref, title_ref); + let this_desc = element_short_desc(nt.role.as_deref(), nt.title.as_deref()); let children_ref = unsafe { ax_copy_attr(cur.ax, "AXChildren") }; let next_depth = cur.depth + 1; @@ -463,8 +726,8 @@ pub fn locate_ui_element_center( // Sort by rank score (descending); tie-break text fields toward **lower on screen** (chat input). candidates.sort_by(|a, b| { - let sa = a.rank_score(); - let sb = b.rank_score(); + let sa = a.rank_score(query); + let sb = b.rank_score(query); match sb.cmp(&sa) { std::cmp::Ordering::Equal => { let a_txt = a.role.contains("TextField") || a.role.contains("TextArea"); @@ -480,17 +743,91 @@ pub fn locate_ui_element_center( }); let total = candidates.len() as u32; - let best = &candidates[0]; + + // Pull best out so we can mutate it (climb-up replaces frame in-place). 
+ let mut best = candidates.remove(0); + + // ── Batch 4: climb-up from AXStaticText to clickable ancestor ──────────────────────── + // If the highest-ranked match is a static-text leaf inside a button/cell, the user + // almost certainly wants to click the wrapping container (e.g. the "五子棋" card), + // not the text glyph. Walk parents up to 6 hops looking for a clickable role. + let mut climbed_from: Option = None; + let area = best.bounds_width * best.bounds_height; + if best.role == "AXStaticText" && area > 0.0 && area < 1500.0 && !best.ax_ref.is_null() { + let original_text = best + .title + .clone() + .or_else(|| best.value.clone()) + .or_else(|| best.description.clone()) + .unwrap_or_else(|| "".to_string()); + // Take the candidate's retained ref; climb_to_clickable_ancestor consumes it. + let leaf_ref = best.ax_ref; + best.ax_ref = std::ptr::null(); + if let Some((ancestor_ref, ancestor_nt, ancestor_frame)) = + unsafe { climb_to_clickable_ancestor(leaf_ref, 6) } + { + best.gx = ancestor_frame.0; + best.gy = ancestor_frame.1; + best.bounds_left = ancestor_frame.2; + best.bounds_top = ancestor_frame.3; + best.bounds_width = ancestor_frame.4; + best.bounds_height = ancestor_frame.5; + best.role = ancestor_nt.role.clone().unwrap_or_default(); + best.subrole = ancestor_nt.subrole.clone(); + // Preserve the matched text in `title` slot for visibility, but record where it came from. + if best.title.is_none() { + best.title = ancestor_nt.title.clone(); + } + best.identifier = ancestor_nt.identifier.clone().or(best.identifier.clone()); + climbed_from = Some(original_text); + unsafe { ax_release(ancestor_ref as CFTypeRef) }; + } else { + // Climb failed — leaf stays as the result; release nothing extra (leaf_ref already consumed). 
+ } + } // Build "other matches" summaries for the model to see alternatives let other_matches: Vec = candidates .iter() - .skip(1) .take(4) .map(|c| c.short_description()) .collect(); - ui_locate_common::ok_result_with_context( + // Choose `matched_via` based on which filter actually contributed to the win. + let matched_via = if query.text_contains.is_some() { + Some("text_contains".to_string()) + } else if query.title_contains.is_some() { + Some("title_contains".to_string()) + } else if query.role_substring.is_some() { + Some("role_substring".to_string()) + } else if query.identifier_contains.is_some() { + Some("identifier_contains".to_string()) + } else { + None + }; + let matched_via = match (matched_via, climbed_from.as_ref()) { + (Some(v), Some(_)) => Some(format!("climbed:{}", v)), + (Some(v), None) => Some(v), + (None, Some(_)) => Some("climbed".to_string()), + (None, None) => None, + }; + let parent_context = match climbed_from { + Some(text) => Some(format!( + "{} (climbed from AXStaticText {:?})", + best.parent_desc.as_deref().unwrap_or("?"), + text, + )), + None => best.parent_desc.clone(), + }; + + // Release the best candidate's retained ref (if any) and any remaining candidate refs. + if !best.ax_ref.is_null() { + unsafe { ax_release(best.ax_ref as CFTypeRef) }; + best.ax_ref = std::ptr::null(); + } + release_candidate_refs(&mut candidates); + + ui_locate_common::ok_result_with_context_full( best.gx, best.gy, best.bounds_left, @@ -500,9 +837,11 @@ pub fn locate_ui_element_center( best.role.clone(), best.title.clone(), best.identifier.clone(), - best.parent_desc.clone(), + parent_context, total, other_matches, + None, + matched_via, ) } @@ -758,10 +1097,15 @@ pub fn accessibility_hit_at_global_point(gx: f64, gy: f64) -> Option BitFunResult<(i32, i32, u32, u32)> { let pid = frontmost_pid()?; + window_bounds_global_for_pid(pid) +} + +/// Bounds of the selected app's focused or main window in global screen coordinates. 
+pub fn window_bounds_global_for_pid(pid: i32) -> BitFunResult<(i32, i32, u32, u32)> { let app = unsafe { AXUIElementCreateApplication(pid) }; if app.is_null() { return Err(BitFunError::tool( - "AXUIElementCreateApplication returned null for OCR window bounds.".to_string(), + "AXUIElementCreateApplication returned null for window bounds.".to_string(), )); } unsafe { @@ -769,19 +1113,19 @@ pub fn frontmost_window_bounds_global() -> BitFunResult<(i32, i32, u32, u32)> { ax_release(app as CFTypeRef); let Some(win) = win else { return Err(BitFunError::tool( - "No AX window for foreground app (try AXFocusedWindow / AXMainWindow / AXWindows)." + "No AX window for target app (try AXFocusedWindow / AXMainWindow / AXWindows)." .to_string(), )); }; let frame = element_frame_global(win).ok_or_else(|| { ax_release(win as CFTypeRef); - BitFunError::tool("Could not read AXPosition/AXSize for foreground window.".to_string()) + BitFunError::tool("Could not read AXPosition/AXSize for target window.".to_string()) })?; ax_release(win as CFTypeRef); let (_, _, bl, bt, bw, bh) = frame; if bw < 1.0 || bh < 1.0 { return Err(BitFunError::tool( - "Foreground window has invalid size for OCR.".to_string(), + "Target window has invalid size for screenshot.".to_string(), )); } let x0 = bl.floor() as i32; diff --git a/src/apps/desktop/src/computer_use/macos_ax_write.rs b/src/apps/desktop/src/computer_use/macos_ax_write.rs new file mode 100644 index 000000000..f27ed3416 --- /dev/null +++ b/src/apps/desktop/src/computer_use/macos_ax_write.rs @@ -0,0 +1,126 @@ +//! AX-first writers: prefer `AXUIElementPerformAction` / +//! `AXUIElementSetAttributeValue` over synthetic `CGEvent` injection. +//! +//! The dispatch layer's contract: +//! 1. Resolve `(pid, idx)` to a live `AxRef` via `macos_ax_dump::cached_ref`. +//! 2. Try the AX path here. On success: zero foreground impact, no event +//! taps fired, accessibility services see a real semantic action. +//! 3. 
On failure (`Err(AxWriteUnavailable)`): the dispatch layer falls back +//! to `macos_bg_input` (background `CGEvent` injection to the pid). +//! +//! This mirrors Codex: AX-first for correctness + speed, event-fallback for +//! pathological apps that refuse `AXPress` / `AXSetValue`. + +#![allow(dead_code)] + +use crate::computer_use::macos_ax_dump::AxRef; +use core_foundation::base::{CFTypeRef, TCFType}; +use core_foundation::string::{CFString, CFStringRef}; + +type AXUIElementRef = *const std::ffi::c_void; + +#[link(name = "ApplicationServices", kind = "framework")] +unsafe extern "C" { + fn AXUIElementPerformAction(element: AXUIElementRef, action: CFStringRef) -> i32; + fn AXUIElementSetAttributeValue( + element: AXUIElementRef, + attribute: CFStringRef, + value: CFTypeRef, + ) -> i32; +} + +/// Result of an AX-first attempt. +#[derive(Debug)] +pub enum AxWriteOutcome { + /// The AX call succeeded — no fallback needed. + Ok, + /// AX rejected the call (status non-zero or unsupported). Caller should + /// fall through to event injection. + Unavailable(i32), +} + +/// Try to "click" via AXPress. Most controls (NSButton, links, menu items) +/// implement this; many text fields and webviews do not. +pub fn try_ax_press(target: AxRef) -> AxWriteOutcome { + if target.0.is_null() { + return AxWriteOutcome::Unavailable(-1); + } + let action = CFString::new("AXPress"); + let st = unsafe { AXUIElementPerformAction(target.0, action.as_concrete_TypeRef()) }; + if st == 0 { + AxWriteOutcome::Ok + } else { + AxWriteOutcome::Unavailable(st) + } +} + +/// Try to set the AXValue of a text field. `value` is sent as a CFString. +/// Caller is responsible for any subsequent focus / commit (Tab, Return). 
+pub fn try_ax_set_value(target: AxRef, value: &str) -> AxWriteOutcome { + if target.0.is_null() { + return AxWriteOutcome::Unavailable(-1); + } + let attr = CFString::new("AXValue"); + let v = CFString::new(value); + let st = unsafe { + AXUIElementSetAttributeValue( + target.0, + attr.as_concrete_TypeRef(), + v.as_concrete_TypeRef() as CFTypeRef, + ) + }; + if st == 0 { + AxWriteOutcome::Ok + } else { + AxWriteOutcome::Unavailable(st) + } +} + +/// Try a generic AX action by name (e.g. `"AXShowMenu"`, `"AXIncrement"`). +pub fn try_ax_action(target: AxRef, action_name: &str) -> AxWriteOutcome { + if target.0.is_null() { + return AxWriteOutcome::Unavailable(-1); + } + let a = CFString::new(action_name); + let st = unsafe { AXUIElementPerformAction(target.0, a.as_concrete_TypeRef()) }; + if st == 0 { + AxWriteOutcome::Ok + } else { + AxWriteOutcome::Unavailable(st) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Null AX refs must short-circuit to `Unavailable(-1)` so the dispatch + /// layer falls back to event injection instead of dereferencing a null + /// pointer in the AX framework. 
+ #[test] + fn null_ref_press_returns_unavailable() { + let r = AxRef(std::ptr::null()); + match try_ax_press(r) { + AxWriteOutcome::Unavailable(-1) => {} + other => panic!("expected Unavailable(-1), got {:?}", other), + } + } + + #[test] + fn null_ref_set_value_returns_unavailable() { + let r = AxRef(std::ptr::null()); + match try_ax_set_value(r, "hello") { + AxWriteOutcome::Unavailable(-1) => {} + other => panic!("expected Unavailable(-1), got {:?}", other), + } + } + + #[test] + fn null_ref_action_returns_unavailable() { + let r = AxRef(std::ptr::null()); + match try_ax_action(r, "AXShowMenu") { + AxWriteOutcome::Unavailable(-1) => {} + other => panic!("expected Unavailable(-1), got {:?}", other), + } + } +} diff --git a/src/apps/desktop/src/computer_use/macos_bg_input.rs b/src/apps/desktop/src/computer_use/macos_bg_input.rs new file mode 100644 index 000000000..054d04d78 --- /dev/null +++ b/src/apps/desktop/src/computer_use/macos_bg_input.rs @@ -0,0 +1,642 @@ +//! Codex-style background input injection for macOS. +//! +//! Wraps `CGEventCreate*` + `CGEventSourceStateID::Private` + +//! `CGEventPostToPid` so we can drive a *specific* application without +//! * moving the user's mouse cursor, +//! * stealing the user's keyboard focus, +//! * or polluting the global HID event stream with our synthesized +//! modifier presses (the `Private` source is decoupled from the user's +//! real keyboard latch state). +//! +//! Used by the AX-first dispatch path in ControlHub: when an `app_*` action +//! cannot be satisfied by `AXUIElementPerformAction` alone (e.g. scroll, +//! free-form typing, complex chords) we fall back to PID-targeted events +//! from this module instead of the global foreground click path. +//! +//! Wired up by the next todos (`macos-ax-write` + `controlhub-actions`); +//! kept as standalone helpers here so it can be unit-tested and audited +//! independently of the dispatch glue. 
+ +#![allow(dead_code)] + +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use core_graphics::event::{CGEvent, CGEventFlags, CGEventType, CGMouseButton, ScrollEventUnit}; +use core_graphics::event_source::{CGEventSource, CGEventSourceStateID}; +use core_graphics::geometry::CGPoint; +use std::thread; +use std::time::{Duration, Instant}; +use log::{debug, info, warn}; + +/// Logical mouse button for `bg_click`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BgMouseButton { + Left, + Right, + Middle, +} + +impl BgMouseButton { + fn cg(self) -> CGMouseButton { + match self { + Self::Left => CGMouseButton::Left, + Self::Right => CGMouseButton::Right, + Self::Middle => CGMouseButton::Center, + } + } + fn down(self) -> CGEventType { + match self { + Self::Left => CGEventType::LeftMouseDown, + Self::Right => CGEventType::RightMouseDown, + Self::Middle => CGEventType::OtherMouseDown, + } + } + fn up(self) -> CGEventType { + match self { + Self::Left => CGEventType::LeftMouseUp, + Self::Right => CGEventType::RightMouseUp, + Self::Middle => CGEventType::OtherMouseUp, + } + } +} + +/// Modifier keys understood by `bg_key_chord` / mouse modifiers. +/// +/// Maps to the 4 standard macOS modifier flag bits. We deliberately do not +/// touch `CapsLock` here. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BgModifier { + Command, + Shift, + Option, // alias: alt + Control, +} + +impl BgModifier { + pub fn from_str(s: &str) -> Option { + match s.to_ascii_lowercase().as_str() { + "cmd" | "command" | "meta" | "super" => Some(Self::Command), + "shift" => Some(Self::Shift), + "alt" | "option" | "opt" => Some(Self::Option), + "ctrl" | "control" => Some(Self::Control), + _ => None, + } + } + fn flag(self) -> CGEventFlags { + match self { + Self::Command => CGEventFlags::CGEventFlagCommand, + Self::Shift => CGEventFlags::CGEventFlagShift, + Self::Option => CGEventFlags::CGEventFlagAlternate, + Self::Control => CGEventFlags::CGEventFlagControl, + } + } + fn keycode(self) -> u16 { + match self { + Self::Command => 55, + Self::Shift => 56, + Self::Option => 58, + Self::Control => 59, + } + } +} + +/// Whether this host can deliver background input to arbitrary pids. +/// +/// Both `CGEventSourceStateID::Private` and `CGEventPostToPid` require the +/// macOS Accessibility privilege to be granted to the *host* process; if it +/// is not, the calls are silently dropped by the kernel. Callers should +/// surface `BACKGROUND_INPUT_UNAVAILABLE` upstream when this returns +/// `false`. +/// +/// Result is cached after the first successful probe so we don't pay the +/// `CGEventSource` create + `CGEventPostToPid` round-trip on every call. +/// A `false` result is NOT cached so callers can re-probe after the user +/// grants Accessibility permission without restarting the host. +pub fn supports_background_input() -> bool { + use std::sync::atomic::{AtomicBool, Ordering}; + static CACHED_OK: AtomicBool = AtomicBool::new(false); + if CACHED_OK.load(Ordering::Relaxed) { + return true; + } + if !accessibility_is_trusted() { + return false; + } + // Real Codex-style probe: build a private source and post a no-op scroll + // to *our own* pid. 
Posting to self never disturbs the user's foreground + // app or real cursor, but it round-trips through the same kernel path + // that would deliver to a third-party pid. + let probe_ok = (|| -> bool { + let src = match CGEventSource::new(CGEventSourceStateID::Private) { + Ok(s) => s, + Err(_) => return false, + }; + let ev = match CGEvent::new_scroll_event(src, ScrollEventUnit::PIXEL, 2, 0, 0, 0) { + Ok(e) => e, + Err(_) => return false, + }; + let me = std::process::id() as i32; + ev.post_to_pid(me); + true + })(); + if probe_ok { + CACHED_OK.store(true, Ordering::Relaxed); + } + probe_ok +} + +/// Best-effort check for "host has been granted Accessibility access". +/// We re-implement it locally rather than depending on the +/// `permissions::accessibility` module so this file stays unit-testable +/// outside the broader desktop app. +fn accessibility_is_trusted() -> bool { + // Re-declared with the same loosely-typed signature used elsewhere in + // this crate (`desktop_host.rs`) to avoid a clashing-extern warning. + unsafe extern "C" { + fn AXIsProcessTrustedWithOptions(options: *const std::ffi::c_void) -> bool; + } + // We pass NULL options so we never auto-prompt the user — explicit + // permission-prompting lives in the existing `permissions` module. + unsafe { AXIsProcessTrustedWithOptions(std::ptr::null()) } +} + +fn private_source(label: &str) -> BitFunResult { + CGEventSource::new(CGEventSourceStateID::Private) + .map_err(|_| BitFunError::tool(format!("CGEventSource::Private failed ({})", label))) +} + +/// Compose modifier flags for a chord. +fn flags_from(mods: &[BgModifier]) -> CGEventFlags { + mods.iter() + .fold(CGEventFlags::CGEventFlagNull, |acc, m| acc | m.flag()) +} + +/// Send a click (down + up, possibly multi-click) at the given **global** +/// pointer position to the target pid. 
The user's real cursor is NOT moved +/// because we never call `CGWarpMouseCursorPosition` and the synthesized +/// event's `MouseMoved` predecessor is also pid-scoped. +/// +/// `point` is in Quartz global pointer coordinates (origin top-left of main +/// display, same space as the existing screenshot pipeline). +pub fn bg_click( + pid: i32, + point: (f64, f64), + button: BgMouseButton, + click_count: u32, + modifiers: &[BgModifier], +) -> BitFunResult<()> { + if click_count == 0 { + return Ok(()); + } + let pt = CGPoint { + x: point.0, + y: point.1, + }; + let flags = flags_from(modifiers); + let self_pid = std::process::id() as i32; + let frontmost = frontmost_pid_macos(); + let started = Instant::now(); + info!( + target: "computer_use::bg_input", + "bg_click.enter pid={} self_pid={} same_process={} frontmost_pid={:?} is_frontmost={} x={:.2} y={:.2} button={:?} click_count={} modifiers={:?}", + pid, + self_pid, + pid == self_pid, + frontmost, + Some(pid) == frontmost, + point.0, + point.1, + button, + click_count, + modifiers + ); + // Codex parity: a *single* `CGEventSource` is shared across the whole + // gesture so the kernel-side modifier latch state stays consistent + // between MouseMoved / Down / Up. Allocating a fresh source per event + // (the previous shape) caused some Cocoa apps (notably Chromium-based + // webviews and SwiftUI text fields) to drop modifier flags between the + // down and up events and either select text or miss the chord entirely. + let src = match private_source("click") { + Ok(s) => s, + Err(e) => { + warn!(target: "computer_use::bg_input", "bg_click.private_source_failed pid={} error={}", pid, e); + return Err(e); + } + }; + + // Pre-position the synthetic pointer inside the app's event queue so AX + // hit-testing in the target app sees the right coordinates. Does NOT + // move the user's real cursor because we post pid-scoped, not global. 
+ let mv = CGEvent::new_mouse_event(src.clone(), CGEventType::MouseMoved, pt, button.cg()) + .map_err(|_| BitFunError::tool("CGEvent MouseMoved failed".to_string()))?; + if !flags.is_empty() { + mv.set_flags(flags); + } + mv.post_to_pid(pid); + + for i in 1..=click_count { + let down = CGEvent::new_mouse_event(src.clone(), button.down(), pt, button.cg()) + .map_err(|_| BitFunError::tool("CGEvent MouseDown failed".to_string()))?; + // Click count field lets the target app recognise double / triple + // clicks within its own quench-time window. + down.set_integer_value_field( + core_graphics::event::EventField::MOUSE_EVENT_CLICK_STATE, + i as i64, + ); + if !flags.is_empty() { + down.set_flags(flags); + } + down.post_to_pid(pid); + + let up = CGEvent::new_mouse_event(src.clone(), button.up(), pt, button.cg()) + .map_err(|_| BitFunError::tool("CGEvent MouseUp failed".to_string()))?; + up.set_integer_value_field( + core_graphics::event::EventField::MOUSE_EVENT_CLICK_STATE, + i as i64, + ); + if !flags.is_empty() { + up.set_flags(flags); + } + up.post_to_pid(pid); + } + info!( + target: "computer_use::bg_input", + "bg_click.posted pid={} elapsed_ms={}", + pid, + started.elapsed().as_millis() as u64 + ); + Ok(()) +} + +/// Best-effort lookup of the macOS frontmost-application pid via NSWorkspace. +/// Returns `None` when the AppKit lookup is not available (e.g. headless tests +/// or non-main-thread contexts where we don't want to assert). 
+fn frontmost_pid_macos() -> Option { + use objc2::msg_send; + use objc2::runtime::AnyObject; + unsafe { + let cls = objc2::runtime::AnyClass::get(c"NSWorkspace")?; + let ws: *mut AnyObject = msg_send![cls, sharedWorkspace]; + if ws.is_null() { + return None; + } + let app: *mut AnyObject = msg_send![ws, frontmostApplication]; + if app.is_null() { + return None; + } + let pid: i32 = msg_send![app, processIdentifier]; + if pid <= 0 { None } else { Some(pid) } + } +} + +/// Best-effort: bring `pid`'s app to the foreground so that GUI hit-testing +/// (especially WKWebView event delivery) reliably routes synthetic clicks +/// to the right window. Uses the public NSRunningApplication API. +/// +/// Returns `Ok(true)` when the activation call returned success, `Ok(false)` +/// when the app could not be found, and `Err(_)` on AppKit FFI failures. +pub fn activate_pid_macos(pid: i32) -> BitFunResult { + use objc2::msg_send; + use objc2::runtime::AnyObject; + let started = Instant::now(); + let result: bool = unsafe { + let cls = match objc2::runtime::AnyClass::get(c"NSRunningApplication") { + Some(c) => c, + None => { + debug!(target: "computer_use::bg_input", "activate.class_missing pid={}", pid); + return Ok(false); + } + }; + let app: *mut AnyObject = msg_send![cls, runningApplicationWithProcessIdentifier: pid]; + if app.is_null() { + debug!(target: "computer_use::bg_input", "activate.app_not_found pid={}", pid); + return Ok(false); + } + // 1<<1 == NSApplicationActivateIgnoringOtherApps + let ok: bool = msg_send![app, activateWithOptions: 1u64 << 1]; + ok + }; + info!( + target: "computer_use::bg_input", + "activate.done pid={} ok={} elapsed_ms={}", + pid, + result, + started.elapsed().as_millis() as u64 + ); + Ok(result) +} + +/// Pixel-delta scroll inside the focused scroll container of the target +/// pid's frontmost window. Positive `dy` scrolls content down (matches +/// trackpad / `wheel1>0` direction). 
+pub fn bg_scroll(pid: i32, dx: i32, dy: i32) -> BitFunResult<()> { + info!( + target: "computer_use::bg_input", + "bg_scroll.enter pid={} dx={} dy={}", + pid, dx, dy + ); + let src = private_source("scroll")?; + // Two-axis pixel scroll (`wheelCount = 2`): wheel1 = dy, wheel2 = dx. + // Sign convention matches the system trackpad (positive dy = content + // moves down on screen, i.e. user is looking further into the document). + let ev = CGEvent::new_scroll_event(src, ScrollEventUnit::PIXEL, 2, dy, dx, 0) + .map_err(|_| BitFunError::tool("CGEventCreateScrollWheelEvent2 failed".to_string()))?; + ev.post_to_pid(pid); + Ok(()) +} + +/// Type a UTF-8 string into the focused control of the target pid using the +/// `kCGEventKeyboardEventUnicodeString` field. This bypasses keymap +/// translation entirely, so it correctly handles emoji, CJK and other +/// non-Latin input without touching the system IME. +pub fn bg_type_text(pid: i32, text: &str) -> BitFunResult<()> { + if text.is_empty() { + return Ok(()); + } + info!( + target: "computer_use::bg_input", + "bg_type_text.enter pid={} char_count={} byte_count={}", + pid, + text.chars().count(), + text.len() + ); + // Single source for the whole string (Codex parity): keeps the kernel + // keyboard state coherent and avoids the per-char allocation cost. + let src = private_source("type_text")?; + // We send one event per Unicode scalar to keep individual events small + // and let the target app receive a sane stream of `keyDown` callbacks. + // (`set_string` itself will accept a longer buffer, but some Cocoa text + // controls truncate at ~20 UTF-16 units per event.) + for ch in text.chars() { + // Keycode 0 is irrelevant when the unicode string field is set. 
+ let ev = CGEvent::new_keyboard_event(src.clone(), 0, true) + .map_err(|_| BitFunError::tool("CGEventCreateKeyboardEvent failed".to_string()))?; + let buf: Vec = ch.encode_utf16(&mut [0u16; 2]).to_vec(); + ev.set_string_from_utf16_unchecked(&buf); + ev.post_to_pid(pid); + // Match keyup so the target app sees a complete keystroke. + let ev2 = CGEvent::new_keyboard_event(src.clone(), 0, false) + .map_err(|_| BitFunError::tool("CGEventCreateKeyboardEvent (up) failed".to_string()))?; + ev2.set_string_from_utf16_unchecked(&buf); + ev2.post_to_pid(pid); + // 8ms inter-key gap matches Codex / native typing rates and avoids + // dropped chars in Chromium webviews and SwiftUI multi-line fields + // that throttle their keystroke handler. 1ms (the previous value) + // was reliably losing ~5–10% of CJK glyphs in informal smoke tests. + thread::sleep(Duration::from_millis(8)); + } + Ok(()) +} + +/// Send a key chord (modifier+key combo) to the target pid using the +/// private event source. `key` is the AX / Carbon virtual keycode; callers +/// can use `keycode_for_char` for ASCII letters or pass a literal keycode. +pub fn bg_key_chord(pid: i32, modifiers: &[BgModifier], key: u16) -> BitFunResult<()> { + info!( + target: "computer_use::bg_input", + "bg_key_chord.enter pid={} keycode={} modifiers={:?}", + pid, key, modifiers + ); + let flags = flags_from(modifiers); + // Single source across the whole chord — required for the modifier + // latch state to survive between mod_down → key_down → key_up → mod_up. + let src = private_source("key_chord")?; + + // Press modifiers. + for m in modifiers { + let ev = CGEvent::new_keyboard_event(src.clone(), m.keycode(), true) + .map_err(|_| BitFunError::tool("CGEvent ModDown failed".to_string()))?; + ev.set_flags(flags); + ev.post_to_pid(pid); + } + // Press main key. 
+ { + let ev = CGEvent::new_keyboard_event(src.clone(), key, true) + .map_err(|_| BitFunError::tool("CGEvent KeyDown failed".to_string()))?; + ev.set_flags(flags); + ev.post_to_pid(pid); + } + { + let ev = CGEvent::new_keyboard_event(src.clone(), key, false) + .map_err(|_| BitFunError::tool("CGEvent KeyUp failed".to_string()))?; + ev.set_flags(flags); + ev.post_to_pid(pid); + } + // Release modifiers in reverse press order. + for m in modifiers.iter().rev() { + let ev = CGEvent::new_keyboard_event(src.clone(), m.keycode(), false) + .map_err(|_| BitFunError::tool("CGEvent ModUp failed".to_string()))?; + // Drop this modifier from the flag set as we release it. + let remaining = modifiers + .iter() + .copied() + .filter(|x| x != m) + .collect::>(); + ev.set_flags(flags_from(&remaining)); + ev.post_to_pid(pid); + } + Ok(()) +} + +/// Parse a key spec the dispatch layer might pass us, of the form +/// `"command+shift+p"` / `"return"` / `"escape"` / `"a"`. Returns the +/// modifier list and the resolved keycode. +pub fn parse_key_spec(spec: &str) -> BitFunResult<(Vec, u16)> { + let mut mods = Vec::new(); + let parts: Vec<&str> = spec.split('+').map(str::trim).collect(); + if parts.is_empty() { + return Err(BitFunError::tool("empty key spec".to_string())); + } + let (last, head) = parts.split_last().unwrap(); + for p in head { + let m = BgModifier::from_str(p) + .ok_or_else(|| BitFunError::tool(format!("unknown modifier in key spec: {}", p)))?; + mods.push(m); + } + let kc = keycode_for_named(last) + .or_else(|| { + // Single-char ASCII fallback. + let mut chars = last.chars(); + let c = chars.next()?; + if chars.next().is_some() { + return None; + } + keycode_for_char(c) + }) + .ok_or_else(|| BitFunError::tool(format!("unknown key in key spec: {}", last)))?; + Ok((mods, kc)) +} + +/// Parse the ControlHub/Codex chord shape: `["command", "shift", "p"]`, +/// `["command+shift+p"]`, or `["return"]`. 
+pub fn parse_key_sequence(keys: &[String]) -> BitFunResult<(Vec, u16)> { + if keys.is_empty() { + return Err(BitFunError::tool("empty key sequence".to_string())); + } + if keys.len() == 1 { + return parse_key_spec(&keys[0]); + } + + let (last, head) = keys.split_last().unwrap(); + let mut mods = Vec::with_capacity(head.len()); + for p in head { + let m = BgModifier::from_str(p) + .ok_or_else(|| BitFunError::tool(format!("unknown modifier in key sequence: {}", p)))?; + mods.push(m); + } + let kc = keycode_for_named(last) + .or_else(|| { + let mut chars = last.chars(); + let c = chars.next()?; + if chars.next().is_some() { + return None; + } + keycode_for_char(c) + }) + .ok_or_else(|| BitFunError::tool(format!("unknown key in key sequence: {}", last)))?; + Ok((mods, kc)) +} + +/// Map common named keys (Codex parity) to AX / Carbon keycodes. +pub fn keycode_for_named(name: &str) -> Option { + Some(match name.to_ascii_lowercase().as_str() { + "return" | "enter" => 36, + "tab" => 48, + "space" => 49, + "delete" | "backspace" => 51, + "escape" | "esc" => 53, + "left" => 123, + "right" => 124, + "down" => 125, + "up" => 126, + "home" => 115, + "end" => 119, + "pageup" | "page_up" => 116, + "pagedown" | "page_down" => 121, + "f1" => 122, + "f2" => 120, + "f3" => 99, + "f4" => 118, + "f5" => 96, + "f6" => 97, + "f7" => 98, + "f8" => 100, + "f9" => 101, + "f10" => 109, + "f11" => 103, + "f12" => 111, + _ => return None, + }) +} + +/// Map a single ASCII character to the **US-keyboard** keycode. This is the +/// same table Codex / enigo use; the user's actual keymap is irrelevant for +/// our chord injection because we set explicit modifier flags ourselves. 
/// Maps a single character to its **US-keyboard** virtual keycode.
///
/// Letters are matched case-insensitively (the input is ASCII-uppercased
/// first). Digits and the unshifted punctuation keys
/// ``= - [ ] ' ; \ , / . ` `` are covered as well. Any other character —
/// including space, shifted symbols and non-ASCII input — yields `None`.
pub fn keycode_for_char(c: char) -> Option<u16> {
    let upper = c.to_ascii_uppercase();
    Some(match upper {
        'A' => 0,
        'S' => 1,
        'D' => 2,
        'F' => 3,
        'H' => 4,
        'G' => 5,
        'Z' => 6,
        'X' => 7,
        'C' => 8,
        'V' => 9,
        'B' => 11,
        'Q' => 12,
        'W' => 13,
        'E' => 14,
        'R' => 15,
        'Y' => 16,
        'T' => 17,
        '1' => 18,
        '2' => 19,
        '3' => 20,
        '4' => 21,
        // Not a typo: on this layout table '6' precedes '5'.
        '6' => 22,
        '5' => 23,
        '=' => 24,
        '9' => 25,
        '7' => 26,
        '-' => 27,
        '8' => 28,
        '0' => 29,
        ']' => 30,
        'O' => 31,
        'U' => 32,
        '[' => 33,
        'I' => 34,
        'P' => 35,
        'L' => 37,
        'J' => 38,
        '\'' => 39,
        'K' => 40,
        ';' => 41,
        '\\' => 42,
        ',' => 43,
        '/' => 44,
        'N' => 45,
        'M' => 46,
        '.' => 47,
        '`' => 50,
        _ => return None,
    })
}
Some(BgModifier::Control)); + assert_eq!(BgModifier::from_str("alt"), Some(BgModifier::Option)); + assert_eq!(BgModifier::from_str("zzz"), None); + } + + #[test] + fn flags_from_combines() { + let f = flags_from(&[BgModifier::Command, BgModifier::Shift]); + assert!(f.contains(CGEventFlags::CGEventFlagCommand)); + assert!(f.contains(CGEventFlags::CGEventFlagShift)); + assert!(!f.contains(CGEventFlags::CGEventFlagControl)); + } +} diff --git a/src/apps/desktop/src/computer_use/macos_list_apps.rs b/src/apps/desktop/src/computer_use/macos_list_apps.rs new file mode 100644 index 000000000..3cc38f31f --- /dev/null +++ b/src/apps/desktop/src/computer_use/macos_list_apps.rs @@ -0,0 +1,134 @@ +//! Enumerate currently running GUI applications on macOS. +//! +//! We use AppleScript via `osascript` to read `System Events` — +//! pragmatically the same data NSWorkspace.runningApplications exposes, +//! without requiring a full objc/cocoa binding stack here. This is "good +//! enough" for the AX-first plan: the list is used to resolve +//! `AppSelector::ByName` / `ByBundleId` to a pid, after which all real work +//! happens through AX + bg-input. +//! +//! Last-used / launch-count signals from LaunchServices are not available +//! through AppleScript; we expose `last_used_ms = None` and +//! `launch_count = 0` so the trait shape is preserved. A future enhancement +//! can swap this out for a real NSWorkspace + LSSharedFileList implementation +//! without changing callers. + +#![allow(dead_code)] + +use bitfun_core::agentic::tools::computer_use_host::AppInfo; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use std::sync::Mutex; +use std::time::{Duration, Instant}; + +/// Short-lived cache for `list_running_apps` results. +/// +/// `osascript` cold-start costs ~150–250ms on a quiet machine.
The AX-first +/// dispatch path resolves an `AppSelector → pid` *before every* `app_*` +/// action, so without caching every click would pay this latency twice +/// (once for the action, once for the post-action re-snapshot). A 5-second +/// TTL is short enough that newly-launched apps appear quickly while +/// eliminating the back-to-back duplicate calls inside one agent step. +static CACHE: Mutex)>> = Mutex::new(None); +const CACHE_TTL: Duration = Duration::from_secs(5); + +const ASCRIPT: &str = r#" +set out to "" +tell application "System Events" + set procs to (every application process whose background only is false) + repeat with p in procs + try + set bid to bundle identifier of p + on error + set bid to "" + end try + try + set pname to name of p + on error + set pname to "" + end try + try + set ppid to unix id of p + on error + set ppid to 0 + end try + try + set ph to (visible of p as string) + on error + set ph to "true" + end try + set out to out & pname & "\t" & bid & "\t" & ppid & "\t" & ph & "\n" + end repeat +end tell +return out +"#; + +pub fn list_running_apps(include_hidden: bool) -> BitFunResult> { + if let Ok(guard) = CACHE.lock() { + if let Some((ts, cached_hidden, ref apps)) = *guard { + if cached_hidden == include_hidden && ts.elapsed() < CACHE_TTL { + return Ok(apps.clone()); + } + } + } + let out = std::process::Command::new("/usr/bin/osascript") + .arg("-e") + .arg(ASCRIPT) + .output() + .map_err(|e| BitFunError::tool(format!("osascript spawn: {}", e)))?; + if !out.status.success() { + return Err(BitFunError::tool(format!( + "osascript list_apps failed: {}", + String::from_utf8_lossy(&out.stderr) + ))); + } + let body = String::from_utf8_lossy(&out.stdout); + let mut apps = Vec::new(); + for line in body.lines() { + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() < 4 { + continue; + } + let name = parts[0].trim().to_string(); + let bundle_id = { + let s = parts[1].trim(); + if s.is_empty() { + None + } else { + 
Some(s.to_string()) + } + }; + let pid: i32 = parts[2].trim().parse().unwrap_or(0); + let visible = parts[3].trim().eq_ignore_ascii_case("true"); + if name.is_empty() || pid <= 0 { + continue; + } + if !include_hidden && !visible { + continue; + } + apps.push(AppInfo { + name, + bundle_id, + pid: Some(pid), + running: true, + last_used_ms: None, + launch_count: 0, + }); + } + // Best-effort stable order: alphabetical by name. The richer + // "recently used / most launched" sort is left to a future + // LaunchServices-backed implementation. + apps.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase())); + if let Ok(mut guard) = CACHE.lock() { + *guard = Some((Instant::now(), include_hidden, apps.clone())); + } + Ok(apps) +} + +/// Drop the cached `list_running_apps` result so the next call re-probes +/// `osascript`. Used when the agent has just launched / quit an app and +/// needs the freshest pid set. +pub fn invalidate_cache() { + if let Ok(mut guard) = CACHE.lock() { + *guard = None; + } +} diff --git a/src/apps/desktop/src/computer_use/mod.rs b/src/apps/desktop/src/computer_use/mod.rs index b70741e10..c444bece5 100644 --- a/src/apps/desktop/src/computer_use/mod.rs +++ b/src/apps/desktop/src/computer_use/mod.rs @@ -1,10 +1,20 @@ //! Desktop Computer use host (screenshots + enigo). 
mod desktop_host; +mod interactive_filter; +mod som_overlay; #[cfg(target_os = "linux")] mod linux_ax_ui; #[cfg(target_os = "macos")] +mod macos_ax_dump; +#[cfg(target_os = "macos")] mod macos_ax_ui; +#[cfg(target_os = "macos")] +mod macos_ax_write; +#[cfg(target_os = "macos")] +mod macos_bg_input; +#[cfg(target_os = "macos")] +mod macos_list_apps; mod screen_ocr; mod ui_locate_common; #[cfg(target_os = "windows")] diff --git a/src/apps/desktop/src/computer_use/screen_ocr.rs b/src/apps/desktop/src/computer_use/screen_ocr.rs index 807e247dc..7fa436c75 100644 --- a/src/apps/desktop/src/computer_use/screen_ocr.rs +++ b/src/apps/desktop/src/computer_use/screen_ocr.rs @@ -428,6 +428,7 @@ pub fn crop_shot_to_ocr_region( let native_h = (native_bottom - native_top).round().max(1.0) as u32; Ok(ComputerScreenshot { + screenshot_id: None, bytes: buf, mime_type: "image/jpeg".to_string(), image_width: cropped.width(), @@ -449,6 +450,14 @@ pub fn crop_shot_to_ocr_region( width: cropped.width(), height: cropped.height(), }), + image_global_bounds: Some( + bitfun_core::agentic::tools::computer_use_host::ComputerUseImageGlobalBounds { + left: native_left, + top: native_top, + width: native_w as f64, + height: native_h as f64, + }, + ), implicit_confirmation_crop_applied: false, ui_tree_text: None, }) diff --git a/src/apps/desktop/src/computer_use/som_overlay.rs b/src/apps/desktop/src/computer_use/som_overlay.rs new file mode 100644 index 000000000..81e655e9b --- /dev/null +++ b/src/apps/desktop/src/computer_use/som_overlay.rs @@ -0,0 +1,314 @@ +//! Set-of-Mark overlay renderer. +//! +//! Takes a JPEG screenshot + a list of [`InteractiveElement`]s and paints +//! numbered coloured boxes (one per element). The result is encoded back +//! into JPEG so the host can return it inside a [`ComputerScreenshot`] +//! without changing any downstream wiring. +//! +//! Design choices that matter for the model: +//! * Each element gets a small high-contrast badge containing its `i` +//! 
index in the **top-left corner** of its rectangle (TuriX-CUA +//! convention — the model is trained to look for `[N]` markers in +//! that location). +//! * Box colour is keyed off the AX role so the model can disambiguate +//! visually similar widgets (e.g. button vs. text field) without +//! reading the tree text. +//! * Badges drift down/right when they would overlap the previous +//! element's badge — keeps the overlay legible on dense menus. +//! * Font is a small 5×7 monochrome bitmap baked into this file; no +//! extra runtime dependencies (rusttype / ab_glyph / imageproc are +//! not pulled in). + +#![allow(dead_code)] + +use bitfun_core::agentic::tools::computer_use_host::InteractiveElement; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use image::{ImageOutputFormat, Rgba, RgbaImage}; +use std::io::Cursor; + +/// Render the SoM overlay onto `jpeg_bytes` and return a fresh JPEG. +/// +/// `jpeg_quality` defaults to 80 when `None`. Elements whose +/// `frame_image` is `None` are skipped silently. +pub(crate) fn render_overlay( + jpeg_bytes: &[u8], + elements: &[InteractiveElement], + jpeg_quality: Option, +) -> BitFunResult> { + let img = image::load_from_memory_with_format(jpeg_bytes, image::ImageFormat::Jpeg) + .map_err(|e| BitFunError::tool(format!("som_overlay: decode JPEG failed: {e}")))? 
+ .to_rgba8(); + let mut canvas: RgbaImage = img; + + let mut placed_badges: Vec<(i32, i32, i32, i32)> = Vec::with_capacity(elements.len()); + + for el in elements { + let Some((x, y, w, h)) = el.frame_image else { + continue; + }; + if w == 0 || h == 0 { + continue; + } + let color = role_color(&el.role, el.subrole.as_deref()); + + draw_rect_outline(&mut canvas, x as i32, y as i32, w as i32, h as i32, color, 2); + + let label = format!("{}", el.i); + let badge_w = (label.len() as i32) * (CHAR_W as i32 + 1) + 5; + let badge_h = CHAR_H as i32 + 4; + let mut bx = x as i32; + let mut by = y as i32 - badge_h; + if by < 0 { + by = y as i32; + } + + // Slide the badge along the top edge until it does not collide + // with another element's badge (cap retries to avoid blowups). + for _ in 0..6 { + let collides = placed_badges + .iter() + .any(|(px, py, pw, ph)| rects_overlap(bx, by, badge_w, badge_h, *px, *py, *pw, *ph)); + if !collides { + break; + } + bx += badge_w + 2; + if bx + badge_w > canvas.width() as i32 { + bx = x as i32; + by += badge_h + 2; + } + } + + draw_filled_rect(&mut canvas, bx, by, badge_w, badge_h, color); + draw_rect_outline(&mut canvas, bx, by, badge_w, badge_h, BADGE_BORDER, 1); + draw_text(&mut canvas, bx + 3, by + 2, &label, BADGE_TEXT); + + placed_badges.push((bx, by, badge_w, badge_h)); + } + + let mut out = Vec::with_capacity(jpeg_bytes.len()); + let quality = jpeg_quality.unwrap_or(80); + image::DynamicImage::ImageRgba8(canvas) + .write_to(&mut Cursor::new(&mut out), ImageOutputFormat::Jpeg(quality)) + .map_err(|e| BitFunError::tool(format!("som_overlay: encode JPEG failed: {e}")))?; + Ok(out) +} + +const BADGE_BORDER: Rgba = Rgba([0, 0, 0, 255]); +const BADGE_TEXT: Rgba = Rgba([255, 255, 255, 255]); + +fn role_color(role: &str, subrole: Option<&str>) -> Rgba { + if let Some(sr) = subrole { + match sr { + "AXCloseButton" | "AXMinimizeButton" | "AXFullScreenButton" => { + return Rgba([200, 80, 80, 255]) + } + "AXSecureTextField" => return 
Rgba([90, 110, 220, 255]), + _ => {} + } + } + match role { + "AXButton" | "AXMenuButton" | "AXPopUpButton" => Rgba([220, 60, 60, 255]), + "AXTextField" | "AXSecureTextField" | "AXSearchField" | "AXTextArea" => { + Rgba([60, 110, 220, 255]) + } + "AXCheckBox" | "AXRadioButton" | "AXSwitch" | "AXToggle" => Rgba([200, 130, 30, 255]), + "AXLink" => Rgba([60, 160, 220, 255]), + "AXTab" | "AXTabGroup" => Rgba([130, 80, 200, 255]), + "AXMenu" | "AXMenuItem" | "AXMenuBarItem" => Rgba([180, 90, 180, 255]), + "AXSlider" | "AXIncrementor" | "AXStepper" => Rgba([60, 170, 130, 255]), + "AXRow" | "AXOutlineRow" | "AXCell" => Rgba([100, 140, 100, 255]), + _ => Rgba([90, 90, 90, 255]), + } +} + +fn rects_overlap( + ax: i32, + ay: i32, + aw: i32, + ah: i32, + bx: i32, + by: i32, + bw: i32, + bh: i32, +) -> bool { + !(ax + aw <= bx || bx + bw <= ax || ay + ah <= by || by + bh <= ay) +} + +fn draw_rect_outline( + img: &mut RgbaImage, + x: i32, + y: i32, + w: i32, + h: i32, + color: Rgba, + thickness: i32, +) { + if w <= 0 || h <= 0 { + return; + } + let iw = img.width() as i32; + let ih = img.height() as i32; + let x0 = x.max(0); + let y0 = y.max(0); + let x1 = (x + w).min(iw); + let y1 = (y + h).min(ih); + if x1 <= x0 || y1 <= y0 { + return; + } + for t in 0..thickness { + // Top + bottom edges. + for px in x0..x1 { + put_pixel(img, px, y0 + t, color); + put_pixel(img, px, y1 - 1 - t, color); + } + // Left + right edges. 
+ for py in y0..y1 { + put_pixel(img, x0 + t, py, color); + put_pixel(img, x1 - 1 - t, py, color); + } + } +} + +fn draw_filled_rect(img: &mut RgbaImage, x: i32, y: i32, w: i32, h: i32, color: Rgba) { + if w <= 0 || h <= 0 { + return; + } + let iw = img.width() as i32; + let ih = img.height() as i32; + let x0 = x.max(0); + let y0 = y.max(0); + let x1 = (x + w).min(iw); + let y1 = (y + h).min(ih); + for py in y0..y1 { + for px in x0..x1 { + put_pixel(img, px, py, color); + } + } +} + +#[inline] +fn put_pixel(img: &mut RgbaImage, x: i32, y: i32, color: Rgba) { + if x >= 0 && y >= 0 && (x as u32) < img.width() && (y as u32) < img.height() { + // Alpha blend. + let dst = img.get_pixel_mut(x as u32, y as u32); + let a = color.0[3] as u32; + if a == 255 { + *dst = color; + return; + } + let inv = 255 - a; + for c in 0..3 { + dst.0[c] = ((color.0[c] as u32 * a + dst.0[c] as u32 * inv) / 255) as u8; + } + dst.0[3] = 255; + } +} + +fn draw_text(img: &mut RgbaImage, x: i32, y: i32, text: &str, color: Rgba) { + let mut cx = x; + for ch in text.chars() { + if let Some(glyph) = glyph_for(ch) { + for (row_idx, row) in glyph.iter().enumerate() { + for col in 0..CHAR_W { + let bit = (row >> (CHAR_W - 1 - col)) & 1; + if bit == 1 { + put_pixel(img, cx + col as i32, y + row_idx as i32, color); + } + } + } + } + cx += CHAR_W as i32 + 1; + } +} + +const CHAR_W: usize = 5; +const CHAR_H: usize = 7; + +/// 5×7 bitmap font, just enough for the digits 0-9 (badge labels). 
+fn glyph_for(ch: char) -> Option<[u8; CHAR_H]> { + match ch { + '0' => Some([0b01110, 0b10001, 0b10011, 0b10101, 0b11001, 0b10001, 0b01110]), + '1' => Some([0b00100, 0b01100, 0b00100, 0b00100, 0b00100, 0b00100, 0b01110]), + '2' => Some([0b01110, 0b10001, 0b00001, 0b00010, 0b00100, 0b01000, 0b11111]), + '3' => Some([0b11110, 0b00001, 0b00001, 0b01110, 0b00001, 0b00001, 0b11110]), + '4' => Some([0b00010, 0b00110, 0b01010, 0b10010, 0b11111, 0b00010, 0b00010]), + '5' => Some([0b11111, 0b10000, 0b11110, 0b00001, 0b00001, 0b10001, 0b01110]), + '6' => Some([0b00110, 0b01000, 0b10000, 0b11110, 0b10001, 0b10001, 0b01110]), + '7' => Some([0b11111, 0b00001, 0b00010, 0b00100, 0b01000, 0b01000, 0b01000]), + '8' => Some([0b01110, 0b10001, 0b10001, 0b01110, 0b10001, 0b10001, 0b01110]), + '9' => Some([0b01110, 0b10001, 0b10001, 0b01111, 0b00001, 0b00010, 0b01100]), + _ => None, + } +} + +#[allow(dead_code)] +pub(crate) fn draw_text_for_test(img: &mut RgbaImage, x: i32, y: i32, text: &str) { + draw_text(img, x, y, text, Rgba([255, 255, 255, 255])); +} + +#[cfg(test)] +mod tests { + use super::*; + use image::{ImageBuffer, ImageEncoder}; + + fn solid_jpeg(w: u32, h: u32) -> Vec { + let mut buf: ImageBuffer, Vec> = ImageBuffer::new(w, h); + for px in buf.pixels_mut() { + *px = Rgba([20, 20, 20, 255]); + } + let mut out = Vec::new(); + let rgb = image::DynamicImage::ImageRgba8(buf).to_rgb8(); + let encoder = image::codecs::jpeg::JpegEncoder::new_with_quality(&mut out, 90); + encoder + .write_image(rgb.as_raw(), w, h, image::ColorType::Rgb8) + .unwrap(); + out + } + + fn elem(i: u32, role: &str, frame: (u32, u32, u32, u32)) -> InteractiveElement { + InteractiveElement { + i, + node_idx: i + 100, + role: role.to_string(), + subrole: None, + label: Some(format!("e{i}")), + frame_image: Some(frame), + frame_global: None, + enabled: true, + focused: false, + ax_actionable: true, + } + } + + #[test] + fn renders_without_panic_and_returns_valid_jpeg() { + let jpeg = solid_jpeg(200, 120); + 
let elements = vec![ + elem(0, "AXButton", (10, 10, 60, 30)), + elem(1, "AXTextField", (80, 10, 100, 30)), + elem(2, "AXLink", (10, 60, 50, 20)), + ]; + let out = render_overlay(&jpeg, &elements, Some(75)).expect("overlay encode"); + let decoded = image::load_from_memory(&out).expect("decode overlay"); + assert_eq!(decoded.width(), 200); + assert_eq!(decoded.height(), 120); + } + + #[test] + fn skips_elements_without_frame() { + let jpeg = solid_jpeg(120, 80); + let mut e = elem(0, "AXButton", (10, 10, 30, 20)); + e.frame_image = None; + let out = render_overlay(&jpeg, &[e], None).expect("overlay"); + let _ = image::load_from_memory(&out).expect("decode overlay"); + } + + #[test] + fn handles_overflowing_rect() { + let jpeg = solid_jpeg(80, 60); + let elements = vec![elem(99, "AXButton", (70, 50, 200, 200))]; + let out = render_overlay(&jpeg, &elements, None).expect("overlay"); + let decoded = image::load_from_memory(&out).expect("decode overlay"); + assert_eq!(decoded.width(), 80); + } +} diff --git a/src/apps/desktop/src/computer_use/ui_locate_common.rs b/src/apps/desktop/src/computer_use/ui_locate_common.rs index 2bf6690dd..96b84470a 100644 --- a/src/apps/desktop/src/computer_use/ui_locate_common.rs +++ b/src/apps/desktop/src/computer_use/ui_locate_common.rs @@ -5,11 +5,20 @@ use bitfun_core::util::errors::{BitFunError, BitFunResult}; use screenshots::display_info::DisplayInfo; pub fn validate_query(q: &UiElementLocateQuery) -> BitFunResult<()> { + // node_idx alone is enough: it short-circuits BFS via the per-pid AX cache. 
+ if q.node_idx.is_some() { + return Ok(()); + } let t = q .title_contains .as_ref() .map(|s| !s.trim().is_empty()) .unwrap_or(false); + let tx = q + .text_contains + .as_ref() + .map(|s| !s.trim().is_empty()) + .unwrap_or(false); let r = q .role_substring .as_ref() @@ -20,15 +29,44 @@ pub fn validate_query(q: &UiElementLocateQuery) -> BitFunResult<()> { .as_ref() .map(|s| !s.trim().is_empty()) .unwrap_or(false); - if !t && !r && !i { + if !t && !tx && !r && !i { return Err(BitFunError::tool( - "Provide at least one of: title_contains, role_substring, identifier_contains (non-empty)." + "Provide at least one of: node_idx, text_contains, title_contains, role_substring, identifier_contains (non-empty)." .to_string(), )); } Ok(()) } +/// All AX text-bearing attributes considered by `matches_filters` / ranking. +/// Pass `None` for anything the platform host can't read (e.g. AT-SPI lacks `help`). +#[derive(Debug, Clone, Copy, Default)] +pub struct NodeAttrs<'a> { + pub role: Option<&'a str>, + pub subrole: Option<&'a str>, + pub title: Option<&'a str>, + pub value: Option<&'a str>, + pub description: Option<&'a str>, + pub identifier: Option<&'a str>, + pub help: Option<&'a str>, +} + +impl<'a> NodeAttrs<'a> { + /// Convenience for the legacy three-field path (role/title/ident). 
+ pub fn legacy( + role: Option<&'a str>, + title: Option<&'a str>, + identifier: Option<&'a str>, + ) -> Self { + Self { + role, + title, + identifier, + ..Self::default() + } + } +} + fn global_xy_to_native_with_display(d: &DisplayInfo, gx: f64, gy: f64) -> BitFunResult<(u32, u32)> { // Phase 1 fix: `DisplayInfo.width / height` are **logical** points, and // `scale_factor` is the device pixel ratio (2.0 on Retina, 1.5/1.75 on @@ -133,35 +171,78 @@ fn combine_is_any(query: &UiElementLocateQuery) -> bool { matches!(query.filter_combine.as_deref(), Some("any") | Some("or")) } +/// `role_substring` evaluator that also considers `subrole` (macOS often distinguishes +/// "search field" from "plain text field" only via `AXSubrole`). +fn role_or_subrole_matches(role: Option<&str>, subrole: Option<&str>, want: &str) -> bool { + if role_substring_matches_ax_role(role.unwrap_or(""), want) { + return true; + } + if let Some(sr) = subrole { + if !sr.is_empty() && contains_ci(sr, want) { + return true; + } + } + false +} + +/// `text_contains` semantics: case-insensitive substring match against any of +/// `title | value | description | help`. +fn text_contains_matches(n: &NodeAttrs<'_>, want: &str) -> bool { + let w = want.trim(); + if w.is_empty() { + return true; + } + if contains_ci(n.title.unwrap_or(""), w) { + return true; + } + if contains_ci(n.value.unwrap_or(""), w) { + return true; + } + if contains_ci(n.description.unwrap_or(""), w) { + return true; + } + if contains_ci(n.help.unwrap_or(""), w) { + return true; + } + false +} + /// OR semantics: element matches if **at least one** non-empty filter matches. 
-pub fn matches_filters_any( - query: &UiElementLocateQuery, - role: Option<&str>, - title: Option<&str>, - ident: Option<&str>, -) -> bool { +pub fn matches_filters_any_attrs(query: &UiElementLocateQuery, n: &NodeAttrs<'_>) -> bool { let mut has_filter = false; let mut matched = false; if let Some(ref want) = query.role_substring { - if !want.trim().is_empty() { + let w = want.trim(); + if !w.is_empty() { has_filter = true; - if role_substring_matches_ax_role(role.unwrap_or(""), want.trim()) { + if role_or_subrole_matches(n.role, n.subrole, w) { matched = true; } } } if let Some(ref want) = query.title_contains { - if !want.trim().is_empty() { + let w = want.trim(); + if !w.is_empty() { + has_filter = true; + if contains_ci(n.title.unwrap_or(""), w) { + matched = true; + } + } + } + if let Some(ref want) = query.text_contains { + let w = want.trim(); + if !w.is_empty() { has_filter = true; - if contains_ci(title.unwrap_or(""), want.trim()) { + if text_contains_matches(n, w) { matched = true; } } } if let Some(ref want) = query.identifier_contains { - if !want.trim().is_empty() { + let w = want.trim(); + if !w.is_empty() { has_filter = true; - if contains_ci(ident.unwrap_or(""), want.trim()) { + if contains_ci(n.identifier.unwrap_or(""), w) { matched = true; } } @@ -170,50 +251,73 @@ pub fn matches_filters_any( } /// AND semantics (default): **every** non-empty filter must match the same element. 
-pub fn matches_filters_all( - query: &UiElementLocateQuery, - role: Option<&str>, - title: Option<&str>, - ident: Option<&str>, -) -> bool { +pub fn matches_filters_all_attrs(query: &UiElementLocateQuery, n: &NodeAttrs<'_>) -> bool { if let Some(ref want) = query.role_substring { - if !want.trim().is_empty() { - let r = role.unwrap_or(""); - if !role_substring_matches_ax_role(r, want.trim()) { - return false; - } + let w = want.trim(); + if !w.is_empty() && !role_or_subrole_matches(n.role, n.subrole, w) { + return false; } } if let Some(ref want) = query.title_contains { - if !want.trim().is_empty() { - let t = title.unwrap_or(""); - if !contains_ci(t, want.trim()) { - return false; - } + let w = want.trim(); + if !w.is_empty() && !contains_ci(n.title.unwrap_or(""), w) { + return false; + } + } + if let Some(ref want) = query.text_contains { + let w = want.trim(); + if !w.is_empty() && !text_contains_matches(n, w) { + return false; } } if let Some(ref want) = query.identifier_contains { - if !want.trim().is_empty() { - let i = ident.unwrap_or(""); - if !contains_ci(i, want.trim()) { - return false; - } + let w = want.trim(); + if !w.is_empty() && !contains_ci(n.identifier.unwrap_or(""), w) { + return false; } } true } +/// Structured matcher (preferred, used by macOS host). +pub fn matches_filters_attrs(query: &UiElementLocateQuery, n: &NodeAttrs<'_>) -> bool { + if combine_is_any(query) { + matches_filters_any_attrs(query, n) + } else { + matches_filters_all_attrs(query, n) + } +} + +/// Legacy three-field shim — preserved so linux/windows hosts compile while they migrate. +/// New code should construct `NodeAttrs` and call [`matches_filters_attrs`] directly. 
+#[allow(dead_code)] pub fn matches_filters( query: &UiElementLocateQuery, role: Option<&str>, title: Option<&str>, ident: Option<&str>, ) -> bool { - if combine_is_any(query) { - matches_filters_any(query, role, title, ident) - } else { - matches_filters_all(query, role, title, ident) - } + matches_filters_attrs(query, &NodeAttrs::legacy(role, title, ident)) +} + +#[allow(dead_code)] +pub fn matches_filters_any( + query: &UiElementLocateQuery, + role: Option<&str>, + title: Option<&str>, + ident: Option<&str>, +) -> bool { + matches_filters_any_attrs(query, &NodeAttrs::legacy(role, title, ident)) +} + +#[allow(dead_code)] +pub fn matches_filters_all( + query: &UiElementLocateQuery, + role: Option<&str>, + title: Option<&str>, + ident: Option<&str>, +) -> bool { + matches_filters_all_attrs(query, &NodeAttrs::legacy(role, title, ident)) } #[allow(dead_code)] // Used by windows_ax_ui / linux_ax_ui (not compiled on macOS) @@ -292,9 +396,49 @@ pub fn ok_result_with_context( parent_context, total_matches, other_matches, + matched_node_idx: None, + matched_via: None, }) } +/// Same as [`ok_result_with_context`] plus traceability fields for `matched_node_idx` / +/// `matched_via`. New code should prefer this entry point. 
+#[allow(clippy::too_many_arguments)] +pub fn ok_result_with_context_full( + gx: f64, + gy: f64, + bounds_left: f64, + bounds_top: f64, + bounds_width: f64, + bounds_height: f64, + matched_role: String, + matched_title: Option, + matched_identifier: Option, + parent_context: Option, + total_matches: u32, + other_matches: Vec, + matched_node_idx: Option, + matched_via: Option, +) -> BitFunResult { + let mut r = ok_result_with_context( + gx, + gy, + bounds_left, + bounds_top, + bounds_width, + bounds_height, + matched_role, + matched_title, + matched_identifier, + parent_context, + total_matches, + other_matches, + )?; + r.matched_node_idx = matched_node_idx; + r.matched_via = matched_via; + Ok(r) +} + #[cfg(test)] mod tests { use super::*; @@ -346,6 +490,95 @@ mod tests { assert_eq!((nx, ny), (1440, 810)); } + fn q_text(needle: &str) -> UiElementLocateQuery { + UiElementLocateQuery { + text_contains: Some(needle.to_string()), + ..Default::default() + } + } + + #[test] + fn text_contains_matches_value_or_description() { + let q = q_text("五子棋"); + let n_value = NodeAttrs { + role: Some("AXStaticText"), + value: Some("五子棋 - 经典对战"), + ..Default::default() + }; + assert!(matches_filters_attrs(&q, &n_value)); + + let n_desc = NodeAttrs { + role: Some("AXButton"), + description: Some("打开五子棋"), + ..Default::default() + }; + assert!(matches_filters_attrs(&q, &n_desc)); + + let n_help = NodeAttrs { + role: Some("AXImage"), + help: Some("Five In A Row 五子棋"), + ..Default::default() + }; + assert!(matches_filters_attrs(&q, &n_help)); + } + + #[test] + fn text_contains_does_not_change_title_only_semantic() { + // title_contains MUST still only inspect AXTitle; value/description should be ignored. 
+ let q = UiElementLocateQuery { + title_contains: Some("Send".to_string()), + ..Default::default() + }; + let n = NodeAttrs { + role: Some("AXButton"), + title: None, + value: Some("Send"), + description: Some("Send message"), + ..Default::default() + }; + assert!(!matches_filters_attrs(&q, &n)); + + let n2 = NodeAttrs { + role: Some("AXButton"), + title: Some("Send"), + ..Default::default() + }; + assert!(matches_filters_attrs(&q, &n2)); + } + + #[test] + fn role_substring_matches_subrole() { + let q = UiElementLocateQuery { + role_substring: Some("SearchField".to_string()), + ..Default::default() + }; + // Real role is generic AXTextField, but subrole carries AXSearchField. + let n = NodeAttrs { + role: Some("AXTextField"), + subrole: Some("AXSearchField"), + ..Default::default() + }; + assert!(matches_filters_attrs(&q, &n)); + } + + #[test] + fn validate_query_accepts_node_idx_alone() { + let q = UiElementLocateQuery { + node_idx: Some(7), + ..Default::default() + }; + assert!(validate_query(&q).is_ok()); + } + + #[test] + fn validate_query_accepts_text_contains_alone() { + let q = UiElementLocateQuery { + text_contains: Some("OK".to_string()), + ..Default::default() + }; + assert!(validate_query(&q).is_ok()); + } + #[test] fn maps_global_to_native_with_unit_scale_is_identity() { let d = fake_display(0, 0, 800, 600, 1.0); diff --git a/src/apps/desktop/src/computer_use/windows_ax_ui.rs b/src/apps/desktop/src/computer_use/windows_ax_ui.rs index 53235d48a..d6e9a6747 100644 --- a/src/apps/desktop/src/computer_use/windows_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/windows_ax_ui.rs @@ -52,6 +52,15 @@ pub fn locate_ui_element_center( query: &UiElementLocateQuery, ) -> BitFunResult { ui_locate_common::validate_query(query)?; + + if query.node_idx.is_some() { + return Err(BitFunError::tool( + "[AX_IDX_NOT_SUPPORTED] node_idx lookup is only implemented on macOS. \ + Fall back to `text_contains` / `title_contains` + `role_substring` on this host." 
+ .to_string(), + )); + } + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); let max_nodes = 12_000usize; @@ -129,13 +138,28 @@ pub fn locate_ui_element_center( .unwrap_or_default() }; let role = localized_control_type_string(&cur.el); + let help = unsafe { + cur.el + .CurrentHelpText() + .ok() + .map(bstr_to_string) + .unwrap_or_default() + }; - let matched = ui_locate_common::matches_filters( - query, - Some(role.as_str()), - Some(name.as_str()), - Some(ident.as_str()), - ); + let attrs = ui_locate_common::NodeAttrs { + role: Some(role.as_str()), + subrole: None, + title: Some(name.as_str()), + value: None, + description: None, + identifier: Some(ident.as_str()), + help: if help.is_empty() { + None + } else { + Some(help.as_str()) + }, + }; + let matched = ui_locate_common::matches_filters_attrs(query, &attrs); if matched { let rect = unsafe { cur.el.CurrentBoundingRectangle() }; if let Ok(r) = rect { diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index ee72e4e81..0d326766e 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -72,7 +72,50 @@ On multi-monitor setups, **never** assume the cursor is on the screen the user i In both patterns, after a pin every `screenshot` is guaranteed to come from that display until cleared. ## `domain: "desktop"` — actions and policies (Computer Use) -The actions inside `domain: "desktop"` are: `click_element`, `move_to_text`, `click`, `mouse_move`, `scroll`, `drag`, `screenshot`, `locate`, `key_chord`, `type_text`, `paste`, `pointer_move_rel`, `wait`. Every example in this section is a `domain: "desktop"` call — substitute the action name into `params`. +The actions inside `domain: "desktop"` are: `click_element`, `move_to_text`, `click`, `mouse_move`, `scroll`, `drag`, `screenshot`, `locate`, `key_chord`, `type_text`, `paste`, `pointer_move_rel`, `wait`. 
AX-first additions (Codex parity, **prefer when `meta.capabilities.domains.desktop.supports_background_input` is true on macOS**): `list_apps`, `get_app_state`, `app_click`, `app_type_text`, `app_scroll`, `app_key_chord`, `app_wait_for`. **Interactive-View-first (TuriX-style Set-of-Mark) — STRONGLY PREFERRED on macOS when available**: `build_interactive_view`, `interactive_click`, `interactive_type_text`, `interactive_scroll`. Every example in this section is a `domain: "desktop"` call — substitute the action name into `params`. + +### Interactive-View-first workflow (macOS, Set-of-Mark) — DEFAULT for visible UI on macOS +When background input + AX tree are supported, this is the **preferred** path for any third-party GUI work. It collapses "find element + addressing + click" into a single visual handle: the **`i`** index of a numbered coloured box drawn on the focused window screenshot. The model never invents pixel coordinates and never has to translate `node_idx` ↔ JPEG. + +1. `desktop.list_apps {}` → pick `{ pid }` (or `{ bundle_id }` / `{ name }`). +2. `desktop.build_interactive_view { app: { pid: <pid> } }` → returns a focused-window screenshot **with numbered coloured boxes overlaid**, plus `elements[]` (each item: `i`, `role`, `subrole`, `label`, `frame_image`, `frame_global`, `enabled`, `focused`), a compact `tree_text`, and a stable `digest`. **Reference elements ONLY by their `i` index** in subsequent calls. Colour key (box colour follows the AX role): red=button/menu button/pop-up button, blue=text field/text area/search field, orange=checkbox/radio/switch, light blue=link, purple=tab/tab group, magenta=menu/menu item, teal=slider/stepper, gray-green=row/cell, gray=other. + - Useful options: `opts.focus_window_only` (default `true`), `opts.max_elements` (default ~80; host trims by visual area), `opts.annotate_screenshot` (default `true` — set `false` to save overlay cost on retries), `opts.include_tree_text` (default `true`). +3. Act with the **index-targeted** variants. Always echo `before_view_digest: "<digest>"` so the host can detect a stale view (UI changed under you).
The host accepts either the full digest or any prefix of **at least 8 characters** (the 12-char digest shown in `summary` is a valid shorthand): + - `desktop.interactive_click { app: {pid:N}, i: K, before_view_digest: "<digest>" }` — accepts `click_count`, `mouse_button`, `modifier_keys`, `wait_ms_after`, `return_view` (default `true`, host re-renders the view for the next turn). + - `desktop.interactive_type_text { app: {pid:N}, i: K, text: "...", before_view_digest: "<digest>", clear_first?: true, press_enter_after?: false }` — omit `i` to type into whatever element is currently focused. + - `desktop.interactive_scroll { app: {pid:N}, i: K, dy: -3, dx: 0, before_view_digest: "<digest>" }` — omit `i` to scroll the focused window centre. +4. The action response carries the post-action `app_state` (with screenshot) AND, when `return_view=true`, a fresh `interactive_view` (new `digest`, new numbered overlay). **Use the new `digest` for the next call.** When you see `interactive_view: null` (you set `return_view=false`, or the rebuild failed), call `build_interactive_view` again before the next `i`-addressed action. +5. Errors you may see: `INTERACTIVE_VIEW_STALE` (`before_view_digest` no longer matches the cached view — re-run `build_interactive_view` and reuse the new `i`/`digest`), `INTERACTIVE_INDEX_OUT_OF_RANGE` (the `i` is not in the current cached view — same fix), `INTERACTIVE_VIEW_UNAVAILABLE` (host doesn't support SoM — fall back to AX-first below). + +**MANDATORY OBSERVE → PLAN → EXPECT → VERIFY loop (every interactive turn):** +For each `interactive_*` action you take, your visible reasoning MUST contain three short labelled lines (OBSERVE, PLAN, EXPECT) BEFORE the tool call, and one VERIFY line in the next turn AFTER the response. This is the single biggest accuracy lever vs. ad-hoc clicking. +1. **OBSERVE:** the exact `i`, `role`, `label`, and on-screen position you are about to act on (one line, copied from the latest `elements[]` / annotated overlay).
If `elements[]` is older than the previous action, **rebuild the view first** — never guess. +2. **PLAN:** the single concrete action and parameters (`interactive_click { i: 7, ... }`), and the prefix/full `digest` you will pass. +3. **EXPECT:** in one sentence, the visible UI change you predict — e.g. "the popup closes and a new modal titled 'Game' appears", "input field 12 gains focus and shows the text I typed". Be specific enough that the next screenshot can falsify it. +4. **(Tool call)**. +5. **VERIFY (next turn, before any further action):** compare the returned `interactive_view` overlay + `app_state` to your EXPECT line. State explicitly **PASS** or **FAIL: <reason>**. On FAIL: do **not** retry the same action — re-OBSERVE the new view and pick a different element / different action. + - Treat `execution_note` containing `auto_rebuilt_view_after_stale` or `fallback_image_xy` as soft warnings — the click landed but via a recovery path; double-check the EXPECT before continuing. + - For repeated FAIL on the same target across two turns: switch tactic — try `key_chord` (keyboard nav), `move_to_text` (OCR), or `app_click { target: { ocr_text } }` (OCR-based fallback) instead of clicking the same `i` again. + +**When to fall back from Interactive-View-first to AX-first:** +- `meta.capabilities.domains.desktop.supports_interactive_view` is **false** (non-macOS). +- The target widget is not in `elements[]` (e.g. Canvas / WebGL / custom-drawn surfaces). Use `desktop.app_click { target: { ocr_text: { needle: "..." } } }` instead. +- You need AX-only operations not yet exposed via the index API (e.g. `app_wait_for`, `app_key_chord` with `focus_idx`). + +### AX-first workflow (macOS, third-party apps) — fallback when Interactive-View is unavailable +When background input + AX tree are supported, drive the target app **without** stealing the user's foreground focus or cursor: +1. `desktop.list_apps {}` → pick `{ pid }` (or `{ bundle_id }` / `{ name }`). +2.
`desktop.get_app_state { app: { pid: <pid> } }` → read `app_state.tree_text` + `app_state_nodes[]`. Each node has a stable `idx` you address in subsequent calls. Remember `before_digest` for change detection. +3. Act with the **node-targeted** variants — they try the AX action path (`AXPress` / `AXSetAttributeValue`) first and only fall back to PID-scoped synthetic events if the node refuses: + - `desktop.app_click { app: {pid:N}, target: { node_idx: K } }` + - `desktop.app_type_text { app: {pid:N}, text: "...", focus: { node_idx: K } }` + - `desktop.app_scroll { app: {pid:N}, dx: 0, dy: -120, focus: { node_idx: K } }` + - `desktop.app_key_chord { app: {pid:N}, keys: ["command","f"], focus_idx: K }` + - When the AX tree does NOT expose the target widget (Canvas, WebGL, custom-drawn cells, third-party games), use the OCR fallback: `desktop.app_click { app: {pid:N}, target: { ocr_text: { needle: "Start" } } }`. The host screenshots, OCRs, picks the highest-confidence match, and clicks its centre — all still PID-scoped so the user's cursor never moves. Prefer node_idx whenever it works (faster + no OCR confidence noise). +4. After acting, the response already contains the **after** `app_state` + `app_state_nodes` — diff against `before_digest`. If you need to wait for an async UI transition use `desktop.app_wait_for { app, predicate: { digest_changed: { prev_digest } } | { title_contains: "..." } | { role_enabled: { role, title } } | { node_enabled: { idx } }, timeout_ms, poll_ms }`. +5. Errors you may see: `APP_NOT_FOUND` (selector didn't resolve a running PID), `AX_NODE_STALE` (the cached `idx` no longer points to a live element — re-snapshot with `get_app_state`), `BACKGROUND_INPUT_UNAVAILABLE` (Accessibility permission missing or non-macOS — fall back to legacy `click` / `type_text` / `paste`).
+ +If `meta.capabilities.domains.desktop.supports_background_input` is **false** (Linux / Windows / unprivileged macOS), do NOT use the `app_*` actions; they will fail with `BACKGROUND_INPUT_UNAVAILABLE`. Use the legacy screen-coordinate actions instead. ### Entering text — `paste` is the default, `type_text` is the fallback (MANDATORY) **For ANY of these, use `desktop.paste { text, submit?, clear_first? }`, NEVER `type_text`:** @@ -198,24 +241,35 @@ For Slack / Lark / multi-line apps where Return inserts a newline: ### `click_element` (preferred for most accessibility-backed clicks) Use `click_element` when the target has a known accessible title or role. It locates the element via AX tree, moves the pointer to its center, and clicks -- all in one call. No screenshot needed. Supports `button` (left/right/middle) and `num_clicks` (1/2/3 for single/double/triple click). -**Filter tips:** Use `title_contains` and/or `role_substring` in the **same language as the app UI**. Use `filter_combine: "any"` when fields might not overlap (e.g. text fields with no title). If no match, refine the query or fall back to OCR. Prefer short, distinctive substrings. If a call returns no match, **change the query** before retrying. +**Filter priority (use the first one that fits):** +1. **`node_idx`** (+ optional `app_state_digest`) — if you just called `desktop.get_app_state`, reuse the `idx` directly. One AX lookup, zero BFS, zero ambiguity. macOS only; other platforms return `AX_IDX_NOT_SUPPORTED` and you fall through. +2. **`text_contains`** — case-insensitive substring across AXTitle / AXValue / AXDescription / AXHelp. Best default when the visible label is shown via value/description (e.g. cards built from `AXStaticText`). The locator now climbs up to the closest clickable ancestor (`AXButton` / `AXCell` / `AXLink` / …) automatically. +3. 
**`title_contains` + `role_substring`** — only when you specifically want to constrain by `AXTitle` and a role/subrole hint (`role_substring` also matches `AXSubrole`, e.g. `"SearchField"`). + +Use `filter_combine: "any"` when fields might not overlap (e.g. text fields with no title). If no match, refine the query or fall back to OCR. Prefer short, distinctive substrings. If a call returns no match, **change the query** before retrying. Use the same language as the app UI. **When `click_element` won't work:** Many apps (Electron/web views, custom-drawn UI) have limited AX trees. **Do not** repeat the same `title_contains`/`role_substring` more than twice -- switch to **`move_to_text`** on visible chrome (tabs, buttons, search hints) or screenshot + `mouse_move` + `click`. That is expected, not a bug. -### Screenshot policy -**There is exactly ONE crop policy: every screenshot is either the focused application window (default, via Accessibility) or the full display (fallback). No `~500×500 mouse crop`. No quadrant drilling. No `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation` / `screenshot_implicit_center` — those parameters are silently ignored.** +### Screenshot policy — **screenshots are your eyes** +**Iron rule: never act blind on a desktop UI you have not seen.** The AX tree is metadata; it does not describe Canvas / WebGL / WebView / custom-drawn surfaces (games, charts, maps, video, rich editors). If you have not looked at a pixel image of the current frame, you do not know what is on screen. **Do not click, scroll, type, or press Enter without a recent image.** -The only screenshot option that has any effect today is `screenshot_window` (alias `window`): -- `true` / `"focused"` → force focused-window crop (default, you almost never need to set this explicitly). -- `false` (or omitted) → same default — host still tries focused-window first, falls back to full display if AX cannot resolve it. 
+**Free screenshots (Codex parity, macOS AX-first / Interactive-View path):** every `desktop.build_interactive_view` / `desktop.interactive_click` / `desktop.interactive_type_text` / `desktop.interactive_scroll` / `desktop.get_app_state` / `desktop.app_click` / `desktop.app_type_text` / `desktop.app_scroll` / `desktop.app_key_chord` / `desktop.app_wait_for` response **auto-attaches a focused-window screenshot** as a multimodal image (the interactive variants attach the **annotated overlay** with numbered boxes). The JSON also exposes `app_state.has_screenshot` + `app_state.screenshot_meta`, and the interactive variants carry an `interactive_view` block with the fresh `digest` and `elements[]`. **Treat the attached image as authoritative for visual state** and reconcile it against `tree_text` / `elements[]` before your next action — if the image and the tree disagree, trust the image and rebuild the view. -**`click` only requires:** a fresh screenshot since the last pointer-changing action (cache invalidation guard). Any screenshot is sufficient — no quadrant drill, no point crop. Prefer `click_element` / `move_to_text` so you don't have to think about coordinates at all. +**Mandatory screenshot moments:** +1. **Task start.** Before the first interaction with any app, call `desktop.get_app_state` (preferred — includes a screenshot for free) **or** `desktop.screenshot { screenshot_window: true }`. No "I'll just click the obvious button" first turn. +2. **After any AX-first action that returns `has_screenshot: false`** (rare — capture failed). Take an explicit `desktop.screenshot` before the next `app_*` call. +3. **After two consecutive failures on the same target** (same `node_idx` / `ocr_text` / coordinate). The host injects `app_state.loop_warning` in this case — when you see it, the **next** action MUST be `desktop.screenshot` (full display, `screenshot_window: false`) and you MUST switch tactic (different node, different OCR phrase, keyboard shortcut, …). 
Never retry the same target a third time. +4. **Before any `key_chord` containing `return`/`enter`/`kp_enter`** (cache-invalidation guard, unchanged). +5. **Before any `click` driven by JPEG/global coordinates** (cache-invalidation guard, unchanged). -**`key_chord` that includes `return` / `enter` / `kp_enter`** likewise requires a fresh screenshot since the last pointer-changing action. +**Crop policy (unchanged): one crop, two modes.** Every screenshot is either the focused application window (default, via Accessibility) or the full display (fallback). No `~500×500 mouse crop`. No quadrant drilling. `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation` / `screenshot_implicit_center` are silently ignored. The only knob with effect is `screenshot_window` (alias `window`): +- `true` / `"focused"` → force focused-window crop. +- `false` → full display (use this for the **loop-warning recovery** screenshot, so you can see chrome / docks / dialogs that the focused window may have obscured). +- omitted → focused-window first, full display fallback. -**Not** subject to "must screenshot first": `mouse_move`, `scroll`, `drag`, `type_text`, `paste`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, and **`move_to_text`** / **`click_element`**. +**Not** subject to "must screenshot first": `mouse_move`, `scroll`, `drag`, `type_text`, `paste`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, **`move_to_text`** / **`click_element`**, and any `app_*` call (those carry their own auto-screenshot). -**Cadence:** Take **`screenshot`** when you need **visual confirmation**, or when the host requires a fresh capture before **`click`** / Enter. Do **not** add extra screenshots before ordinary moves, typing, or non-Enter shortcuts "just in case." +**Cadence:** the AX-first loop already gives you one image per turn for free — **use it**. 
Only fall back to a manual `desktop.screenshot` when (a) you need a full-display view, (b) the auto-shot failed, or (c) you are recovering from a `loop_warning`. Do not spam extra screenshots before ordinary moves "just in case" — the auto-attached one already covers you. ### Screenshot path (lowest targeting tier) After **`click_element`** and **`move_to_text`** are exhausted or inappropriate, use **`screenshot`** for **confirmation** -- not for inventing move coordinates. diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index 0b376c592..e8f83c1b6 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -148,11 +148,26 @@ pub struct ComputerUseImageContentRect { pub height: u32, } +/// Approximate global screen rectangle covered by the screenshot image. Values +/// are in the same coordinate space as `ClickTarget::ScreenXy`. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct ComputerUseImageGlobalBounds { + pub left: f64, + pub top: f64, + pub width: f64, + pub height: f64, +} + /// Screenshot payload for the model and for pointer coordinate mapping. /// The `ComputerUse` tool embeds these fields in tool-result JSON and adds **`hierarchical_navigation`** /// (`full_display` vs `region_crop`, plus **`shortcut_policy`**). -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct ComputerScreenshot { + /// Stable id for this exact screenshot coordinate basis. Follow-up + /// `ClickTarget::ImageXy` / `ImageGrid` calls should pass this id so the + /// host maps image pixels against the same frame the model saw. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub screenshot_id: Option, pub bytes: Vec, pub mime_type: String, /// Dimensions of the image attached for the model (may be downscaled). 
@@ -186,6 +201,11 @@ pub struct ComputerScreenshot { /// Screen capture rectangle in JPEG pixel coordinates (offset zero when there is no frame padding); `ComputerUseMousePrecise` maps this rect to the display. #[serde(default, skip_serializing_if = "Option::is_none")] pub image_content_rect: Option, + /// Approximate global screen rectangle represented by the screenshot. Use + /// `ClickTarget::ImageXy` when clicking from the attached image; this field + /// is a human/model hint and the host uses its precise internal map. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub image_global_bounds: Option, /// Condensed text representation of the UI tree, focusing on interactive elements (inspired by TuriX-CUA). #[serde(default, skip_serializing_if = "Option::is_none")] pub ui_tree_text: Option, @@ -227,6 +247,13 @@ pub struct OcrTextMatch { pub struct UiElementLocateQuery { #[serde(default)] pub title_contains: Option, + /// **Wide** text needle: matched against `title | value | description | help` of each AX node + /// (case-insensitive substring). Use this when the on-screen visible text is not in `AXTitle` + /// (e.g. a card whose label sits in `AXValue` of a child `AXStaticText`, or a button labelled + /// only via `AXDescription`). Independent of `title_contains` — both can be supplied and + /// `filter_combine` controls the boolean. + #[serde(default)] + pub text_contains: Option, #[serde(default)] pub role_substring: Option, #[serde(default)] @@ -238,6 +265,16 @@ pub struct UiElementLocateQuery { /// `"any"`: at least one non-empty filter matches (OR) — useful when title and role are not both present on one node (e.g. search field with empty AXTitle). #[serde(default)] pub filter_combine: Option, + /// Direct AX-node-index pin from the most recent `get_app_state` snapshot for the same + /// application. When present the host SHORT-CIRCUITS BFS and resolves the node from its + /// per-pid cache. 
Always preferred over text/role filters when an `AppStateSnapshot` is + /// available — guarantees the exact node the model already saw, not a re-ranked guess. + #[serde(default)] + pub node_idx: Option, + /// Optional digest from the same `AppStateSnapshot` that produced `node_idx`. When set the + /// host returns `AX_IDX_STALE` if the cached snapshot has rotated. Omit for a "loose" lookup. + #[serde(default)] + pub app_state_digest: Option, } /// Matched element geometry from the accessibility tree: center plus **axis-aligned bounds** (four corners). @@ -276,6 +313,16 @@ pub struct UiElementLocateResult { /// Brief descriptions of other matches (up to 4) for disambiguation. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub other_matches: Vec, + /// AX-tree node index of the matched element when resolvable from the most recent + /// `get_app_state` cache (e.g. macOS). Pass back as `node_idx` for the cheapest possible + /// follow-up `click_element` / `locate` call. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub matched_node_idx: Option, + /// Which filter type produced the match: one of `"node_idx" | "text_contains" | + /// "title_contains" | "role_substring" | "identifier_contains" | "climbed"`. + /// `"climbed"` indicates a static-text leaf was promoted to its nearest clickable ancestor. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub matched_via: Option, } /// Hit-tested accessibility node at a global screen point (OCR disambiguation). @@ -557,6 +604,829 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { fn focused_display_id(&self) -> Option { None } + + // ------------------------------------------------------------------- + // Codex-style AX-first desktop API (Phase 1: trait surface only). + // + // All methods default to `not available` so existing platform hosts + // (macOS/Linux/Windows desktop, headless test hosts) continue to + // compile and behave exactly as before. 
Concrete implementations are + // landed in subsequent phases (macos_ax_dump, desktop_host PID-events, + // linux/windows AT-SPI/UIA, ControlHub dispatch). + // ------------------------------------------------------------------- + + /// Whether this host can dispatch synthetic input events to a target + /// application **without** stealing the user's foreground focus or + /// moving their physical cursor. macOS desktop will set this to true + /// once the `CGEventPostToPid` + private-source path is wired and the + /// startup self-check passes; non-macOS hosts stay `false` for now. + fn supports_background_input(&self) -> bool { + false + } + + /// Whether this host can dump a structured accessibility tree per + /// running application (Codex-style `` payload). macOS uses + /// AX, Linux uses AT-SPI2, Windows uses UIA. Hosts without an AX + /// backend stay `false` so the model falls back to the screenshot path. + fn supports_ax_tree(&self) -> bool { + false + } + + /// Enumerate running applications, sorted by recency / launch count + /// (Codex's `list_apps`). Default: empty list — callers should treat an + /// empty result as "not available on this host". + async fn list_apps(&self, _include_hidden: bool) -> BitFunResult> { + Ok(vec![]) + } + + /// Dump the accessibility tree of a target application, returning a + /// stable [`AppStateSnapshot`] (Codex's `get_app_state`). Default: + /// unsupported. Implementations cache `idx → element` so + /// [`Self::app_click`] etc. can address nodes by index. + async fn get_app_state( + &self, + _app: AppSelector, + _max_depth: u32, + _focus_window_only: bool, + ) -> BitFunResult { + Err(BitFunError::tool( + "get_app_state is not available on this host.".to_string(), + )) + } + + /// Click inside a target application. When [`ClickTarget::NodeIdx`] is + /// used, the host first tries the AX action path + /// (`AXUIElementPerformAction`) and falls back to a PID-scoped + /// synthetic mouse event. 
Returns the after-state snapshot so the + /// model can verify the change in a single round-trip. + async fn app_click(&self, _params: AppClickParams) -> BitFunResult { + Err(BitFunError::tool( + "app_click is not available on this host.".to_string(), + )) + } + + /// Type text into a target application, optionally focusing a node + /// first via AX `kAXValue`/`kAXFocused`. Returns the after-state. + async fn app_type_text( + &self, + _app: AppSelector, + _text: &str, + _focus: Option, + ) -> BitFunResult { + Err(BitFunError::tool( + "app_type_text is not available on this host.".to_string(), + )) + } + + /// Scroll inside a target application; `dx`/`dy` are pixel deltas in + /// host pointer space. Optional `focus` narrows the scroll target via + /// AX `kAXScrollPosition`. + async fn app_scroll( + &self, + _app: AppSelector, + _focus: Option, + _dx: i32, + _dy: i32, + ) -> BitFunResult { + Err(BitFunError::tool( + "app_scroll is not available on this host.".to_string(), + )) + } + + /// Send a key chord (e.g. `["command", "f"]`) to a target application + /// via PID-scoped events. Optional `focus_idx` first focuses an AX node. + async fn app_key_chord( + &self, + _app: AppSelector, + _keys: Vec, + _focus_idx: Option, + ) -> BitFunResult { + Err(BitFunError::tool( + "app_key_chord is not available on this host.".to_string(), + )) + } + + /// Poll an application's AX tree until `pred` matches or `timeout_ms` + /// elapses. Returns the matching snapshot. Default: unsupported. + async fn app_wait_for( + &self, + _app: AppSelector, + _pred: AppWaitPredicate, + _timeout_ms: u32, + _poll_ms: u32, + ) -> BitFunResult { + Err(BitFunError::tool( + "app_wait_for is not available on this host.".to_string(), + )) + } + + // ------------------------------------------------------------------- + // Interactive-View (Set-of-Mark) API — TuriX-CUA inspired. + // + // Goal: collapse the model's "where do I click?" 
decision into a single + // numeric index `i` that is rendered as a coloured numbered box on top + // of a focused-window screenshot. The model picks `i`, the host + // resolves it back to an authoritative AX action — no coordinate + // guessing, no JPEG-pixel arithmetic. + // + // Defaults are `not available` so non-desktop / non-AX hosts continue + // to compile and behave exactly as before. + // ------------------------------------------------------------------- + + /// Whether this host can build a Set-of-Mark interactive view (filtered + /// AX elements + numbered overlay screenshot). Hosts without an AX + /// backend stay `false`. + fn supports_interactive_view(&self) -> bool { + false + } + + /// Build a Set-of-Mark view for the given application: filters the AX + /// tree to interactive elements, assigns a dense `i` index per element, + /// and overlays numbered colour-coded boxes on the focused-window + /// screenshot. The returned [`InteractiveView`] is the **default** input + /// surface the model should use for desktop GUI work. + async fn build_interactive_view( + &self, + _app: AppSelector, + _opts: InteractiveViewOpts, + ) -> BitFunResult { + Err(BitFunError::tool( + "build_interactive_view is not available on this host.".to_string(), + )) + } + + /// Click an element by its [`InteractiveElement::i`] index from the most + /// recent [`InteractiveView`] of the same application. Returns the + /// after-state view (re-built post-action) when `return_view=true`, else + /// just the bare [`AppStateSnapshot`] for cheaper polling. + async fn interactive_click( + &self, + _app: AppSelector, + _params: InteractiveClickParams, + ) -> BitFunResult { + Err(BitFunError::tool( + "interactive_click is not available on this host.".to_string(), + )) + } + + /// Type text into an element by its `i` index (focuses first via AX, + /// then dispatches PID-scoped key events / paste). When `i` is `None`, + /// types into the currently focused element. 
+ async fn interactive_type_text( + &self, + _app: AppSelector, + _params: InteractiveTypeTextParams, + ) -> BitFunResult { + Err(BitFunError::tool( + "interactive_type_text is not available on this host.".to_string(), + )) + } + + /// Scroll inside (or over) an element by its `i` index. Pass `i=None` + /// to scroll over the focused window. + async fn interactive_scroll( + &self, + _app: AppSelector, + _params: InteractiveScrollParams, + ) -> BitFunResult { + Err(BitFunError::tool( + "interactive_scroll is not available on this host.".to_string(), + )) + } + + /// Whether this host can build a generic visual mark view for arbitrary + /// non-AX/non-OCR surfaces. Unlike [`Self::build_interactive_view`], this + /// does not require accessibility nodes; it marks candidate points in the + /// screenshot itself. + fn supports_visual_mark_view(&self) -> bool { + false + } + + async fn build_visual_mark_view( + &self, + _app: AppSelector, + _opts: VisualMarkViewOpts, + ) -> BitFunResult { + Err(BitFunError::tool( + "build_visual_mark_view is not available on this host.".to_string(), + )) + } + + async fn visual_click( + &self, + _app: AppSelector, + _params: VisualClickParams, + ) -> BitFunResult { + Err(BitFunError::tool( + "visual_click is not available on this host.".to_string(), + )) + } +} + +// ===================================================================== +// Codex-style AX-first data types (Phase 1: surface-only definitions). +// ===================================================================== + +/// Identifies a target application for the Codex-style `app_*` actions. +/// At least one of `name` / `bundle_id` / `pid` must be set; hosts pick +/// the most specific available (pid > bundle_id > name). 
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct AppSelector { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub bundle_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pid: Option, +} + +impl AppSelector { + /// Convenience: select by name only (e.g. `"Safari"`). + pub fn by_name(name: impl Into) -> Self { + Self { + name: Some(name.into()), + bundle_id: None, + pid: None, + } + } + + /// Convenience: select by pid only. + pub fn by_pid(pid: i32) -> Self { + Self { + name: None, + bundle_id: None, + pid: Some(pid), + } + } + + /// Convenience: select by bundle id (macOS). + pub fn by_bundle_id(bundle_id: impl Into) -> Self { + Self { + name: None, + bundle_id: Some(bundle_id.into()), + pid: None, + } + } + + /// True when no selector field is populated. + pub fn is_empty(&self) -> bool { + self.name.is_none() && self.bundle_id.is_none() && self.pid.is_none() + } +} + +/// One running application, returned by [`ComputerUseHost::list_apps`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct AppInfo { + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub bundle_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pid: Option, + /// Whether the application currently has at least one running process. + pub running: bool, + /// Unix-epoch milliseconds of last user activation, when the host can + /// resolve it from LaunchServices / equivalent. Used for ordering. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub last_used_ms: Option, + /// Cumulative launch count, when the host can resolve it. + #[serde(default)] + pub launch_count: u64, +} + +/// One node of a Codex-style accessibility tree. 
+/// +/// Indices are dense and stable **within a single +/// [`AppStateSnapshot`]** — they are only valid until the next +/// `get_app_state` / `app_*` call, after which the host re-dumps the tree +/// and assigns fresh indices. Callers that need to chain mutations should +/// use the snapshot returned from the previous mutation as the new +/// addressing basis. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct AxNode { + /// Stable index inside this snapshot. Zero is the application root. + pub idx: u32, + /// Parent index, `None` for the root. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub parent_idx: Option, + /// Native role string (e.g. macOS AX `AXButton`). + pub role: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub title: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub value: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub identifier: Option, + pub enabled: bool, + pub focused: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub selected: Option, + /// Frame in **global** pointer space: `(x, y, width, height)`. `None` + /// when the AX backend cannot resolve the position. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_global: Option<(f64, f64, f64, f64)>, + /// Names of supported AX actions (e.g. `kAXPress`, `kAXShowMenu`). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub actions: Vec, + /// Localized role description (`AXRoleDescription` on macOS), e.g. + /// "standard window", "close button", "scroll area", "HTML content", + /// "tab group". Codex-style renderers prefer this over [`Self::role`] + /// because it matches what a sighted user would call the element. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub role_description: Option, + /// Native AX subrole (e.g. 
`AXCloseButton`, `AXFullScreenButton`, + /// `AXMinimizeButton`, `AXSecureTextField`). Useful for button + /// disambiguation when `role` is generic. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub subrole: Option, + /// `AXHelp` / tooltip text — frequently the only place an icon-only + /// button explains itself. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub help: Option, + /// `AXURL` for `AXWebArea` / "HTML content" nodes (e.g. Tauri + /// `tauri://localhost`, Electron `file://…`, Safari pages). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + /// `AXExpanded` for disclosure controls / collapsible sidebars. + /// `Some(true)` = expanded, `Some(false)` = collapsed, `None` = + /// attribute not exposed by the element. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub expanded: Option, +} + +/// Snapshot of an application's AX tree. Returned by +/// [`ComputerUseHost::get_app_state`] and as the after-state of every +/// `app_*` mutation so the model can verify changes in one round-trip. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct AppStateSnapshot { + /// Identity of the captured application. + pub app: AppInfo, + /// Title of the focused window when `focus_window_only=true`, else + /// the frontmost-window title (best effort). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub window_title: Option, + /// Codex-style human-readable text rendering of the tree (used in the + /// model prompt). Indices in `tree_text` match `nodes[i].idx`. + pub tree_text: String, + /// Structured nodes, dense indexing. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub nodes: Vec, + /// Stable digest of the snapshot (lowercase hex SHA1 of the canonical + /// node payload). Used as `before_app_state_digest` to detect "no-op" + /// mutations and as a cheap equality check between successive + /// snapshots. 
+ pub digest: String, + /// Unix-epoch milliseconds when the snapshot was captured. + pub captured_at_ms: u64, + /// **Auto-attached** focused-window screenshot (Codex parity). The host + /// captures the visible pixels of the target app's frontmost window + /// every time `get_app_state` (or any `app_*` mutation) returns, so + /// the model is never blind on canvas / WebView / WebGL surfaces that + /// the AX tree cannot describe (e.g. the Gobang board). `None` only + /// when the host explicitly opted out (e.g. inner `app_wait_for` + /// polls) or the capture itself failed. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub screenshot: Option, + /// Optional per-snapshot warning emitted by the host when it detects + /// the agent is targeting the same node / coordinate repeatedly without + /// progress. The recommended remediation is encoded directly in the + /// message and the model is expected to switch tactic (take a real + /// `screenshot`, fall back to keyboard, re-locate via OCR, …) on the + /// **very next** turn rather than retry the failing target. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub loop_warning: Option, +} + +// ===================================================================== +// Interactive-View (Set-of-Mark) data types — TuriX-CUA inspired. +// ===================================================================== + +/// Options for [`ComputerUseHost::build_interactive_view`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct InteractiveViewOpts { + /// When `true` (default) only emit elements inside the focused window + /// of the target application; when `false` emit every interactive + /// element across all windows of the app (heavier overlay). + #[serde(default = "default_focus_window_only_true")] + pub focus_window_only: bool, + /// Maximum number of interactive elements to include / annotate. 
The + /// host trims by visual area (largest first) when exceeded so the + /// overlay stays legible. `None` → host default (typically ~80). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_elements: Option, + /// When `true` (default), the host paints numbered coloured boxes on a + /// fresh focused-window screenshot. Set `false` to skip the overlay + /// (text-only payload — cheaper, useful for retries / loop probes). + #[serde(default = "default_annotate_true")] + pub annotate_screenshot: bool, + /// When `true` (default), include the compact `tree_text` rendering of + /// the filtered elements alongside the structured `elements` array. + #[serde(default = "default_include_tree_text_true")] + pub include_tree_text: bool, +} + +fn default_focus_window_only_true() -> bool { + true +} +fn default_annotate_true() -> bool { + true +} +fn default_include_tree_text_true() -> bool { + true +} + +impl Default for InteractiveViewOpts { + fn default() -> Self { + Self { + focus_window_only: true, + max_elements: None, + annotate_screenshot: true, + include_tree_text: true, + } + } +} + +/// One interactive element inside an [`InteractiveView`]. The [`Self::i`] +/// field is the only handle the model is expected to use — every other +/// field is informational so the model can disambiguate between visually +/// similar boxes. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveElement { + /// Dense per-view index (0-based). The single source of truth the + /// model passes back via [`ClickIndexTarget::Index`] / + /// [`InteractiveClickParams::i`]. + pub i: u32, + /// Underlying [`AxNode::idx`] in the snapshot embedded in this view. + /// Hosts use this to round-trip back to existing `app_click` / + /// `app_type_text` plumbing. + pub node_idx: u32, + /// Native AX role (`AXButton`, `AXTextField`, …). The overlay colour + /// is derived from this. 
+ pub role: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub subrole: Option, + /// Best human-readable label for the element (title → description → + /// help → value, whichever is non-empty first). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub label: Option, + /// Frame in **JPEG image pixel** space of the overlay screenshot + /// (`x, y, width, height`). When `annotate_screenshot=false` the host + /// may return `None` for elements outside the captured window. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_image: Option<(u32, u32, u32, u32)>, + /// Frame in **global pointer** space (`x, y, width, height`). Useful + /// for hosts that need a coordinate fallback when AX press fails. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_global: Option<(f64, f64, f64, f64)>, + /// `true` when the element is focusable / actionable right now. + #[serde(default = "default_true")] + pub enabled: bool, + #[serde(default, skip_serializing_if = "is_false")] + pub focused: bool, + /// Whether the host can dispatch a press via AX (vs. falling back to a + /// pointer click). + #[serde(default = "default_true")] + pub ax_actionable: bool, +} + +fn default_true() -> bool { + true +} + +/// Set-of-Mark interactive snapshot returned by +/// [`ComputerUseHost::build_interactive_view`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveView { + /// Identity of the captured application. + pub app: AppInfo, + /// Title of the focused window (or `None` when the host could not + /// resolve it). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub window_title: Option, + /// Filtered + sorted interactive elements with dense `i` indices. + pub elements: Vec, + /// Compact text rendering of `elements` (one element per line, prefixed + /// with `[i] role "label"`). Empty string when + /// `opts.include_tree_text=false`. 
+ #[serde(default, skip_serializing_if = "String::is_empty")] + pub tree_text: String, + /// Stable lowercase-hex SHA1 over the canonical element payload. + /// Subsequent `interactive_*` calls echo this back as + /// `before_view_digest` so the host can detect "stale index" usage. + pub digest: String, + /// Unix-epoch milliseconds when the view was captured. + pub captured_at_ms: u64, + /// Annotated focused-window screenshot (numbered coloured boxes). + /// `None` when `opts.annotate_screenshot=false` or the capture failed. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub screenshot: Option, + /// Loop / no-progress warning, mirrored from + /// [`AppStateSnapshot::loop_warning`]. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub loop_warning: Option, +} + +/// Where an [`ComputerUseHost::interactive_click`] should land. `Index` +/// is the canonical addressing mode; the other variants exist only so +/// hosts can transparently fall back to existing `app_click` paths when +/// AX press is rejected for a given element. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "snake_case", tag = "kind")] +pub enum ClickIndexTarget { + /// `i` value from [`InteractiveElement::i`]. + Index { i: u32 }, + /// Authoritative AX node index (used internally when the host falls + /// back from a stale interactive index). + NodeIdx { idx: u32 }, +} + +/// Parameters for [`ComputerUseHost::interactive_click`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveClickParams { + /// Required: the `i` index from the most recent interactive view. + pub i: u32, + /// Echo of [`InteractiveView::digest`] so the host can detect stale + /// indices when the UI changed between view + click. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub before_view_digest: Option, + #[serde(default = "default_click_count_one")] + pub click_count: u8, + /// `"left"` / `"right"` / `"middle"`. 
+ #[serde(default = "default_left_button")] + pub mouse_button: String, + /// Modifier names (e.g. `["command"]`). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub modifier_keys: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub wait_ms_after: Option, + /// Whether the host should re-build the interactive view after the + /// click (default `true` — the model gets a fresh annotated screenshot + /// for the next turn). Set `false` when chaining many `interactive_*` + /// calls in a row to save on overlay rendering. + #[serde(default = "default_true")] + pub return_view: bool, +} + +fn default_click_count_one() -> u8 { + 1 +} +fn default_left_button() -> String { + "left".to_string() +} + +/// Parameters for [`ComputerUseHost::interactive_type_text`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveTypeTextParams { + /// `i` index of the text field. `None` types into whatever element is + /// currently focused. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub i: Option, + pub text: String, + /// When `true`, host clears the field via `cmd+a` + `delete` (macOS) + /// or equivalent before typing. + #[serde(default, skip_serializing_if = "is_false")] + pub clear_first: bool, + /// When `true`, host presses `return` after typing. + #[serde(default, skip_serializing_if = "is_false")] + pub press_enter_after: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub before_view_digest: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub wait_ms_after: Option, + #[serde(default = "default_true")] + pub return_view: bool, +} + +/// Parameters for [`ComputerUseHost::interactive_scroll`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveScrollParams { + /// `i` index of the scroll target. `None` scrolls at pointer / focused + /// window centre. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub i: Option, + /// Vertical scroll amount in lines / "wheel ticks" (positive = down). + #[serde(default)] + pub dy: i32, + /// Horizontal scroll amount in lines / "wheel ticks" (positive = right). + #[serde(default)] + pub dx: i32, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub before_view_digest: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub wait_ms_after: Option, + #[serde(default = "default_true")] + pub return_view: bool, +} + +/// Result envelope for `interactive_*` actions. Always carries the bare +/// AX snapshot; the rendered [`InteractiveView`] is only populated when +/// the caller asked for it via `return_view=true`. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct InteractiveActionResult { + pub snapshot: AppStateSnapshot, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub view: Option, + /// Best-effort note about how the host actually executed the request + /// (e.g. `"ax_press"`, `"pointer_click_fallback"`, + /// `"index_resolved_via_node_idx"`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub execution_note: Option, +} + +/// Options for generic visual marking. This is intentionally UI-agnostic: +/// hosts should produce useful candidate points even when AX/OCR exposes +/// nothing, such as Canvas, games, maps, drawings, and icon-only controls. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct VisualMarkViewOpts { + /// Max candidate points to emit. Default keeps the overlay readable. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_points: Option, + /// Optional region in screenshot image pixels to mark. When omitted, + /// the host marks the whole app screenshot. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub region: Option, + /// Include regular grid points. Default true. 
+ #[serde(default = "default_true")] + pub include_grid: bool, +} + +impl Default for VisualMarkViewOpts { + fn default() -> Self { + Self { + max_points: None, + region: None, + include_grid: true, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct VisualImageRegion { + pub x0: u32, + pub y0: u32, + pub width: u32, + pub height: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct VisualMark { + pub i: u32, + pub x: i32, + pub y: i32, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_image: Option<(u32, u32, u32, u32)>, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub label: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct VisualMarkView { + pub app: AppInfo, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub window_title: Option, + pub marks: Vec, + pub digest: String, + pub captured_at_ms: u64, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub screenshot: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct VisualClickParams { + pub i: u32, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub before_view_digest: Option, + #[serde(default = "default_click_count_one")] + pub click_count: u8, + #[serde(default = "default_left_button")] + pub mouse_button: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub modifier_keys: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub wait_ms_after: Option, + #[serde(default = "default_true")] + pub return_view: bool, +} + +/// Result envelope for `visual_*` actions. This mirrors +/// [`InteractiveActionResult`], but carries a [`VisualMarkView`] because the +/// addressing basis is screenshot marks rather than AX elements. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct VisualActionResult { + pub snapshot: AppStateSnapshot, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub view: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub execution_note: Option, +} + +/// Where an [`ComputerUseHost::app_click`] should land. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "snake_case", tag = "kind")] +pub enum ClickTarget { + /// Global screen-space coordinates (same space as `mouse_move`). + ScreenXy { x: f64, y: f64 }, + /// Pixel coordinates in the most recent screenshot attached by + /// `get_app_state` / `screenshot`. This is the preferred target for + /// visual surfaces such as Canvas, SVG boards, and WebGL scenes. + ImageXy { + x: i32, + y: i32, + #[serde(default, skip_serializing_if = "Option::is_none")] + screenshot_id: Option, + }, + /// Grid target inside the most recent screenshot attached by + /// `get_app_state` / `app_click`. This is for non-text visual surfaces + /// such as boards and canvases where a single guessed pixel is brittle. + /// + /// `x0/y0/width/height` describe the board/grid rectangle in screenshot + /// image pixels. `row` and `col` are zero-based. When `intersections` is + /// true, rows/cols are line intersections (e.g. Go/Gomoku 15x15); when + /// false, rows/cols are cells and the click lands in the cell center. + ImageGrid { + x0: i32, + y0: i32, + width: u32, + height: u32, + rows: u32, + cols: u32, + row: u32, + col: u32, + #[serde(default)] + intersections: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + screenshot_id: Option, + }, + /// Self-locating regular visual grid target. The host captures the app + /// screenshot, detects a regular line grid, then clicks the requested + /// row/col in the detected grid. Use when the surface is custom-drawn and + /// the grid rectangle is not exposed by AX/OCR. 
+ VisualGrid { + rows: u32, + cols: u32, + row: u32, + col: u32, + #[serde(default)] + intersections: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + wait_ms_after_detection: Option, + }, + /// AX node addressed by index inside the most recent + /// [`AppStateSnapshot`] for this app. + NodeIdx { idx: u32 }, + /// OCR text needle: the host screenshots the target app, runs OCR, + /// and clicks the centre of the highest-confidence match. Used as a + /// fallback when the AX tree does not expose the desired element + /// (e.g. inside a Canvas / WebGL / custom-drawn surface). + OcrText { needle: String }, +} + +/// Parameters for [`ComputerUseHost::app_click`]. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct AppClickParams { + pub app: AppSelector, + pub target: ClickTarget, + /// Number of clicks (1 = single, 2 = double, 3 = triple). + #[serde(default = "AppClickParams::default_click_count")] + pub click_count: u8, + /// `"left"` / `"right"` / `"middle"`. + #[serde(default = "AppClickParams::default_button")] + pub mouse_button: String, + /// Modifier names held during the click (e.g. `["command"]`). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub modifier_keys: Vec, + /// Optional settle delay before returning the after-state screenshot. + /// Useful for game boards, WebViews, animations, and delayed AI moves. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub wait_ms_after: Option, +} + +impl AppClickParams { + fn default_click_count() -> u8 { + 1 + } + fn default_button() -> String { + "left".to_string() + } +} + +/// Predicate for [`ComputerUseHost::app_wait_for`]. +/// +/// Hosts that don't yet implement AX waiting can simply return the +/// `app_wait_for is not available` default error; consumers fall back to +/// `wait_ms` + `get_app_state`. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "snake_case", tag = "kind")] +pub enum AppWaitPredicate { + /// Wait until the AX tree digest changes from `prev_digest`. + DigestChanged { prev_digest: String }, + /// Wait until any node's `title` contains the given substring. + TitleContains { needle: String }, + /// Wait until any node has the given role and `enabled == true`. + RoleEnabled { role: String }, + /// Wait until the node identified by `idx` reports `enabled=true`. + NodeEnabled { idx: u32 }, } /// One physical display reported by the desktop host. Returned by @@ -719,4 +1589,223 @@ mod tests { serde_json::json!(true) ); } + + #[test] + fn app_selector_constructors_populate_only_one_field() { + let by_name = AppSelector::by_name("Safari"); + assert_eq!(by_name.name.as_deref(), Some("Safari")); + assert!(by_name.bundle_id.is_none() && by_name.pid.is_none()); + assert!(!by_name.is_empty()); + + let empty = AppSelector::default(); + assert!(empty.is_empty()); + } + + #[test] + fn click_target_serializes_with_kind_tag() { + let xy = ClickTarget::ScreenXy { x: 10.5, y: 20.0 }; + let v = serde_json::to_value(&xy).expect("serialize ScreenXy"); + assert_eq!(v["kind"], "screen_xy"); + assert_eq!(v["x"], serde_json::json!(10.5)); + + let image_xy = ClickTarget::ImageXy { + x: 100, + y: 200, + screenshot_id: Some("shot_1".to_string()), + }; + let v = serde_json::to_value(&image_xy).expect("serialize ImageXy"); + assert_eq!(v["kind"], "image_xy"); + assert_eq!(v["x"], serde_json::json!(100)); + assert_eq!(v["screenshot_id"], serde_json::json!("shot_1")); + + let grid = ClickTarget::ImageGrid { + x0: 10, + y0: 20, + width: 300, + height: 300, + rows: 15, + cols: 15, + row: 7, + col: 7, + intersections: true, + screenshot_id: Some("shot_1".to_string()), + }; + let v = serde_json::to_value(&grid).expect("serialize ImageGrid"); + assert_eq!(v["kind"], "image_grid"); + assert_eq!(v["intersections"], serde_json::json!(true)); + + let 
visual_grid = ClickTarget::VisualGrid { + rows: 15, + cols: 15, + row: 7, + col: 7, + intersections: true, + wait_ms_after_detection: None, + }; + let v = serde_json::to_value(&visual_grid).expect("serialize VisualGrid"); + assert_eq!(v["kind"], "visual_grid"); + assert_eq!(v["rows"], serde_json::json!(15)); + + let node = ClickTarget::NodeIdx { idx: 7 }; + let v = serde_json::to_value(&node).expect("serialize NodeIdx"); + assert_eq!(v["kind"], "node_idx"); + assert_eq!(v["idx"], serde_json::json!(7)); + + let round_trip: ClickTarget = + serde_json::from_value(v).expect("deserialize node_idx click target"); + assert_eq!(round_trip, ClickTarget::NodeIdx { idx: 7 }); + } + + #[test] + fn app_click_params_apply_defaults_on_deserialize() { + let json = serde_json::json!({ + "app": { "name": "Safari" }, + "target": { "kind": "node_idx", "idx": 3 }, + }); + let parsed: AppClickParams = + serde_json::from_value(json).expect("deserialize minimal AppClickParams"); + assert_eq!(parsed.click_count, 1); + assert_eq!(parsed.mouse_button, "left"); + assert!(parsed.modifier_keys.is_empty()); + assert_eq!(parsed.wait_ms_after, None); + assert_eq!(parsed.app.name.as_deref(), Some("Safari")); + assert_eq!(parsed.target, ClickTarget::NodeIdx { idx: 3 }); + } + + #[test] + fn interactive_view_opts_apply_defaults_on_minimal_json() { + let parsed: InteractiveViewOpts = + serde_json::from_value(serde_json::json!({})).expect("deserialize empty opts"); + assert!(parsed.focus_window_only); + assert!(parsed.annotate_screenshot); + assert!(parsed.include_tree_text); + assert_eq!(parsed.max_elements, None); + } + + #[test] + fn interactive_view_round_trips() { + let view = InteractiveView { + app: AppInfo { + name: "Safari".into(), + bundle_id: Some("com.apple.Safari".into()), + pid: Some(123), + running: true, + last_used_ms: None, + launch_count: 0, + }, + window_title: Some("Apple".into()), + elements: vec![InteractiveElement { + i: 0, + node_idx: 17, + role: "AXButton".into(), + subrole: 
Some("AXCloseButton".into()), + label: Some("Close".into()), + frame_image: Some((10, 20, 30, 40)), + frame_global: Some((11.0, 21.0, 30.0, 40.0)), + enabled: true, + focused: false, + ax_actionable: true, + }], + tree_text: "[0] AXButton \"Close\"".into(), + digest: "abc123".into(), + captured_at_ms: 1700000000000, + screenshot: None, + loop_warning: None, + }; + let v = serde_json::to_value(&view).expect("serialize view"); + assert_eq!(v["digest"], "abc123"); + assert_eq!(v["elements"][0]["i"], 0); + assert_eq!(v["elements"][0]["node_idx"], 17); + let back: InteractiveView = serde_json::from_value(v).expect("deserialize view"); + assert_eq!(back, view); + } + + #[test] + fn click_index_target_serializes_with_kind_tag() { + let by_idx = ClickIndexTarget::Index { i: 5 }; + let v = serde_json::to_value(&by_idx).expect("serialize"); + assert_eq!(v["kind"], "index"); + assert_eq!(v["i"], 5); + let back: ClickIndexTarget = serde_json::from_value(v).expect("deserialize"); + assert_eq!(back, ClickIndexTarget::Index { i: 5 }); + + let by_node = ClickIndexTarget::NodeIdx { idx: 9 }; + let v = serde_json::to_value(&by_node).expect("serialize"); + assert_eq!(v["kind"], "node_idx"); + assert_eq!(v["idx"], 9); + } + + #[test] + fn interactive_click_params_apply_defaults() { + let parsed: InteractiveClickParams = serde_json::from_value(serde_json::json!({"i": 3})) + .expect("deserialize minimal click params"); + assert_eq!(parsed.i, 3); + assert_eq!(parsed.click_count, 1); + assert_eq!(parsed.mouse_button, "left"); + assert!(parsed.modifier_keys.is_empty()); + assert!(parsed.return_view); + } + + #[test] + fn visual_mark_params_apply_defaults() { + let opts: VisualMarkViewOpts = + serde_json::from_value(serde_json::json!({})).expect("deserialize minimal opts"); + assert_eq!(opts.max_points, None); + assert_eq!(opts.region, None); + assert!(opts.include_grid); + + let click: VisualClickParams = serde_json::from_value(serde_json::json!({"i": 5})) + .expect("deserialize minimal 
visual click params"); + assert_eq!(click.i, 5); + assert_eq!(click.click_count, 1); + assert_eq!(click.mouse_button, "left"); + assert!(click.modifier_keys.is_empty()); + assert!(click.return_view); + } + + #[test] + fn interactive_type_text_params_round_trip() { + let params = InteractiveTypeTextParams { + i: Some(7), + text: "hello".into(), + clear_first: true, + press_enter_after: true, + before_view_digest: Some("d".into()), + wait_ms_after: Some(100), + return_view: true, + }; + let v = serde_json::to_value(¶ms).expect("serialize"); + let back: InteractiveTypeTextParams = serde_json::from_value(v).expect("deserialize"); + assert_eq!(back, params); + } + + #[test] + fn interactive_scroll_params_apply_defaults() { + let parsed: InteractiveScrollParams = serde_json::from_value(serde_json::json!({})) + .expect("deserialize minimal scroll params"); + assert_eq!(parsed.i, None); + assert_eq!(parsed.dx, 0); + assert_eq!(parsed.dy, 0); + assert!(parsed.return_view); + } + + #[test] + fn app_wait_predicate_round_trips_each_variant() { + for pred in [ + AppWaitPredicate::DigestChanged { + prev_digest: "abc".to_string(), + }, + AppWaitPredicate::TitleContains { + needle: "Save".to_string(), + }, + AppWaitPredicate::RoleEnabled { + role: "AXButton".to_string(), + }, + AppWaitPredicate::NodeEnabled { idx: 12 }, + ] { + let v = serde_json::to_value(&pred).expect("serialize predicate"); + let back: AppWaitPredicate = serde_json::from_value(v).expect("deserialize predicate"); + assert_eq!(back, pred); + } + } } diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs index d97675200..193f30ffd 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs @@ -64,6 +64,18 @@ pub(crate) async fn execute_computer_use_locate( .get("filter_combine") .and_then(|v| v.as_str()) 
.map(|s| s.to_string()), + text_contains: input + .get("text_contains") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + node_idx: input + .get("node_idx") + .and_then(|v| v.as_u64()) + .map(|v| v as u32), + app_state_digest: input + .get("app_state_digest") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), }; let input_coords = json!({ @@ -71,6 +83,9 @@ pub(crate) async fn execute_computer_use_locate( "title_contains": query.title_contains.clone(), "role_substring": query.role_substring.clone(), "identifier_contains": query.identifier_contains.clone(), + "text_contains": query.text_contains.clone(), + "node_idx": query.node_idx, + "app_state_digest": query.app_state_digest.clone(), "max_depth": query.max_depth, "filter_combine": query.filter_combine.clone(), }); diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs index 820043c2c..b542cb176 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs @@ -89,6 +89,7 @@ mod tests { #[test] fn screenshot_body_keeps_existing_fields_and_adds_interaction_state() { let shot = ComputerScreenshot { + screenshot_id: Some("test-shot".to_string()), bytes: vec![1, 2, 3], mime_type: "image/jpeg".to_string(), image_width: 100, @@ -110,6 +111,7 @@ mod tests { width: 98, height: 76, }), + image_global_bounds: None, implicit_confirmation_crop_applied: false, ui_tree_text: None, }; diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 7998b8467..805a74bc8 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -106,9 +106,9 @@ The **primary model cannot consume images** in tool results — **do not** 
use * **ACTION PRIORITY (CRITICAL):** Always think in this order:\n\ 1. **Terminal/CLI/System commands first** — Use Bash tool for terminal commands, system scripts (e.g., macOS `osascript`), shell automation. Most efficient.\n\ 2. **Keyboard shortcuts second** — Use **`key_chord`** / **`type_text`** for system/app shortcuts, navigation keys.\n\ -3. **Precise UI control last** — Only when above fail: **`click_element`** (AX) → **`move_to_text`** (OCR, use **`move_to_text_match_index`** from text `candidates` when multiple hits) → **`mouse_move`** (**`use_screen_coordinates`: true** with **`global_center_*`** / **`locate`** / **`pointer_global`**) → **`click`**.\n\ +3. **Precise UI control last** — Only when above fail: **`click_target`** / **`move_to_target`** (AX → OCR → screen coords in one call) → lower-level **`click_element`** / **`move_to_text`** → **`mouse_move`** + **`click`**.\n\ **Rhythm:** one action at a time; use **`wait`** when UI animates. Observe **`interaction_state`** and **`computer_use_context`** in tool JSON.\n\ -**`click_element` / `locate`:** Accessibility (AX/UIA/AT-SPI). **`move_to_text`:** OCR match + move pointer only. **`click`:** at current pointer only — use **`mouse_move`** or **`move_to_text`** / **`click_element`** first.\n\ +**`click_target` / `move_to_target`:** Unified resolver: AX filters or `target_text` first, OCR second, explicit global x/y last. **`click_element` / `locate`:** Accessibility (AX/UIA/AT-SPI). **`move_to_text`:** OCR match + move pointer only. **`click`:** at current pointer only — use **`mouse_move`** or **`move_to_text`** / **`click_element`** first.\n\ **`mouse_move` / `drag`:** **`use_screen_coordinates`: true** with globals from tools. 
**`pointer_move_rel`:** relative nudge; host may block right after certain flows — follow tool errors.\n\ **`key_chord` / `type_text` / `scroll` / `wait`:** standard desktop automation without any screenshot step.\n", os, keys @@ -122,8 +122,8 @@ The **primary model cannot consume images** in tool results — **do not** use * "properties": { "action": { "type": "string", - "enum": ["click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], - "description": "The action to perform. **Primary model is text-only — no `screenshot`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) **`open_app`** to launch apps. **`run_apple_script`** for AppleScript (macOS). 3) Prefer `key_chord` for shortcuts/navigation. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, use `move_to_text_match_index` when multiple hits listed) → `mouse_move` (**`use_screen_coordinates`: true** with globals) + `click`. Never guess coordinates." + "enum": ["click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **Primary model is text-only — no `screenshot`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) **`open_app`** to launch apps. **`run_apple_script`** for AppleScript (macOS). 3) Prefer `key_chord` for shortcuts/navigation. 4) Only when above fail: `click_target` / `move_to_target` (AX → OCR → screen coords in one call), then lower-level `click_element`, `move_to_text`, or `mouse_move` + `click`. Never guess coordinates." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." 
}, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -131,8 +131,8 @@ The **primary model cannot consume images** in tool results — **do not** use * "use_screen_coordinates": { "type": "boolean", "description": "For `mouse_move`, `drag`: **must be true** — global display coordinates from `move_to_text`, `locate`, AX, or `pointer_global`. **Not** for `click`." }, "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For `click`, `click_element`, `drag`: mouse button (default left)." }, "num_clicks": { "type": "integer", "minimum": 1, "maximum": 3, "description": "For `click`, `click_element`: 1=single (default), 2=double, 3=triple click." }, - "delta_x": { "type": "integer", "description": "For `pointer_move_rel`: horizontal delta (negative=left). For `scroll`: horizontal wheel delta." }, - "delta_y": { "type": "integer", "description": "For `pointer_move_rel`: vertical delta (negative=up). For `scroll`: vertical wheel delta." }, + "delta_x": { "type": "integer", "description": "For `pointer_move_rel`: horizontal delta (negative=left); also accepted as `dx`. For `scroll`: horizontal wheel delta." }, + "delta_y": { "type": "integer", "description": "For `pointer_move_rel`: vertical delta (negative=up); also accepted as `dy`. For `scroll`: vertical wheel delta." }, "start_x": { "type": "integer", "description": "For `drag`: start X coordinate." }, "start_y": { "type": "integer", "description": "For `drag`: start Y coordinate." }, "end_x": { "type": "integer", "description": "For `drag`: end X coordinate." }, @@ -140,8 +140,10 @@ The **primary model cannot consume images** in tool results — **do not** use * "keys": { "type": "array", "items": { "type": "string" }, "description": "For `key_chord`: keys in order — modifiers first, then the main key. 
Desktop host waits after pressing modifiers so shortcuts register (important on macOS with IME)." }, "text": { "type": "string", "description": "For `type_text`: text to type. Prefer clipboard paste (key_chord) for long content." }, "ms": { "type": "integer", "description": "For `wait`: duration in milliseconds." }, - "text_query": { "type": "string", "description": "For `move_to_text`: visible text to OCR-match on screen (case-insensitive substring)." }, - "move_to_text_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_text`: **1-based** index from `candidates[].match_index` after disambiguation (multiple OCR hits). Omit on the first pass; set when choosing which hit to move to." }, + "target_text": { "type": "string", "description": "For `move_to_target` / `click_target`: visible or accessible text. The resolver tries AX first, then OCR." }, + "target_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_target` / `click_target`: optional 1-based OCR match index when you want a specific candidate." }, + "text_query": { "type": "string", "description": "For `move_to_text`, `move_to_target`, `click_target`: visible text to OCR-match on screen (case-insensitive substring)." }, + "move_to_text_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_text` and unified target actions: **1-based** OCR match index." }, "ocr_region_native": { "type": "object", "description": "For `move_to_text`: optional global native rectangle for OCR. If omitted, macOS uses the frontmost window bounds from Accessibility; other OSes use the primary display.", @@ -152,11 +154,14 @@ The **primary model cannot consume images** in tool results — **do not** use * "height": { "type": "integer", "minimum": 1, "description": "Height in the same coordinate unit as x0/y0." 
} } }, - "title_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring match on accessible title (AXTitle)." }, - "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole." }, + "title_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXTitle ONLY. Prefer `text_contains` (also covers AXValue/AXDescription/AXHelp)." }, + "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole **or AXSubrole** (e.g. \"Button\", \"SearchField\")." }, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, - "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48)." }, - "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." }, + "text_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring matched against ANY of AXTitle / AXValue / AXDescription / AXHelp. Prefer this when the visible text is shown via value/description (e.g. AXStaticText cards) instead of title." }, + "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`: jump straight to a node returned by the most recent `desktop.get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, + "app_state_digest": { "type": "string", "description": "For `locate`, `click_element`: optional `state_digest` from the same `get_app_state` call that produced `node_idx`. Stale digest yields AX_IDX_STALE so you re-snapshot." 
}, + "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48). Ignored when `node_idx` is supplied." }, + "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination. Priority: `node_idx` > `text_contains` > `title_contains`+`role_substring`." }, "app_name": { "type": "string", "description": "For `open_app`: the application name to launch." }, "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute. macOS only." }, "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to scroll at. Use with `scroll_y`." }, @@ -425,6 +430,169 @@ The **primary model cannot consume images** in tool results — **do not** use * .collect()) } + fn locate_query_has_any_target(query: &UiElementLocateQuery) -> bool { + query.node_idx.is_some() + || query.text_contains.is_some() + || query.title_contains.is_some() + || query.role_substring.is_some() + || query.identifier_contains.is_some() + } + + fn target_text_query<'a>(input: &'a Value, query: &'a UiElementLocateQuery) -> Option<&'a str> { + input + .get("target_text") + .and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()) + .or_else(|| { + input + .get("text_query") + .and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()) + }) + .or_else(|| { + query + .text_contains + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + }) + .or_else(|| { + query + .title_contains + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + }) + } + + async fn resolve_target_point( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + input: &Value, + ) -> BitFunResult { + let mut query = parse_locate_query(input); + if query.text_contains.is_none() { + if let Some(target_text) = input + .get("target_text") + 
.and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()) + { + query.text_contains = Some(target_text.to_string()); + } + } + + let mut ax_error: Option = None; + if Self::locate_query_has_any_target(&query) { + match host_ref + .locate_ui_element_screen_center(query.clone()) + .await + { + Ok(res) => { + return Ok(ResolvedDesktopTarget { + source: "ax".to_string(), + x: res.global_center_x, + y: res.global_center_y, + matched_text: res.matched_title.clone(), + matched_role: Some(res.matched_role), + matched_identifier: res.matched_identifier, + total_matches: Some(res.total_matches.max(1)), + selected_match_index: Some(1), + warning: (res.total_matches > 1).then(|| { + format!( + "{} AX elements matched; selected the host-ranked best match.", + res.total_matches + ) + }), + ax_error: None, + }); + } + Err(err) => { + ax_error = Some(err.to_string()); + } + } + } + + if let Some(text_query) = Self::target_text_query(input, &query) { + let ocr_region_native = parse_ocr_region_native(input)?; + let matches = + Self::find_text_on_screen(host_ref, text_query, ocr_region_native).await?; + if !matches.is_empty() { + let requested_index = input + .get("move_to_text_match_index") + .or_else(|| input.get("target_match_index")) + .and_then(|v| v.as_u64()) + .map(|u| u as usize); + let selected = match requested_index { + Some(idx) if idx >= 1 && idx <= matches.len() => idx - 1, + Some(idx) => { + return Err(BitFunError::tool(format!( + "target_match_index/move_to_text_match_index must be between 1 and {} (got {}).", + matches.len(), + idx + ))); + } + None => matches + .iter() + .enumerate() + .max_by(|(_, a), (_, b)| { + a.confidence + .partial_cmp(&b.confidence) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .map(|(idx, _)| idx) + .unwrap_or(0), + }; + let m = &matches[selected]; + return Ok(ResolvedDesktopTarget { + source: "ocr".to_string(), + x: m.center_x, + y: m.center_y, + matched_text: Some(m.text.clone()), + matched_role: None, + matched_identifier: 
None, + total_matches: Some(matches.len() as u32), + selected_match_index: Some((selected + 1) as u32), + warning: (matches.len() > 1 && requested_index.is_none()).then(|| { + format!( + "{} OCR matches found for {:?}; selected the highest-confidence match. Pass target_match_index to pin another candidate.", + matches.len(), + text_query + ) + }), + ax_error, + }); + } + } + + if input.get("x").is_some() || input.get("y").is_some() { + ensure_pointer_move_uses_screen_coordinates_only(input)?; + let x = req_i32(input, "x")?; + let y = req_i32(input, "y")?; + let (sx64, sy64) = Self::resolve_xy_f64(host_ref, input, x, y)?; + if use_screen_coordinates(input) { + ensure_global_xy_on_display(host_ref, sx64, sy64).await?; + } + return Ok(ResolvedDesktopTarget { + source: "screen_xy".to_string(), + x: sx64, + y: sy64, + matched_text: None, + matched_role: None, + matched_identifier: None, + total_matches: None, + selected_match_index: None, + warning: None, + ax_error, + }); + } + + Err(BitFunError::tool( + "move_to_target/click_target requires a target: node_idx, target_text/text_query/text_contains/title_contains, role_substring, identifier_contains, or x/y with use_screen_coordinates: true.".to_string(), + )) + } + /// Writes the exact JPEG sent to the model (including pointer overlay) under the workspace for debugging. 
async fn try_save_screenshot_for_debug( bytes: &[u8], @@ -509,6 +677,8 @@ The **primary model cannot consume images** in tool results — **do not** use * "point_crop_half_extent_native": shot.point_crop_half_extent_native, "navigation_native_rect": shot.navigation_native_rect, "quadrant_navigation_click_ready": shot.quadrant_navigation_click_ready, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, "implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, "debug_screenshot_path": debug_rel, "ui_tree_text": shot.ui_tree_text, @@ -705,6 +875,56 @@ fn computer_use_snapshot_coordinate_basis( } } +/// Verify a global (gx, gy) coordinate falls within at least one display reported by +/// the host. Returns a structured `DESKTOP_COORD_OUT_OF_DISPLAY` error otherwise. +/// +/// This is the guard rail that prevents models from passing image-pixel coordinates +/// (taken from a screenshot crop) straight into `mouse_move(use_screen_coordinates=true)`. +pub(crate) async fn ensure_global_xy_on_display( + host: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + gx: f64, + gy: f64, +) -> BitFunResult<()> { + let displays = host.list_displays().await.unwrap_or_default(); + if displays.is_empty() { + // Host can't enumerate displays (non-desktop runtime) — skip the guard. 
+ return Ok(()); + } + let on_any = displays.iter().any(|d| { + let x0 = d.origin_x as f64; + let y0 = d.origin_y as f64; + let x1 = x0 + d.width_logical as f64; + let y1 = y0 + d.height_logical as f64; + gx >= x0 && gx < x1 && gy >= y0 && gy < y1 + }); + if on_any { + return Ok(()); + } + let bounds: Vec = displays + .iter() + .map(|d| { + format!( + "display_id={} bounds=({},{})-({},{}) scale={:.2}", + d.display_id, + d.origin_x, + d.origin_y, + d.origin_x + d.width_logical as i32, + d.origin_y + d.height_logical as i32, + d.scale_factor + ) + }) + .collect(); + Err(BitFunError::tool(format!( + "[DESKTOP_COORD_OUT_OF_DISPLAY] global=({:.1},{:.1}) does not lie on any visible display. \ + Visible displays: [{}]. Hint: image-pixel coordinates are NOT screen coordinates. \ + Use screenshot.pointer_global, click_element/locate result.global_center_x/y, or move_to_text. \ + To convert image→global, use the screenshot's display_id + scale_factor.", + gx, + gy, + bounds.join("; ") + ))) +} + /// Absolute pointer move (`ComputerUseMousePrecise` tool). 
pub(crate) async fn computer_use_execute_mouse_precise( host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, @@ -717,6 +937,9 @@ pub(crate) async fn computer_use_execute_mouse_precise( let mode = coordinate_mode(input); let use_screen = use_screen_coordinates(input); let (sx64, sy64) = ComputerUseTool::resolve_xy_f64(host_ref, input, x, y)?; + if use_screen { + ensure_global_xy_on_display(host_ref, sx64, sy64).await?; + } host_ref.mouse_move_global_f64(sx64, sy64).await?; let sx = sx64.round() as i32; let sy = sy64.round() as i32; @@ -913,6 +1136,18 @@ fn parse_locate_query(input: &Value) -> UiElementLocateQuery { .get("filter_combine") .and_then(|v| v.as_str()) .map(|s| s.to_string()), + text_contains: input + .get("text_contains") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + node_idx: input + .get("node_idx") + .and_then(|v| v.as_u64()) + .map(|v| v as u32), + app_state_digest: input + .get("app_state_digest") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), } } @@ -975,10 +1210,11 @@ impl Tool for ComputerUseTool { **ACTION PRIORITY (CRITICAL):** Always think in this order before choosing an action:\n\ 1. **Terminal/CLI/System commands first** — Use Bash tool for terminal commands, system scripts (e.g., macOS `osascript`, AppleScript), shell automation. This is the MOST EFFICIENT approach.\n\ 2. **Keyboard shortcuts second** — Use **`key_chord`** for system shortcuts, app shortcuts, navigation keys (Enter, Escape, Tab, Space, Arrow keys). Prefer over mouse when equivalent.\n\ -3. **Precise UI control last** — Only when above methods fail: use **`click_element`** (AX/accessibility) → **`move_to_text`** (OCR) → **`mouse_move`** + **`click`** (coordinate-based, last resort).\n\ +3. **Precise UI control last** — Only when above methods fail: prefer **`click_target`** / **`move_to_target`** (AX → OCR → screen coords in one call). 
Use lower-level **`click_element`**, **`move_to_text`**, or **`mouse_move`** + **`click`** only when you need manual disambiguation.\n\ **Screenshot usage:** **`screenshot`** is ONLY for observing/confirming UI state and extracting text/information — NEVER use screenshot coordinates to control mouse movement. Always use precise methods (AX, OCR, system coordinates) for targeting.\n\ **Cowork-style loop:** **`screenshot`** (observe) → **one** action → **`screenshot`** (verify). Use **`wait`** if UI animates. When **`interaction_state.recommend_screenshot_to_verify_last_action`** is true, call **`screenshot`** next. \ -**`click_element`:** Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. On macOS, **`TextArea`** and **`TextField`** match both `AXTextArea` and `AXTextField` (many chat apps use TextField for compose). If several text fields match, the host deprioritizes known **search** controls (e.g. WeChat `_SC_SEARCH_FIELD`) and prefers **lower** on-screen fields (composer). Bypasses coordinate screenshot guard. \ +**`click_target` / `move_to_target`:** Unified target resolver. In one call it tries AX (`node_idx`, `text_contains`, `title_contains`, `role_substring`, `identifier_contains`, or `target_text`) first, then OCR (`target_text` / `text_query`), then explicit global `x`/`y` with `use_screen_coordinates: true`. `click_target` moves and clicks authoritatively, avoiding the multi-step locate → move → screenshot → click loop for common targets. \ +**`click_element`:** Lower-level Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. On macOS, **`TextArea`** and **`TextField`** match both `AXTextArea` and `AXTextField` (many chat apps use TextField for compose). If several text fields match, the host deprioritizes known **search** controls (e.g. WeChat `_SC_SEARCH_FIELD`) and prefers **lower** on-screen fields (composer). 
Bypasses coordinate screenshot guard. \ **`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). Matching **strips whitespace** between CJK glyphs and allows **small edit distance** when Vision mis-reads one character. The host **trusts** the resulting globals — **next `click`** does **not** require an extra `screenshot` (same as AX). If **several** hits match, the host returns **preview JPEGs + accessibility** per candidate — pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same query/region, or narrow with **`ocr_region_native`**. Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ **`click`:** Press at **current pointer only** — **never** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates`. Position first with **`move_to_text`**, **`mouse_move`** (**globals only**), or **`click_element`**. After pointer moves, **`screenshot`** again before the next guarded **`click`** when the host requires it. \ **`mouse_move` / `drag`:** **`use_screen_coordinates`: true** required — global coordinates from **`move_to_text`**, **`locate`**, AX, or **`pointer_global`**; never JPEG pixel guesses. \ @@ -1010,8 +1246,8 @@ impl Tool for ComputerUseTool { "properties": { "action": { "type": "string", - "enum": ["screenshot", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], - "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) **`open_app`** to launch apps by name. **`run_apple_script`** to run AppleScript (macOS). 
3) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, move pointer only) → `mouse_move` (globals only, **`use_screen_coordinates`: true**) + `click` (last resort). **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll` supports optional position (`scroll_x`/`scroll_y`). `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." + "enum": ["screenshot", "click_target", "move_to_target", "click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) **`open_app`** to launch apps by name. **`run_apple_script`** to run AppleScript (macOS). 3) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 4) Only when above fail: `click_target` / `move_to_target` (AX → OCR → screen coords in one call) before lower-level `click_element`, `move_to_text`, or `mouse_move` + `click`. **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll` supports optional position (`scroll_x`/`scroll_y`). `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." 
}, @@ -1019,8 +1255,8 @@ impl Tool for ComputerUseTool { "use_screen_coordinates": { "type": "boolean", "description": "For `mouse_move`, `drag`: **must be true** — global display coordinates (e.g. macOS points) from `move_to_text`, `locate`, AX, or `pointer_global`. **Not** for `click`." }, "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For `click`, `click_element`, `drag`: mouse button (default left)." }, "num_clicks": { "type": "integer", "minimum": 1, "maximum": 3, "description": "For `click`, `click_element`: 1=single (default), 2=double, 3=triple click." }, - "delta_x": { "type": "integer", "description": "For `pointer_move_rel`: horizontal delta (negative=left). **Not** allowed as the first move after `screenshot` (host). For `scroll`: horizontal wheel delta." }, - "delta_y": { "type": "integer", "description": "For `pointer_move_rel`: vertical delta (negative=up). **Not** allowed as the first move after `screenshot` (host). For `scroll`: vertical wheel delta." }, + "delta_x": { "type": "integer", "description": "For `pointer_move_rel`: horizontal delta (negative=left); also accepted as `dx`. **Not** allowed as the first move after `screenshot` (host). For `scroll`: horizontal wheel delta." }, + "delta_y": { "type": "integer", "description": "For `pointer_move_rel`: vertical delta (negative=up); also accepted as `dy`. **Not** allowed as the first move after `screenshot` (host). For `scroll`: vertical wheel delta." }, "start_x": { "type": "integer", "description": "For `drag`: start X coordinate." }, "start_y": { "type": "integer", "description": "For `drag`: start Y coordinate." }, "end_x": { "type": "integer", "description": "For `drag`: end X coordinate." }, @@ -1028,8 +1264,10 @@ impl Tool for ComputerUseTool { "keys": { "type": "array", "items": { "type": "string" }, "description": "For `key_chord`: keys in order — **modifiers first**, then the main key (e.g. `[\"command\",\"f\"]`). 
Desktop host waits after pressing modifiers so shortcuts register (important on macOS with IME). Modifiers: command, control, shift, alt/option. Arrows: `up`, `down`, … Host may require a fresh screenshot before Return/Enter when the pointer is stale." }, "text": { "type": "string", "description": "For `type_text`: text to type. Prefer clipboard paste (key_chord) for long content." }, "ms": { "type": "integer", "description": "For `wait`: duration in milliseconds." }, - "text_query": { "type": "string", "description": "For `move_to_text`: visible text to OCR-match on screen (case-insensitive substring)." }, - "move_to_text_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_text`: **1-based** index from `candidates[].match_index` after a **disambiguation** response (multiple OCR hits). Omit on the first pass; set when choosing which hit to move to." }, + "target_text": { "type": "string", "description": "For `move_to_target` / `click_target`: visible or accessible text. The resolver tries AX text first, then OCR text, without requiring a prior screenshot." }, + "target_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_target` / `click_target`: optional 1-based OCR match index when you want a specific candidate. Alias of `move_to_text_match_index` for the unified target actions." }, + "text_query": { "type": "string", "description": "For `move_to_text`, `move_to_target`, `click_target`: visible text to OCR-match on screen (case-insensitive substring)." }, + "move_to_text_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_text` and unified target actions: **1-based** OCR match index. For `move_to_text`, use after a disambiguation response; for `click_target`, use to pin a candidate." }, "ocr_region_native": { "type": "object", "description": "For `move_to_text`: optional global native rectangle for OCR. 
If omitted, macOS uses the frontmost window bounds from Accessibility; other OSes use the primary display. Overrides the automatic region when set. Requires x0, y0, width, height.", @@ -1040,11 +1278,14 @@ impl Tool for ComputerUseTool { "height": { "type": "integer", "minimum": 1, "description": "Height in the same coordinate unit as x0/y0 (logical on macOS)." } } }, - "title_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring match on accessible title (AXTitle). Use same language as the app UI." }, - "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole (e.g. \"Button\", \"TextField\")." }, + "title_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXTitle ONLY. Use same language as the app UI. Prefer `text_contains` (also covers AXValue/AXDescription/AXHelp) when in doubt." }, + "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole **or AXSubrole** (e.g. \"Button\", \"TextField\", \"SearchField\")." }, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, - "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48)." }, - "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." }, + "text_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring matched against ANY of AXTitle / AXValue / AXDescription / AXHelp. Best default when the visible label lives in value/description (e.g. AXStaticText cards)." 
}, + "node_idx": { "type": "integer", "minimum": 0, "description": "For `locate`, `click_element`: jump straight to a node returned by the most recent `desktop.get_app_state` (field `idx`). Bypasses BFS. macOS only; other platforms return AX_IDX_NOT_SUPPORTED." }, + "app_state_digest": { "type": "string", "description": "For `locate`, `click_element`: optional `state_digest` from the same `get_app_state` call that produced `node_idx`. Stale digest yields AX_IDX_STALE so you re-snapshot." }, + "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48). Ignored when `node_idx` is supplied." }, + "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination. Priority: `node_idx` > `text_contains` > `title_contains`+`role_substring`." }, "screenshot_crop_center_x": { "type": "integer", "minimum": 0, "description": "For `screenshot`: point crop X center in full-capture native pixels." }, "screenshot_crop_center_y": { "type": "integer", "minimum": 0, "description": "For `screenshot`: point crop Y center in full-capture native pixels." }, "screenshot_crop_half_extent_native": { "type": "integer", "minimum": 0, "description": "For `screenshot`: half-size of point crop in native pixels (default 250)." }, @@ -1122,15 +1363,107 @@ impl Tool for ComputerUseTool { match action { "locate" => execute_computer_use_locate(input, context).await, + // Unified target resolver: AX first, OCR second, explicit screen + // coordinates last. This is the preferred mouse path for common + // "move/click the visible thing" requests because it avoids + // spreading one intent across locate -> move -> click tool calls. 
+ "move_to_target" | "click_target" => { + let should_click = action == "click_target"; + let target = Self::resolve_target_point(host_ref, input).await?; + host_ref.mouse_move_global_f64(target.x, target.y).await?; + if target.source == "ocr" { + ComputerUseHost::computer_use_trust_pointer_after_ocr_move(host_ref); + } + + let button = input + .get("button") + .and_then(|v| v.as_str()) + .unwrap_or("left"); + let num_clicks = input + .get("num_clicks") + .and_then(|v| v.as_u64()) + .unwrap_or(1) + .clamp(1, 3) as u32; + + if should_click { + for _ in 0..num_clicks { + host_ref.mouse_click_authoritative(button).await?; + } + } + + let target_source = target.source.clone(); + let input_coords = json!({ + "kind": action, + "source": target_source, + "resolved_global": { "x": target.x, "y": target.y }, + "button": if should_click { Some(button) } else { None }, + "num_clicks": if should_click { Some(num_clicks) } else { None }, + }); + let mut result_json = json!({ + "success": true, + "action": action, + "target_resolution_source": target.source, + "global_center_x": target.x, + "global_center_y": target.y, + "matched_text": target.matched_text, + "matched_role": target.matched_role, + "matched_identifier": target.matched_identifier, + "total_matches": target.total_matches, + "selected_match_index": target.selected_match_index, + "clicked": should_click, + "button": if should_click { Some(button) } else { None }, + "num_clicks": if should_click { Some(num_clicks) } else { None }, + }); + if let Some(warning) = target.warning { + result_json["warning"] = json!(warning); + } + if let Some(ax_error) = target.ax_error { + result_json["ax_fallback_error"] = json!(ax_error); + } + let body = + computer_use_augment_result_json(host_ref, result_json, Some(input_coords)) + .await; + let summary = if should_click { + format!( + "Resolved target via {} and clicked at ({:.0}, {:.0}).", + body.get("target_resolution_source") + .and_then(|v| v.as_str()) + .unwrap_or("target"), + 
target.x, + target.y + ) + } else { + format!( + "Resolved target via {} and moved pointer to ({:.0}, {:.0}).", + body.get("target_resolution_source") + .and_then(|v| v.as_str()) + .unwrap_or("target"), + target.x, + target.y + ) + }; + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + // ---- NEW: click_element (locate + move + click in one call) ---- "click_element" => { let query = parse_locate_query(input); + // Accept ANY locator that can plausibly identify a node: + // - text_contains: wide needle over title|value|description|help + // - node_idx: direct AX-snapshot pin (zero-ambiguity) + // - title_contains / role_substring / identifier_contains: legacy filters + // The previous restriction (title/role/identifier only) blocked + // the most useful path — clicking by visible label that lives + // in AXValue/AXDescription — and forced models into brittle + // role guessing. if query.title_contains.is_none() + && query.text_contains.is_none() && query.role_substring.is_none() && query.identifier_contains.is_none() + && query.node_idx.is_none() { return Err(BitFunError::tool( - "click_element requires at least one of title_contains, role_substring, or identifier_contains.".to_string(), + "click_element requires at least one of text_contains, title_contains, role_substring, identifier_contains, or node_idx.".to_string(), )); } let button = input @@ -1399,6 +1732,9 @@ impl Tool for ComputerUseTool { let x = req_i32(input, "x")?; let y = req_i32(input, "y")?; let (sx64, sy64) = Self::resolve_xy_f64(host_ref, input, x, y)?; + if use_screen_coordinates(input) { + ensure_global_xy_on_display(host_ref, sx64, sy64).await?; + } host_ref.mouse_move_global_f64(sx64, sy64).await?; let mode = coordinate_mode(input); let use_screen = use_screen_coordinates(input); @@ -1581,31 +1917,51 @@ impl Tool for ComputerUseTool { } "pointer_move_rel" => { - let dx = input.get("delta_x").and_then(|v| v.as_i64()).unwrap_or(0) as i32; - let dy = input.get("delta_y").and_then(|v| 
v.as_i64()).unwrap_or(0) as i32; + // Accept both `delta_x`/`delta_y` (canonical) and `dx`/`dy` (alias) so that + // models which guess the natural form do not crash on the schema. + let dx_alias_used = input.get("delta_x").is_none() && input.get("dx").is_some(); + let dy_alias_used = input.get("delta_y").is_none() && input.get("dy").is_some(); + let dx = input + .get("delta_x") + .or_else(|| input.get("dx")) + .and_then(|v| v.as_i64()) + .unwrap_or(0) as i32; + let dy = input + .get("delta_y") + .or_else(|| input.get("dy")) + .and_then(|v| v.as_i64()) + .unwrap_or(0) as i32; if dx == 0 && dy == 0 { return Err(BitFunError::tool( - "pointer_move_rel requires non-zero delta_x and/or delta_y (screen pixels)" - .to_string(), + "pointer_move_rel requires a non-zero delta. Accepts `delta_x`|`dx` and `delta_y`|`dy` (screen pixels); at least one must be non-zero.".to_string(), )); } host_ref.pointer_move_relative(dx, dy).await?; - let input_coords = json!({ + let alias_note = match (dx_alias_used, dy_alias_used) { + (true, true) => Some("dx|dy"), + (true, false) => Some("dx"), + (false, true) => Some("dy"), + (false, false) => None, + }; + let mut input_coords = json!({ "kind": "pointer_move_rel", "delta_x": dx, "delta_y": dy, }); - let body = computer_use_augment_result_json( - host_ref, - json!({ - "success": true, - "action": "pointer_move_rel", - "delta_x": dx, - "delta_y": dy, - }), - Some(input_coords), - ) - .await; + if let Some(a) = alias_note { + input_coords["deprecated_alias_used"] = json!(a); + } + let mut payload = json!({ + "success": true, + "action": "pointer_move_rel", + "delta_x": dx, + "delta_y": dy, + }); + if let Some(a) = alias_note { + payload["deprecated_alias_used"] = json!(a); + } + let body = + computer_use_augment_result_json(host_ref, payload, Some(input_coords)).await; let summary = format!( "Moved pointer relatively by ({}, {}) screen pixels.", dx, dy @@ -1797,6 +2153,20 @@ impl Tool for ComputerUseTool { } } +#[derive(Debug, Clone)] +struct 
ResolvedDesktopTarget { + source: String, + x: f64, + y: f64, + matched_text: Option, + matched_role: Option, + matched_identifier: Option, + total_matches: Option, + selected_match_index: Option, + warning: Option, + ax_error: Option, +} + #[derive(Debug, Clone)] struct ScreenOcrTextMatch { text: String, diff --git a/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs b/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs index 131aaf764..15b5346c9 100644 --- a/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs +++ b/src/crates/core/src/agentic/tools/implementations/control_hub/errors.rs @@ -42,6 +42,34 @@ pub enum ErrorCode { /// The action requires a session / handle (e.g. `terminal_session_id`, /// `tab_handle`) that the caller did not provide. MissingSession, + /// AX-first desktop: the targeted application could not be resolved by + /// the supplied selector (name / bundle_id / pid). Distinct from + /// `NOT_FOUND` (which means a sub-element inside an app is missing). + AppNotFound, + /// AX-first desktop: a node `idx` provided by the caller is no longer + /// valid because the host has re-dumped the tree since the snapshot + /// the caller saw. Re-acquire via `desktop.get_app_state` and retry. + AxNodeStale, + /// AX-first desktop: this host cannot inject input events into the + /// target app without stealing user focus (e.g. macOS without + /// Accessibility permission, or non-macOS where the PID-event path is + /// not yet wired). Callers can fall back to the foreground + /// `desktop.click` path or escalate permissions. + BackgroundInputUnavailable, + /// AX-first desktop: the `node_idx` supplied to `click_element` / + /// `locate_element` is no longer present in the cached snapshot + /// (re-dump happened or window/state churned). Distinct from + /// `AX_NODE_STALE` which is for `app_*` actions; same recovery — + /// re-call `desktop.get_app_state` and reuse the new idx. 
+ AxIdxStale, + /// AX-first desktop: this platform host does not support resolving + /// elements by `node_idx` (currently linux/windows). Caller should + /// fall back to `text_contains` / `title_contains` + `role_substring`. + AxIdxNotSupported, + /// `mouse_move(use_screen_coordinates=true)` got an `(x,y)` that + /// does not lie on any visible display. Almost always means the model + /// confused image-pixel coords with global screen coords. + DesktopCoordOutOfDisplay, } impl ErrorCode { @@ -62,6 +90,12 @@ impl ErrorCode { ErrorCode::Internal => "INTERNAL", ErrorCode::FrontendError => "FRONTEND_ERROR", ErrorCode::MissingSession => "MISSING_SESSION", + ErrorCode::AppNotFound => "APP_NOT_FOUND", + ErrorCode::AxNodeStale => "AX_NODE_STALE", + ErrorCode::BackgroundInputUnavailable => "BACKGROUND_INPUT_UNAVAILABLE", + ErrorCode::AxIdxStale => "AX_IDX_STALE", + ErrorCode::AxIdxNotSupported => "AX_IDX_NOT_SUPPORTED", + ErrorCode::DesktopCoordOutOfDisplay => "DESKTOP_COORD_OUT_OF_DISPLAY", } } @@ -88,6 +122,12 @@ impl ErrorCode { "INTERNAL" => Self::Internal, "FRONTEND_ERROR" => Self::FrontendError, "MISSING_SESSION" => Self::MissingSession, + "APP_NOT_FOUND" => Self::AppNotFound, + "AX_NODE_STALE" => Self::AxNodeStale, + "BACKGROUND_INPUT_UNAVAILABLE" => Self::BackgroundInputUnavailable, + "AX_IDX_STALE" => Self::AxIdxStale, + "AX_IDX_NOT_SUPPORTED" => Self::AxIdxNotSupported, + "DESKTOP_COORD_OUT_OF_DISPLAY" => Self::DesktopCoordOutOfDisplay, _ => return None, }) } diff --git a/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs b/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs index 5f09ba56e..d5bb7ccbc 100644 --- a/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/control_hub_tool.rs @@ -15,7 +15,11 @@ use crate::agentic::tools::browser_control::session_registry::{ BrowserSession, BrowserSessionRegistry, }; use 
crate::agentic::tools::computer_use_capability::computer_use_desktop_available; -use crate::agentic::tools::computer_use_host::ComputerUseForegroundApplication; +use crate::agentic::tools::computer_use_host::{ + AppClickParams, AppSelector, AppWaitPredicate, ClickTarget, ComputerUseForegroundApplication, + ComputerUseHostRef, InteractiveClickParams, InteractiveScrollParams, InteractiveTypeTextParams, + InteractiveViewOpts, VisualClickParams, VisualMarkViewOpts, +}; use crate::agentic::tools::framework::{ Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, }; @@ -36,6 +40,55 @@ use super::control_hub::{err_response, ControlHubError, ErrorCode}; static BROWSER_SESSIONS: std::sync::OnceLock> = std::sync::OnceLock::new(); +/// Per-PID consecutive-failure tracker for the AX-first `app_*` actions. +/// Key = target PID, value = `(target_signature, before_digest, count)`. +/// When the same `(action,target)` lands on an unchanged digest twice in a +/// row the dispatcher injects an `app_state.loop_warning` so the model is +/// forced off the failing path on its **next** turn (`/Screenshot policy/ +/// Mandatory screenshot moments` in `claw_mode.md`). +static APP_LOOP_TRACKER: std::sync::OnceLock< + std::sync::Mutex>, +> = std::sync::OnceLock::new(); + +fn loop_tracker_observe( + pid: Option, + action: &str, + target_sig: &str, + before_digest: &str, + after_digest: &str, +) -> Option { + let pid = pid?; + // A digest change means the action mutated the tree — that is real + // progress and resets the streak even if the model picks the same + // target name on purpose (e.g. clicking "Next" repeatedly). 
+ let progressed = before_digest != after_digest; + let sig = format!("{action}:{target_sig}"); + let mut guard = APP_LOOP_TRACKER + .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new())) + .lock() + .ok()?; + let entry = guard + .entry(pid) + .or_insert_with(|| (String::new(), String::new(), 0)); + if progressed { + *entry = (sig, after_digest.to_string(), 1); + return None; + } + if entry.0 == sig && entry.1 == before_digest { + entry.2 = entry.2.saturating_add(1); + } else { + *entry = (sig, before_digest.to_string(), 1); + } + if entry.2 >= 2 { + Some(format!( + "Detected {} consecutive `{}` calls on the same target ({}) without any AX tree mutation (digest unchanged). The target is almost certainly invisible / disabled / in a Canvas-WebGL surface that AX cannot describe. NEXT TURN you MUST: (1) run `desktop.screenshot {{ screenshot_window: false }}` to see the full display, (2) switch tactic — different `node_idx`, different `ocr_text` needle, or a keyboard shortcut. Do NOT retry this same target a third time.", + entry.2, action, target_sig + )) + } else { + None + } +} + fn browser_sessions() -> Arc { BROWSER_SESSIONS .get_or_init(|| Arc::new(BrowserSessionRegistry::new())) @@ -145,7 +198,9 @@ impl ControlHubTool { ) -> Option { let guarded_actions = [ "click", + "click_target", "click_element", + "move_to_target", "mouse_move", "pointer_move_rel", "scroll", @@ -182,8 +237,102 @@ impl ControlHubTool { fn description_text(desktop_enabled: bool) -> String { let desktop_domain_doc = if desktop_enabled { r#"### domain: "desktop" (Computer Use — only available in the BitFun desktop app) -- screenshot, click, click_element, mouse_move, pointer_move_rel, - scroll, drag, key_chord, type_text, paste, wait, locate, move_to_text. 
+ +#### desktop (AX-first, recommended for third-party apps) +- New Codex-style flow that targets a specific application by name / bundle + id / pid and drives it through its Accessibility (AX) tree instead of the + global mouse + screenshot loop. Strongly preferred whenever: + * you need to drive an app that is NOT in the user's foreground, OR + * you must not steal the user's mouse / keyboard focus, OR + * the target widget has a stable AX role / title / identifier (most native + macOS / AppKit / Catalyst / SwiftUI / Electron-with-AX-on apps qualify). +- Capability gating (read first, ALWAYS): `meta.capabilities` returns + `domains.desktop.supports_ax_tree`, `domains.desktop.supports_background_input`, + `domains.desktop.supports_interactive_view`, and + `domains.desktop.supports_visual_mark_view`. + AX tree and background input both `false` → the host cannot do AX-first yet; + fall back to the legacy screenshot/click flow below. Background input + `false` while AX tree `true` → AX *reads* work but writes will steal focus; + tell the user. +- Actions (all under `domain: "desktop"`): + * `list_apps { include_hidden? }` → ranked `[{ name, bundle_id?, pid, + is_running, last_used_ms?, launch_count? }]`. Use this to resolve a + fuzzy user phrase ("微信" / "WeChat" / "Cursor") to a concrete + `AppSelector` before any other AX call. + * `get_app_state { app: , max_depth?, focus_window_only? }` + → `{ app, window_title?, tree_text, nodes:[AxNode], digest, captured_at_ms }`. + `tree_text` is the human-readable indent dump (Codex parity); `nodes` is + the structured array with stable `idx` you pass to subsequent actions. + `digest` is a sha1 of the tree — use it to detect "did anything change?" + cheaply without re-diffing. + * `app_click { app, target: { kind:"node_idx", idx } | { kind:"image_xy", x, y, screenshot_id? } | { kind:"image_grid", x0, y0, width, height, rows, cols, row, col, intersections?, screenshot_id? 
} | { kind:"visual_grid", rows, cols, row, col, intersections? } | { kind:"screen_xy", x, y }, + click_count?, mouse_button?, modifier_keys?, wait_ms_after? }` → returns the + fresh `AppStateSnapshot` after the click. Prefer `node_idx` over + coordinate targets whenever the target appears in `nodes`. For Canvas / + SVG / WebGL/custom-drawn surfaces, prefer `image_xy`: x/y are pixels in + the screenshot attached to the latest `get_app_state` / `app_click`. + Always pass `screenshot_id` from `app_state.screenshot_meta` when present + so the host maps against the exact frame you clicked from. + For board/grid/canvas controls, prefer `image_grid` over raw `image_xy`: + specify the board rectangle in screenshot pixels and a zero-based + `row`/`col`; set `intersections:true` for Go/Gomoku-style line + intersections and `false`/omit it for cell centers. + If the grid rectangle is not known, use `visual_grid`: the host captures + the app, detects the regular visual grid from pixels, then clicks the + requested zero-based row/col using the same captured coordinate basis. + For games / animated WebViews, pass `wait_ms_after` (e.g. 300–600) so the + returned screenshot captures the settled board. + * `build_visual_mark_view { app, opts?: { max_points?, region?, include_grid? } }` + → returns a numbered screenshot grid for arbitrary visual targets that + AX/OCR cannot name (Canvas, games, maps, drawings, icon-only panels). + Use this after `get_app_state` / `build_interactive_view` does not expose + the target. Pass `region` in screenshot pixels to refine into a smaller + area on the next attempt. + * `visual_click { app, i, before_view_digest?, click_count?, mouse_button?, wait_ms_after?, return_view? }` + → clicks the numbered visual mark using the exact screenshot coordinate + basis from the marked view, then returns fresh app state. + * `app_type_text { app, text, focus?: ClickTarget }` — focuses the optional + target first, then types. 
Honors IME / emoji / CJK via paste-style + injection where the host supports it. + * `app_scroll { app, focus?: ClickTarget, dx, dy }` — pixel deltas inside + the focused scroll container; use negative `dy` to scroll content up. + * `app_key_chord { app, keys:["command","shift","p"], focus_idx? }` — sends + a chord to the app *without* surfacing a global key event; modifier + names match the legacy `key_chord` (command/control/option|alt/shift). + * `app_wait_for { app, predicate, timeout_ms?, poll_ms? }` where + `predicate` is one of `{ kind:"digest_changed", prev_digest }`, + `{ kind:"title_contains", needle }`, + `{ kind:"role_enabled", role, title? }`, `{ kind:"node_enabled", idx }`. + This is the AX equivalent of the `wait` + re-screenshot loop and is + REQUIRED between actions when the next step depends on a state change. +- Selector shape: `{ pid }` is most precise (always survives renames); + `{ bundle_id }` is next-best (survives localization); `{ name }` matches + on the localized window/app name. Combine fields and the host picks the + strongest match. Unresolved selector → `error.code = APP_NOT_FOUND`. +- Stale node refs (e.g. you cached `idx=42` from a snapshot, then the app + re-rendered) → `error.code = AX_NODE_STALE`. Always re-call + `get_app_state` and re-resolve by role/title/identifier — never carry an + `idx` across user-visible mutations without `app_wait_for`. +- If `supports_background_input` is `false` and the host still cannot + silently inject into the target, AX-first writes return + `error.code = BACKGROUND_INPUT_UNAVAILABLE` with a hint pointing at the + legacy foreground click; don't retry without a strategy change. 
+- Envelope additions for AX-first results: each successful response embeds + `target_app`, `app_state` (text dump), `app_state_nodes` (structured), + `before_digest` (the digest seen *before* the action), `after_digest` (the + digest *after*), and `background_input: bool` so the agent can verify the + action landed without stealing focus. + +#### desktop (legacy screenshot + global pointer) +- screenshot, click_target, move_to_target, click, click_element, mouse_move, + pointer_move_rel, scroll, drag, key_chord, type_text, paste, wait, locate, + move_to_text. +- **`click_target` / `move_to_target`** — preferred mouse primitive for + common "click/move to this visible thing" requests. One call resolves the + target by AX (`node_idx`, text/role/title/identifier filters, or + `target_text`) first, OCR second (`target_text` / `text_query`), and + explicit global `x`/`y` last. This collapses the old locate → move → + guarded-click round-trip into a single authoritative action. - **`screenshot`** — exactly two possible outputs: the focused application window (default, via Accessibility) OR the full display (fallback when AX cannot resolve the window). No crop / quadrant / mouse-centered @@ -276,7 +425,10 @@ Every call returns a JSON object with a stable shape: "error": {{ "code": "STALE_REF" | "NOT_FOUND" | "AMBIGUOUS" | "GUARD_REJECTED" | "WRONG_DISPLAY" | "WRONG_TAB" | "INVALID_PARAMS" | "PERMISSION_DENIED" | "TIMEOUT" | "NOT_AVAILABLE" - | "MISSING_SESSION" | "FRONTEND_ERROR" | "INTERNAL", + | "MISSING_SESSION" | "FRONTEND_ERROR" | "INTERNAL" + | "APP_NOT_FOUND" | "AX_NODE_STALE" | "AX_IDX_STALE" + | "AX_IDX_NOT_SUPPORTED" | "DESKTOP_COORD_OUT_OF_DISPLAY" + | "BACKGROUND_INPUT_UNAVAILABLE", "message": "...", "hints": [ "...next step..." ] }} }} Branch on `ok` and on `error.code` deterministically. Never scrape the English `message` @@ -414,7 +566,7 @@ for control flow. 
&self, action: &str, params: &Value, - _context: &ToolUseContext, + context: &ToolUseContext, ) -> BitFunResult> { match action { "capabilities" => { @@ -472,9 +624,30 @@ for control flow. Option, ) = (None, None); + let desktop_host = context.computer_use_host.as_ref(); + let desktop_ax_tree = desktop_host + .map(|host| host.supports_ax_tree()) + .unwrap_or(false); + let desktop_background_input = desktop_host + .map(|host| host.supports_background_input()) + .unwrap_or(false); + let desktop_interactive_view = desktop_host + .map(|host| host.supports_interactive_view()) + .unwrap_or(false); + let desktop_visual_mark_view = desktop_host + .map(|host| host.supports_visual_mark_view()) + .unwrap_or(false); + let body = json!({ "domains": { - "desktop": { "available": desktop_available, "reason": if desktop_available { Value::Null } else { json!("Only available in the BitFun desktop app") } }, + "desktop": { + "available": desktop_available, + "reason": if desktop_available { Value::Null } else { json!("Only available in the BitFun desktop app") }, + "supports_ax_tree": desktop_ax_tree, + "supports_background_input": desktop_background_input, + "supports_interactive_view": desktop_interactive_view, + "supports_visual_mark_view": desktop_visual_mark_view, + }, "browser": { "available": true, "default_session_id": browser_default, @@ -761,6 +934,24 @@ for control flow. )]); } + // ── AX-first actions (Codex parity) ─────────────────────── + // These bypass the legacy ComputerUseTool because they + // operate on the new typed AppSelector / AxNode envelope. 
+ "list_apps" + | "get_app_state" + | "app_click" + | "app_type_text" + | "app_scroll" + | "app_key_chord" + | "app_wait_for" + | "build_interactive_view" + | "interactive_click" + | "interactive_type_text" + | "interactive_scroll" + | "build_visual_mark_view" + | "visual_click" => { + return self.handle_desktop_ax(host, action, params).await; + } "focus_display" => { // Accept `null` (or omitted `display_id`) to clear the pin // and fall back to "screen under the pointer". An explicit @@ -827,6 +1018,896 @@ for control flow. cu_tool.call_impl(&cu_input, context).await } + // ── Desktop AX-first dispatch (Codex parity) ────────────────────── + // Routes the AX-first desktop actions (`list_apps`, `get_app_state`, the `app_*` family, and the interactive / visual-mark view actions) through the typed + // `ComputerUseHost` API. Successful app-targeted responses carry a + // unified envelope: `target_app`, `background_input`, + // `before_digest` and (for state queries) `app_state` / + // `app_state_nodes` so the model can reason about the AX tree + // before/after each action without re-querying (`list_apps` has no target app and returns only the app list plus capability flags). 
+ async fn handle_desktop_ax( + &self, + host: &ComputerUseHostRef, + action: &str, + params: &Value, + ) -> BitFunResult> { + // ── Helpers ───────────────────────────────────────────────── + fn parse_selector(v: &Value) -> BitFunResult { + let obj = v.get("app").ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] missing 'app' selector (pid|bundle_id|name)".to_string(), + ) + })?; + let sel: AppSelector = serde_json::from_value(obj.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad 'app' selector: {} (expect {{pid|bundle_id|name}})", + e + )) + })?; + if sel.pid.is_none() && sel.bundle_id.is_none() && sel.name.is_none() { + return Err(BitFunError::tool( + "[INVALID_PARAMS] 'app' must include at least one of pid|bundle_id|name" + .to_string(), + )); + } + Ok(sel) + } + + fn parse_click_target(v: &Value) -> BitFunResult { + if v.get("kind").is_some() { + return serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad ClickTarget: {} (expected {{\"kind\":\"node_idx\",\"idx\":N}}, {{\"kind\":\"image_xy\",\"x\":0,\"y\":0}}, {{\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}}, {{\"kind\":\"screen_xy\",\"x\":0,\"y\":0}}, or {{\"kind\":\"ocr_text\",\"needle\":\"...\"}})", + e + )) + }); + } + if let Some(idx) = v.get("node_idx").and_then(|x| x.as_u64()) { + return Ok(ClickTarget::NodeIdx { idx: idx as u32 }); + } + if let Some(obj) = v.get("screen_xy") { + let x = obj.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen_xy target requires numeric x".to_string(), + ) + })?; + let y = obj.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen_xy target requires numeric y".to_string(), + ) + })?; + return Ok(ClickTarget::ScreenXy { 
x, y }); + } + if let Some(obj) = v.get("image_xy") { + let x = obj.get("x").and_then(|x| x.as_i64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] image_xy target requires integer x".to_string(), + ) + })?; + let y = obj.get("y").and_then(|y| y.as_i64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] image_xy target requires integer y".to_string(), + ) + })?; + return Ok(ClickTarget::ImageXy { + x: x as i32, + y: y as i32, + screenshot_id: obj + .get("screenshot_id") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + }); + } + if let Some(obj) = v.get("image_grid") { + let target = json!({ + "kind": "image_grid", + "x0": obj.get("x0").cloned().unwrap_or(Value::Null), + "y0": obj.get("y0").cloned().unwrap_or(Value::Null), + "width": obj.get("width").cloned().unwrap_or(Value::Null), + "height": obj.get("height").cloned().unwrap_or(Value::Null), + "rows": obj.get("rows").cloned().unwrap_or(Value::Null), + "cols": obj.get("cols").cloned().unwrap_or(Value::Null), + "row": obj.get("row").cloned().unwrap_or(Value::Null), + "col": obj.get("col").cloned().unwrap_or(Value::Null), + "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), + "screenshot_id": obj.get("screenshot_id").cloned().unwrap_or(Value::Null), + }); + return serde_json::from_value(target).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad image_grid target: {} (need x0,y0,width,height,rows,cols,row,col; optional intersections)", + e + )) + }); + } + if let Some(obj) = v.get("visual_grid") { + let target = json!({ + "kind": "visual_grid", + "rows": obj.get("rows").cloned().unwrap_or(Value::Null), + "cols": obj.get("cols").cloned().unwrap_or(Value::Null), + "row": obj.get("row").cloned().unwrap_or(Value::Null), + "col": obj.get("col").cloned().unwrap_or(Value::Null), + "intersections": obj.get("intersections").cloned().unwrap_or(json!(false)), + "wait_ms_after_detection": obj.get("wait_ms_after_detection").cloned().unwrap_or(Value::Null), + }); 
+ return serde_json::from_value(target).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad visual_grid target: {} (need rows,cols,row,col; optional intersections)", + e + )) + }); + } + if v.get("x").is_some() || v.get("y").is_some() { + let x = v.get("x").and_then(|x| x.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen target requires numeric x".to_string(), + ) + })?; + let y = v.get("y").and_then(|y| y.as_f64()).ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] screen target requires numeric y".to_string(), + ) + })?; + return Ok(ClickTarget::ScreenXy { x, y }); + } + if let Some(ocr) = v.get("ocr_text") { + let needle = ocr + .get("needle") + .or_else(|| ocr.get("text")) + .and_then(|x| x.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] ocr_text target requires needle".to_string(), + ) + })?; + return Ok(ClickTarget::OcrText { + needle: needle.to_string(), + }); + } + Err(BitFunError::tool( + "[INVALID_PARAMS] unsupported ClickTarget. 
Use {\"kind\":\"node_idx\",\"idx\":N}, {\"node_idx\":N}, {\"kind\":\"image_xy\",\"x\":0,\"y\":0}, {\"image_xy\":{\"x\":0,\"y\":0}}, {\"kind\":\"image_grid\",\"x0\":0,\"y0\":0,\"width\":300,\"height\":300,\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"visual_grid\",\"rows\":15,\"cols\":15,\"row\":7,\"col\":7,\"intersections\":true}, {\"kind\":\"screen_xy\",\"x\":0,\"y\":0}, or {\"ocr_text\":{\"needle\":\"...\"}}.".to_string(), + )) + } + + fn parse_wait_predicate(v: &Value) -> BitFunResult { + if v.get("kind").is_some() { + return serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] bad app_wait_for predicate: {}", + e + )) + }); + } + if let Some(obj) = v.get("digest_changed") { + let prev_digest = obj + .get("prev_digest") + .or_else(|| obj.get("from")) + .and_then(|x| x.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] digest_changed requires prev_digest".to_string(), + ) + })?; + return Ok(AppWaitPredicate::DigestChanged { + prev_digest: prev_digest.to_string(), + }); + } + if let Some(obj) = v.get("title_contains") { + let needle = obj + .get("needle") + .or_else(|| obj.get("title")) + .and_then(|x| x.as_str()) + .or_else(|| obj.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] title_contains requires needle".to_string(), + ) + })?; + return Ok(AppWaitPredicate::TitleContains { + needle: needle.to_string(), + }); + } + if let Some(obj) = v.get("role_enabled") { + let role = obj.get("role").and_then(|x| x.as_str()).ok_or_else(|| { + BitFunError::tool("[INVALID_PARAMS] role_enabled requires role".to_string()) + })?; + return Ok(AppWaitPredicate::RoleEnabled { + role: role.to_string(), + }); + } + if let Some(obj) = v.get("node_enabled") { + let idx = obj + .get("idx") + .and_then(|x| x.as_u64()) + .or_else(|| obj.as_u64()) + .ok_or_else(|| { + BitFunError::tool("[INVALID_PARAMS] node_enabled requires idx".to_string()) + })?; + return 
Ok(AppWaitPredicate::NodeEnabled { idx: idx as u32 }); + } + Err(BitFunError::tool( + "[INVALID_PARAMS] unsupported app_wait_for predicate. Use {\"kind\":\"digest_changed\",\"prev_digest\":\"...\"} or shorthand {\"digest_changed\":{\"prev_digest\":\"...\"}}.".to_string(), + )) + } + + fn parse_keys(v: &Value) -> Vec { + match v.get("keys").or_else(|| v.get("key")) { + Some(Value::Array(arr)) => arr + .iter() + .filter_map(|x| x.as_str().map(|s| s.to_string())) + .collect(), + Some(Value::String(s)) => vec![s.to_string()], + _ => Vec::new(), + } + } + + // Build the JSON view of an AppStateSnapshot for the model. Excludes + // the heavy `screenshot` payload (it is attached out-of-band as a + // multimodal image, not as base64 inside the JSON tree, to keep token + // budgets under control and let the provider deliver it as `image_url`). + fn snap_state_json( + snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, + ) -> serde_json::Value { + let mut v = json!({ + "app": snap.app, + "window_title": snap.window_title, + "digest": snap.digest, + "captured_at_ms": snap.captured_at_ms, + "tree_text": snap.tree_text, + "has_screenshot": snap.screenshot.is_some(), + }); + if let Some(shot) = snap.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + let meta: serde_json::Value = json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "For visual surfaces, click pixels in this attached image with app_click target {kind:\"image_xy\", x, y, screenshot_id}. For known boards/grids/canvases, prefer {kind:\"image_grid\", x0, y0, width, height, rows, cols, row, col, intersections, screenshot_id}. 
If the grid rectangle is unknown, use {kind:\"visual_grid\", rows, cols, row, col, intersections}; the host detects the grid from app pixels.", + }); + obj.insert("screenshot_meta".to_string(), meta); + } + } + v + } + + // Helper: build a `ToolResult` that *also* carries the focused-window + // screenshot as an Anthropic-style multimodal image attachment. When + // the host couldn't (or chose not to) capture, fall back to a regular + // text-only `ToolResult::ok`. + fn snap_result( + data: serde_json::Value, + summary: Option, + snap: &crate::agentic::tools::computer_use_host::AppStateSnapshot, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = snap.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + // Build a JSON view of an InteractiveView that excludes the heavy + // `screenshot.bytes` payload (the JPEG is attached out-of-band as a + // multimodal image attachment, not as base64 inside the tree). 
+ fn build_interactive_view_json( + view: &crate::agentic::tools::computer_use_host::InteractiveView, + ) -> serde_json::Value { + let mut v = json!({ + "app": view.app, + "window_title": view.window_title, + "digest": view.digest, + "captured_at_ms": view.captured_at_ms, + "elements": view.elements, + "tree_text": view.tree_text, + "loop_warning": view.loop_warning, + "has_screenshot": view.screenshot.is_some(), + }); + if let Some(shot) = view.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + obj.insert( + "screenshot_meta".to_string(), + json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "Numbered overlays are in JPEG image-pixel space. Reference elements via their `i` index using interactive_click / interactive_type_text / interactive_scroll. 
For pointer-only fallback, pass screenshot_id with image_xy/image_grid.", + }), + ); + } + } + v + } + + fn build_visual_mark_view_json( + view: &crate::agentic::tools::computer_use_host::VisualMarkView, + ) -> serde_json::Value { + let mut v = json!({ + "app": view.app, + "window_title": view.window_title, + "digest": view.digest, + "captured_at_ms": view.captured_at_ms, + "marks": view.marks, + "has_screenshot": view.screenshot.is_some(), + }); + if let Some(shot) = view.screenshot.as_ref() { + if let Some(obj) = v.as_object_mut() { + obj.insert( + "screenshot_meta".to_string(), + json!({ + "image_width": shot.image_width, + "image_height": shot.image_height, + "screenshot_id": shot.screenshot_id, + "native_width": shot.native_width, + "native_height": shot.native_height, + "vision_scale": shot.vision_scale, + "mime_type": shot.mime_type, + "image_content_rect": shot.image_content_rect, + "image_global_bounds": shot.image_global_bounds, + "coordinate_hint": "Numbered visual marks are in JPEG image-pixel space. Reference marks via their `i` index using visual_click. To refine a dense area, call build_visual_mark_view again with opts.region in these screenshot pixels.", + }), + ); + } + } + v + } + + // Build a JSON envelope for interactive_* action results. Includes + // the post-action AppStateSnapshot (without screenshot bytes) and, + // when present, the rebuilt InteractiveView. 
+ fn build_interactive_action_json( + app: &crate::agentic::tools::computer_use_host::AppSelector, + res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, + extras: serde_json::Value, + ) -> serde_json::Value { + let mut v = json!({ + "target_app": app, + "app_state": snap_state_json(&res.snapshot), + "app_state_nodes": res.snapshot.nodes, + "loop_warning": res.snapshot.loop_warning, + "execution_note": res.execution_note, + "interactive_view": res.view.as_ref().map(build_interactive_view_json), + }); + if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { + for (k, val) in extras_obj { + obj.insert(k.clone(), val.clone()); + } + } + v + } + + fn build_visual_action_json( + app: &crate::agentic::tools::computer_use_host::AppSelector, + res: &crate::agentic::tools::computer_use_host::VisualActionResult, + extras: serde_json::Value, + ) -> serde_json::Value { + let mut v = json!({ + "target_app": app, + "app_state": snap_state_json(&res.snapshot), + "app_state_nodes": res.snapshot.nodes, + "loop_warning": res.snapshot.loop_warning, + "execution_note": res.execution_note, + "visual_mark_view": res.view.as_ref().map(build_visual_mark_view_json), + }); + if let (Some(obj), Some(extras_obj)) = (v.as_object_mut(), extras.as_object()) { + for (k, val) in extras_obj { + obj.insert(k.clone(), val.clone()); + } + } + v + } + + // Attach the InteractiveView's annotated screenshot (if present) + // as a multimodal image; otherwise fall back to text-only ok. 
+ fn interactive_view_result( + data: serde_json::Value, + summary: Option, + view: &crate::agentic::tools::computer_use_host::InteractiveView, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = view.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + fn visual_mark_view_result( + data: serde_json::Value, + summary: Option, + view: &crate::agentic::tools::computer_use_host::VisualMarkView, + ) -> ToolResult { + use base64::Engine as _; + if let Some(shot) = view.screenshot.as_ref() { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + // Prefer attaching the rebuilt interactive view's screenshot when + // available; otherwise fall back to the post-action snapshot's. 
+ fn interactive_action_result( + data: serde_json::Value, + summary: Option, + res: &crate::agentic::tools::computer_use_host::InteractiveActionResult, + ) -> ToolResult { + use base64::Engine as _; + let shot_opt = res + .view + .as_ref() + .and_then(|v| v.screenshot.as_ref()) + .or(res.snapshot.screenshot.as_ref()); + if let Some(shot) = shot_opt { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + fn visual_action_result( + data: serde_json::Value, + summary: Option, + res: &crate::agentic::tools::computer_use_host::VisualActionResult, + ) -> ToolResult { + use base64::Engine as _; + let shot_opt = res + .view + .as_ref() + .and_then(|v| v.screenshot.as_ref()) + .or(res.snapshot.screenshot.as_ref()); + if let Some(shot) = shot_opt { + let attach = crate::util::types::ToolImageAttachment { + mime_type: shot.mime_type.clone(), + data_base64: base64::engine::general_purpose::STANDARD.encode(&shot.bytes), + }; + ToolResult::ok_with_images(data, summary, vec![attach]) + } else { + ToolResult::ok(data, summary) + } + } + + let bg = host.supports_background_input(); + let ax = host.supports_ax_tree(); + + match action { + "list_apps" => { + let include_hidden = params + .get("include_hidden") + .and_then(|v| v.as_bool()) + .unwrap_or_else(|| { + !params + .get("only_visible") + .and_then(|v| v.as_bool()) + .unwrap_or(true) + }); + let apps = host.list_apps(include_hidden).await?; + let n = apps.len(); + Ok(vec![ToolResult::ok( + json!({ + "apps": apps, + "include_hidden": include_hidden, + "background_input": bg, + "ax_tree": ax, + }), + Some(format!("{} app(s) listed", n)), + )]) + } + "get_app_state" => { + let app = parse_selector(params)?; + let max_depth = params + .get("max_depth") + .and_then(|v| v.as_u64()) + .unwrap_or(32) 
as u32; + let focus_window_only = params + .get("focus_window_only") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + let snap = host + .get_app_state(app.clone(), max_depth, focus_window_only) + .await?; + let summary = format!( + "AX state for {} (digest={}, {} nodes)", + snap.app.name, + &snap.digest[..snap.digest.len().min(12)], + snap.nodes.len() + ); + let data = json!({ + "target_app": app, + "background_input": bg, + "ax_tree": ax, + "app_state": snap_state_json(&snap), + "app_state_nodes": snap.nodes, + "before_digest": snap.digest, + "loop_warning": snap.loop_warning, + }); + Ok(vec![snap_result(data, Some(summary), &snap)]) + } + "app_click" => { + let app = parse_selector(params)?; + let target_v = params.get("target").cloned().ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_click requires 'target' ({node_idx|image_xy|screen_xy|ocr_text})" + .to_string(), + ) + })?; + let target = parse_click_target(&target_v)?; + let click_count = params + .get("click_count") + .and_then(|v| v.as_u64()) + .unwrap_or(1) as u8; + let mouse_button = params + .get("mouse_button") + .and_then(|v| v.as_str()) + .unwrap_or("left") + .to_string(); + let modifier_keys: Vec = params + .get("modifier_keys") + .and_then(|v| v.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(|s| s.to_string())) + .collect() + }) + .unwrap_or_default(); + let wait_ms_after = params + .get("wait_ms_after") + .or_else(|| params.get("post_click_wait_ms")) + .and_then(|v| v.as_u64()) + .map(|v| v.min(5_000) as u32); + + let before = host + .get_app_state(app.clone(), 8, false) + .await + .ok() + .map(|s| s.digest); + + let mut after = host + .app_click(AppClickParams { + app: app.clone(), + target: target.clone(), + click_count, + mouse_button, + modifier_keys, + wait_ms_after, + }) + .await?; + + if after.loop_warning.is_none() { + let target_sig = serde_json::to_string(&target).unwrap_or_default(); + after.loop_warning = loop_tracker_observe( + app.pid, + 
"app_click", + &target_sig, + before.as_deref().unwrap_or(""), + &after.digest, + ); + } + + let data = json!({ + "target_app": app, + "click_target": target, + "background_input": bg, + "before_digest": before, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result(data, Some("clicked".to_string()), &after)]) + } + "app_type_text" => { + let app = parse_selector(params)?; + let text = params + .get("text") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_type_text requires 'text'".to_string(), + ) + })? + .to_string(); + let focus: Option = match params.get("focus") { + Some(v) if !v.is_null() => Some(parse_click_target(v)?), + _ => None, + }; + let before = host + .get_app_state(app.clone(), 8, false) + .await + .ok() + .map(|s| s.digest); + let mut after = host + .app_type_text(app.clone(), &text, focus.clone()) + .await?; + if after.loop_warning.is_none() { + let target_sig = format!( + "focus={};len={}", + serde_json::to_string(&focus).unwrap_or_default(), + text.chars().count() + ); + after.loop_warning = loop_tracker_observe( + app.pid, + "app_type_text", + &target_sig, + before.as_deref().unwrap_or(""), + &after.digest, + ); + } + let data = json!({ + "target_app": app, + "background_input": bg, + "char_count": text.chars().count(), + "focus": focus, + "before_digest": before, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some(format!("typed {} chars", text.chars().count())), + &after, + )]) + } + "app_scroll" => { + let app = parse_selector(params)?; + let dx = params.get("dx").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let dy = params.get("dy").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let focus: Option = match params.get("focus") { + Some(v) if !v.is_null() => Some(parse_click_target(v)?), + _ => None, + }; + let 
after = host.app_scroll(app.clone(), focus.clone(), dx, dy).await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "dx": dx, + "dy": dy, + "focus": focus, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some(format!("scrolled ({},{})", dx, dy)), + &after, + )]) + } + "app_key_chord" => { + let app = parse_selector(params)?; + let keys = parse_keys(params); + if keys.is_empty() { + return Err(BitFunError::tool( + "[INVALID_PARAMS] app_key_chord requires non-empty 'keys'".to_string(), + )); + } + let focus_idx: Option = params + .get("focus_idx") + .and_then(|v| v.as_u64()) + .map(|n| n as u32); + let after = host + .app_key_chord(app.clone(), keys.clone(), focus_idx) + .await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "keys": keys, + "focus_idx": focus_idx, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some("key chord sent".to_string()), + &after, + )]) + } + "app_wait_for" => { + let app = parse_selector(params)?; + let predicate_v = params.get("predicate").cloned().ok_or_else(|| { + BitFunError::tool( + "[INVALID_PARAMS] app_wait_for requires 'predicate'".to_string(), + ) + })?; + let predicate = parse_wait_predicate(&predicate_v)?; + let timeout_ms = params + .get("timeout_ms") + .and_then(|v| v.as_u64()) + .unwrap_or(8000) as u32; + let poll_ms = params + .get("poll_ms") + .and_then(|v| v.as_u64()) + .unwrap_or(150) as u32; + let after = host + .app_wait_for(app.clone(), predicate.clone(), timeout_ms, poll_ms) + .await?; + let data = json!({ + "target_app": app, + "background_input": bg, + "predicate": predicate, + "app_state": snap_state_json(&after), + "app_state_nodes": after.nodes, + "loop_warning": after.loop_warning, + }); + Ok(vec![snap_result( + data, + Some("predicate satisfied".to_string()), + 
&after, + )]) + } + "build_interactive_view" => { + let app = parse_selector(params)?; + let opts: InteractiveViewOpts = match params.get("opts") { + Some(v) if !v.is_null() => serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] build_interactive_view 'opts' invalid: {}", + e + )) + })?, + _ => InteractiveViewOpts::default(), + }; + let view = host.build_interactive_view(app.clone(), opts).await?; + let view_json = build_interactive_view_json(&view); + let summary = format!( + "interactive view for {} ({} elements, digest={})", + view.app.name, + view.elements.len(), + &view.digest[..view.digest.len().min(12)] + ); + Ok(vec![interactive_view_result( + view_json, + Some(summary), + &view, + )]) + } + "interactive_click" => { + let app = parse_selector(params)?; + let p: InteractiveClickParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_click params invalid: {}", + e + )) + })?; + let i = p.i; + let res = host.interactive_click(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ "i": i, "action": "interactive_click" }), + ); + let summary = format!("interactive_click i={}", i); + Ok(vec![interactive_action_result(data, Some(summary), &res)]) + } + "build_visual_mark_view" => { + let app = parse_selector(params)?; + let opts: VisualMarkViewOpts = match params.get("opts") { + Some(v) if !v.is_null() => serde_json::from_value(v.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] build_visual_mark_view 'opts' invalid: {}", + e + )) + })?, + _ => VisualMarkViewOpts::default(), + }; + let view = host.build_visual_mark_view(app.clone(), opts).await?; + let view_json = build_visual_mark_view_json(&view); + let summary = format!( + "visual mark view for {} ({} marks, digest={})", + view.app.name, + view.marks.len(), + &view.digest[..view.digest.len().min(12)] + ); + Ok(vec![visual_mark_view_result( 
+ view_json, + Some(summary), + &view, + )]) + } + "visual_click" => { + let app = parse_selector(params)?; + let p: VisualClickParams = serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] visual_click params invalid: {}", + e + )) + })?; + let i = p.i; + let res = host.visual_click(app.clone(), p).await?; + let data = build_visual_action_json( + &app, + &res, + json!({ "i": i, "action": "visual_click" }), + ); + let summary = format!("visual_click i={}", i); + Ok(vec![visual_action_result(data, Some(summary), &res)]) + } + "interactive_type_text" => { + let app = parse_selector(params)?; + let p: InteractiveTypeTextParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_type_text params invalid: {}", + e + )) + })?; + let i = p.i; + let text_len = p.text.chars().count(); + let res = host.interactive_type_text(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ + "i": i, + "action": "interactive_type_text", + "text_chars": text_len, + }), + ); + let summary = match i { + Some(idx) => format!("interactive_type_text i={} ({} chars)", idx, text_len), + None => format!("interactive_type_text focused ({} chars)", text_len), + }; + Ok(vec![interactive_action_result(data, Some(summary), &res)]) + } + "interactive_scroll" => { + let app = parse_selector(params)?; + let p: InteractiveScrollParams = + serde_json::from_value(params.clone()).map_err(|e| { + BitFunError::tool(format!( + "[INVALID_PARAMS] interactive_scroll params invalid: {}", + e + )) + })?; + let (i, dx, dy) = (p.i, p.dx, p.dy); + let res = host.interactive_scroll(app.clone(), p).await?; + let data = build_interactive_action_json( + &app, + &res, + json!({ + "i": i, + "dx": dx, + "dy": dy, + "action": "interactive_scroll", + }), + ); + let summary = format!("interactive_scroll i={:?} dx={} dy={}", i, dx, dy); + Ok(vec![interactive_action_result(data, 
Some(summary), &res)]) + } + other => Err(BitFunError::tool(format!( + "[INTERNAL] handle_desktop_ax called with unknown action: {}", + other + ))), + } + } + // ── Browser domain ───────────────────────────────────────────────── async fn handle_browser(&self, action: &str, params: &Value) -> BitFunResult> { diff --git a/src/crates/core/src/lib.rs b/src/crates/core/src/lib.rs index c5c462498..9aa1bdd8b 100644 --- a/src/crates/core/src/lib.rs +++ b/src/crates/core/src/lib.rs @@ -1,4 +1,5 @@ #![allow(non_snake_case)] +#![recursion_limit = "256"] // BitFun Core Library - Platform-agnostic business logic // Four-layer architecture: Util -> Infrastructure -> Service -> Agentic