From f4b29ff58debbc9c052bf9eab449c58b193c724c Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Tue, 18 Nov 2025 17:53:15 -0600 Subject: [PATCH 1/3] Added iframe capturing into the API --- agent-server/README.md | 86 ++++++++++++ agent-server/nodejs/CLAUDE.md | 107 ++++++++++++++- agent-server/nodejs/src/api-server.js | 17 ++- .../nodejs/src/lib/BrowserAgentServer.js | 128 ++++++++++++++++-- 4 files changed, 320 insertions(+), 18 deletions(-) diff --git a/agent-server/README.md b/agent-server/README.md index 2fdddd0c9a..653065c50c 100644 --- a/agent-server/README.md +++ b/agent-server/README.md @@ -210,6 +210,92 @@ Get HTML or text content of a page. } ``` +#### `POST /page/execute` + +Execute JavaScript code in the context of a specific browser tab via Chrome DevTools Protocol. + +**Request:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.title", + "returnByValue": true, + "awaitPromise": false +} +``` + +**Parameters:** +- `clientId` (required): The client ID from `/v1/responses` metadata +- `tabId` (required): The tab ID from `/v1/responses` metadata +- `expression` (required): JavaScript code to execute (string) +- `returnByValue` (optional, default: `true`): Whether to return result by value or as object reference +- `awaitPromise` (optional, default: `false`): Whether to await if the result is a Promise + +**Response:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "result": { + "type": "string", + "value": "Example Page Title" + }, + "exceptionDetails": null, + "timestamp": 1234567890 +} +``` + +**Response Fields:** +- `clientId`: Base client ID (without tab suffix) +- `tabId`: The tab ID where JavaScript was executed +- `result`: CDP `Runtime.evaluate` result object containing: + - `type`: Result type (string, number, object, etc.) 
+ - `value`: The actual value (if `returnByValue: true`) +- `exceptionDetails`: Error details if execution failed, otherwise `null` +- `timestamp`: Unix timestamp in milliseconds + +**Example Usage:** + +```bash +# Get page title +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.title" + }' + +# Count elements +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.querySelectorAll(\"button\").length" + }' + +# Execute async code with await +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "fetch(\"https://api.example.com/data\").then(r => r.json())", + "awaitPromise": true + }' +``` + +**Use Cases:** +- Extract specific data from the page (e.g., element counts, text content) +- Verify JavaScript state/variables for evaluations +- Check DOM state programmatically +- Execute custom validation logic +- Interact with page APIs directly + +This endpoint complements `/page/content` by allowing precise JavaScript execution rather than just fetching full HTML/text content. + #### `POST /tabs/open` Open a new browser tab. diff --git a/agent-server/nodejs/CLAUDE.md b/agent-server/nodejs/CLAUDE.md index 267cf3e791..de04b5c0b6 100644 --- a/agent-server/nodejs/CLAUDE.md +++ b/agent-server/nodejs/CLAUDE.md @@ -39,7 +39,7 @@ The eval-server is a **thin HTTP API wrapper for Browser Operator**. 
It provides ### HTTP API Server (src/api-server.js) - Exposes REST endpoints for external callers (e.g., Python evals) - Main endpoint: `POST /v1/responses` - Send task to agent -- CDP endpoints: screenshot, page content, tab management +- CDP endpoints: screenshot, page content, JavaScript execution, tab management - Returns metadata (clientId, tabId) for subsequent operations ### RPC Client (src/rpc-client.js) @@ -57,6 +57,7 @@ The eval-server is a **thin HTTP API wrapper for Browser Operator**. It provides - Direct Chrome DevTools Protocol communication - Screenshot capture via `Page.captureScreenshot` - Page content access via `Runtime.evaluate` +- JavaScript execution via `Runtime.evaluate` (with configurable options) - Tab management via `Target.createTarget` / `Target.closeTarget` ### Logger (src/logger.js) @@ -212,6 +213,109 @@ Get HTML or text content of a page. } ``` +**Response:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "content": "...", + "format": "html", + "length": 12345, + "timestamp": 1234567890 +} +``` + +### POST /page/execute + +Execute JavaScript code in the context of a specific browser tab via Chrome DevTools Protocol. 
+ +**Request:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.title", + "returnByValue": true, + "awaitPromise": false +} +``` + +**Parameters:** +- `clientId` (required): The client ID from `/v1/responses` metadata +- `tabId` (required): The tab ID from `/v1/responses` metadata +- `expression` (required): JavaScript code to execute (string) +- `returnByValue` (optional, default: `true`): Whether to return result by value or as object reference +- `awaitPromise` (optional, default: `false`): Whether to await if the result is a Promise + +**Response:** +```json +{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "result": { + "type": "string", + "value": "Example Page Title" + }, + "exceptionDetails": null, + "timestamp": 1234567890 +} +``` + +**Response Fields:** +- `clientId`: Base client ID (without tab suffix) +- `tabId`: The tab ID where JavaScript was executed +- `result`: CDP `Runtime.evaluate` result object containing: + - `type`: Result type (string, number, object, etc.) 
+ - `value`: The actual value (if `returnByValue: true`) +- `exceptionDetails`: Error details if execution failed, otherwise `null` +- `timestamp`: Unix timestamp in milliseconds + +**Implementation:** +- Uses CDP `Runtime.evaluate` via `browserAgentServer.evaluateJavaScript()` +- Executes code in the page's main JavaScript context +- First 100 characters of expression logged for debugging + +**Example Usage:** + +```bash +# Get page title +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.title" + }' + +# Count elements +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "document.querySelectorAll(\"button\").length" + }' + +# Execute async code with await +curl -X POST http://localhost:8080/page/execute \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", + "tabId": "482D56EE57B1931A3B9D1BFDAF935429", + "expression": "fetch(\"https://api.example.com/data\").then(r => r.json())", + "awaitPromise": true + }' +``` + +**Use Cases:** +- Extract specific data from the page (e.g., element counts, text content) +- Verify JavaScript state/variables for evaluations +- Check DOM state programmatically +- Execute custom validation logic +- Interact with page APIs directly + +This endpoint complements `/page/content` by allowing precise JavaScript execution rather than just fetching full HTML/text content. + ### POST /tabs/open, POST /tabs/close Tab management via CDP. 
@@ -412,5 +516,6 @@ Removed dependencies: - ✅ HTTP REST API endpoints - ✅ CDP screenshot capture - ✅ CDP page content retrieval +- ✅ CDP JavaScript execution - ✅ CDP tab management - ✅ Return metadata (clientId, tabId) for screenshot capture diff --git a/agent-server/nodejs/src/api-server.js b/agent-server/nodejs/src/api-server.js index afb9acee1e..9a837bbd26 100644 --- a/agent-server/nodejs/src/api-server.js +++ b/agent-server/nodejs/src/api-server.js @@ -284,7 +284,7 @@ class APIServer { } async getPageContent(payload) { - const { clientId, tabId, format = 'html' } = payload; + const { clientId, tabId, format = 'html', includeIframes = false } = payload; if (!clientId) { throw new Error('Client ID is required'); @@ -300,14 +300,14 @@ class APIServer { const baseClientId = clientId.split(':')[0]; - logger.info('Getting page content', { baseClientId, tabId, format }); + logger.info('Getting page content', { baseClientId, tabId, format, includeIframes }); // Call appropriate method based on format const result = format === 'html' - ? await this.browserAgentServer.getPageHTML(tabId) - : await this.browserAgentServer.getPageText(tabId); + ? 
await this.browserAgentServer.getPageHTML(tabId, { includeIframes }) + : await this.browserAgentServer.getPageText(tabId, { includeIframes }); - return { + const response = { clientId: baseClientId, tabId: result.tabId, content: result.content, @@ -315,6 +315,13 @@ class APIServer { length: result.length, timestamp: Date.now() }; + + // Include frame count if iframes were captured + if (result.frameCount !== undefined) { + response.frameCount = result.frameCount; + } + + return response; } async getScreenshot(payload) { diff --git a/agent-server/nodejs/src/lib/BrowserAgentServer.js b/agent-server/nodejs/src/lib/BrowserAgentServer.js index 736ce8454e..47ecc88e68 100644 --- a/agent-server/nodejs/src/lib/BrowserAgentServer.js +++ b/agent-server/nodejs/src/lib/BrowserAgentServer.js @@ -1110,28 +1110,50 @@ export class BrowserAgentServer extends EventEmitter { * @param {string} tabId - Tab ID (target ID) * @returns {Promise} Result with HTML content */ - async getPageHTML(tabId) { + async getPageHTML(tabId, options = {}) { + const { includeIframes = false } = options; + try { - logger.info('Getting page HTML via CDP', { tabId }); + logger.info('Getting page HTML via CDP', { tabId, includeIframes }); - // Use Runtime.evaluate to get document.documentElement.outerHTML - const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { - expression: 'document.documentElement.outerHTML', - returnByValue: true - }); + if (!includeIframes) { + // Original behavior - main frame only + const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.documentElement.outerHTML', + returnByValue: true + }); + + const html = result.result.value; + + logger.info('Page HTML retrieved successfully', { + tabId, + length: html.length + }); + + return { + tabId, + content: html, + format: 'html', + length: html.length + }; + } - const html = result.result.value; + // Enhanced behavior - include all frames + const frameTree = await 
this.sendCDPCommandToTarget(tabId, 'Page.getFrameTree', {}); + const allFrameHTML = await this.captureAllFramesHTML(tabId, frameTree.frameTree); - logger.info('Page HTML retrieved successfully', { + logger.info('Page HTML with iframes retrieved successfully', { tabId, - length: html.length + length: allFrameHTML.length, + frameCount: this.countFrames(frameTree.frameTree) }); return { tabId, - content: html, + content: allFrameHTML, format: 'html', - length: html.length + length: allFrameHTML.length, + frameCount: this.countFrames(frameTree.frameTree) }; } catch (error) { logger.error('Failed to get page HTML via CDP', { @@ -1142,6 +1164,88 @@ export class BrowserAgentServer extends EventEmitter { } } + /** + * Recursively capture HTML from all frames + * @param {string} tabId - Tab ID + * @param {Object} frameTree - Frame tree from Page.getFrameTree + * @returns {Promise} Combined HTML from all frames + */ + async captureAllFramesHTML(tabId, frameTree) { + const frames = []; + + // Helper to recursively collect frames + const collectFrames = (node) => { + frames.push(node.frame); + if (node.childFrames) { + node.childFrames.forEach(collectFrames); + } + }; + + collectFrames(frameTree); + + logger.info('Collecting HTML from frames', { tabId, frameCount: frames.length }); + + // Capture HTML from each frame + const htmlParts = []; + + for (const frame of frames) { + try { + const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.documentElement.outerHTML', + returnByValue: true, + contextId: frame.id // Execute in specific frame context + }); + + htmlParts.push({ + frameId: frame.id, + url: frame.url, + html: result.result.value + }); + + logger.info('Captured frame HTML', { + tabId, + frameId: frame.id, + url: frame.url, + length: result.result.value.length + }); + } catch (error) { + logger.warn('Failed to capture frame HTML', { + tabId, + frameId: frame.id, + error: error.message + }); + } + } + + // Combine HTML with 
frame markers + if (htmlParts.length === 0) { + throw new Error('No frames captured'); + } + + let combined = htmlParts[0].html; // Main frame + + for (let i = 1; i < htmlParts.length; i++) { + combined += `\n\n\n${htmlParts[i].html}`; + } + + return combined; + } + + /** + * Count total frames in frame tree + * @param {Object} frameTree - Frame tree node + * @returns {number} Total frame count + */ + countFrames(frameTree) { + let count = 1; // Current frame + if (frameTree.childFrames) { + frameTree.childFrames.forEach(child => { + count += this.countFrames(child); + }); + } + return count; + } + /** * Get page text content using CDP * @param {string} tabId - Tab ID (target ID) From 1d859b273e83ecf9516e981fd03ecfce6ba765f2 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 19 Nov 2025 18:27:52 -0600 Subject: [PATCH 2/3] Capture iframes when retrieving page snapshot through agent-server API --- .../nodejs/src/lib/BrowserAgentServer.js | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/agent-server/nodejs/src/lib/BrowserAgentServer.js b/agent-server/nodejs/src/lib/BrowserAgentServer.js index 47ecc88e68..2e7a99ccfe 100644 --- a/agent-server/nodejs/src/lib/BrowserAgentServer.js +++ b/agent-server/nodejs/src/lib/BrowserAgentServer.js @@ -1188,13 +1188,35 @@ export class BrowserAgentServer extends EventEmitter { // Capture HTML from each frame const htmlParts = []; - for (const frame of frames) { + for (let i = 0; i < frames.length; i++) { + const frame = frames[i]; try { - const result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { - expression: 'document.documentElement.outerHTML', - returnByValue: true, - contextId: frame.id // Execute in specific frame context - }); + // For the main frame (first frame), use Runtime.evaluate without frameId + // For child frames, we need to create an execution context in that frame + + let result; + if (i === 0) { + // Main frame - simple evaluation + result = await 
this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.documentElement.outerHTML', + returnByValue: true + }); + } else { + // Child frame - need to create isolated world in the frame's context + // First, create an execution context in this frame + const contextResult = await this.sendCDPCommandToTarget(tabId, 'Page.createIsolatedWorld', { + frameId: frame.id, + grantUniversalAccess: true, + worldName: 'iframe-capture' + }); + + // Now evaluate in that context + result = await this.sendCDPCommandToTarget(tabId, 'Runtime.evaluate', { + expression: 'document.documentElement.outerHTML', + returnByValue: true, + contextId: contextResult.executionContextId + }); + } htmlParts.push({ frameId: frame.id, @@ -1212,6 +1234,7 @@ export class BrowserAgentServer extends EventEmitter { logger.warn('Failed to capture frame HTML', { tabId, frameId: frame.id, + url: frame.url, error: error.message }); } From ba87432dc01dccc9231da96f5d474af5f3af2542 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Sat, 22 Nov 2025 10:02:15 -0600 Subject: [PATCH 3/3] Minor fixes --- agent-server/nodejs/CLAUDE.md | 13 ++++++++++++- agent-server/nodejs/src/lib/BrowserAgentServer.js | 6 +++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/agent-server/nodejs/CLAUDE.md b/agent-server/nodejs/CLAUDE.md index de04b5c0b6..c2330afd42 100644 --- a/agent-server/nodejs/CLAUDE.md +++ b/agent-server/nodejs/CLAUDE.md @@ -209,10 +209,17 @@ Get HTML or text content of a page. { "clientId": "9907fd8d-92a8-4a6a-bce9-458ec8c57306", "tabId": "482D56EE57B1931A3B9D1BFDAF935429", - "format": "html" + "format": "html", + "includeIframes": true } ``` +**Parameters:** +- `clientId` (required): The client ID from `/v1/responses` metadata +- `tabId` (required): The tab ID from `/v1/responses` metadata +- `format` (optional, default: `"html"`): Content format - either `"html"` or `"text"` +- `includeIframes` (optional, default: `false`): Whether to include HTML content from iframes. 
When `true`, recursively captures content from all iframe elements on the page. + **Response:** ```json { @@ -221,10 +228,14 @@ Get HTML or text content of a page. "content": "...", "format": "html", "length": 12345, + "frameCount": 3, "timestamp": 1234567890 } ``` +**Response fields:** +- `frameCount` (number, optional): Total number of frames in the page's frame tree (the main frame plus all nested iframes), including any frames whose content could not be captured. Only present when `includeIframes: true` is used. + ### POST /page/execute Execute JavaScript code in the context of a specific browser tab via Chrome DevTools Protocol. diff --git a/agent-server/nodejs/src/lib/BrowserAgentServer.js b/agent-server/nodejs/src/lib/BrowserAgentServer.js index 2e7a99ccfe..a9e181c67b 100644 --- a/agent-server/nodejs/src/lib/BrowserAgentServer.js +++ b/agent-server/nodejs/src/lib/BrowserAgentServer.js @@ -1166,8 +1166,8 @@ export class BrowserAgentServer extends EventEmitter { /** * Recursively capture HTML from all frames - * @param {string} tabId - Tab ID - * @param {Object} frameTree - Frame tree from Page.getFrameTree + * @param {string} tabId Tab ID + * @param {Object} frameTree Frame tree from Page.getFrameTree * @returns {Promise} Combined HTML from all frames */ async captureAllFramesHTML(tabId, frameTree) { @@ -1256,7 +1256,7 @@ export class BrowserAgentServer extends EventEmitter { /** * Count total frames in frame tree - * @param {Object} frameTree - Frame tree node + * @param {Object} frameTree Frame tree node * @returns {number} Total frame count */ countFrames(frameTree) {