From 80075775b26dae1ba428a1113bdc616bc42c9cbc Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 21 May 2026 12:50:29 -0700 Subject: [PATCH] Add read_url tool for web research --- agents/context-pruner.ts | 4 + agents/researcher/researcher-web.ts | 8 +- agents/types/agent-definition.ts | 2 +- agents/types/tools.ts | 12 + .../templates/initial-agents-dir/README.md | 1 + .../types/agent-definition.ts | 2 +- .../initial-agents-dir/types/tools.ts | 12 + common/src/tools/constants.ts | 2 + common/src/tools/list.ts | 6 + common/src/tools/params/tool/read-url.ts | 81 ++++ .../agent-runtime/src/tools/handlers/list.ts | 2 + .../src/tools/handlers/tool/read-url.ts | 21 + .../agent-runtime/src/tools/tool-executor.ts | 1 + sdk/src/__tests__/read-url.test.ts | 229 ++++++++++ .../researcher-web.integration.test.ts | 79 +++- sdk/src/run.ts | 3 + sdk/src/tools/read-url.ts | 413 ++++++++++++++++++ 17 files changed, 870 insertions(+), 8 deletions(-) create mode 100644 common/src/tools/params/tool/read-url.ts create mode 100644 packages/agent-runtime/src/tools/handlers/tool/read-url.ts create mode 100644 sdk/src/__tests__/read-url.test.ts create mode 100644 sdk/src/tools/read-url.ts diff --git a/agents/context-pruner.ts b/agents/context-pruner.ts index f60b569d9a..1a333a8f09 100644 --- a/agents/context-pruner.ts +++ b/agents/context-pruner.ts @@ -307,6 +307,10 @@ const definition: AgentDefinition = { ? `web search for "${query}"` : 'web search' } + case 'read_url': { + const url = input.url as string | undefined + return url ? `read URL: ${url}` : 'read a URL' + } case 'gravity_index': { const query = input.query as string | undefined const action = input.action as string | undefined diff --git a/agents/researcher/researcher-web.ts b/agents/researcher/researcher-web.ts index 28b1027689..3be3071928 100644 --- a/agents/researcher/researcher-web.ts +++ b/agents/researcher/researcher-web.ts @@ -16,15 +16,17 @@ const definition: SecretAgentDefinition = { }, outputMode: 'last_message', includeMessageHistory: false, - toolNames: ['web_search', 'run_terminal_command'], + toolNames: ['web_search', 'read_url'], spawnableAgents: [], - systemPrompt: `You are an expert researcher who can search the web to find relevant information. Your goal is to answer the user's question from current search results and any useful source pages. Use web_search to get Serper JSON search results. Use run_terminal_command with tools like curl to fetch web pages that would help answer the user's question.`, + systemPrompt: `You are an expert researcher who can search the web to find relevant information. Your goal is to answer the user's question from current search results and useful source pages. Use web_search to get Serper JSON search results. Use read_url to fetch and extract readable text from pages that would help answer the user's question.`, instructionsPrompt: `Provide comprehensive research on the user's prompt. Use web_search to find current information. The tool returns JSON search results, so inspect the titles, links, snippets, answer boxes, and related results before deciding what to fetch next. -Use run_terminal_command to fetch any web page that would help answer the user's question. Prefer targeted, relevant pages from the search results. Avoid fetching pages that are unlikely to add useful evidence. +Use read_url to fetch any web page that would help answer the user's question. Prefer targeted, relevant pages from the search results, especially official or primary sources. Avoid fetching pages that are unlikely to add useful evidence. + +If read_url cannot handle a source, choose a different result or explain the limitation. Then, write up a concise answer that includes key findings for the user's prompt and cites source URLs when useful. `.trim(), diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts index 030de3a14f..b2b157ab09 100644 --- a/agents/types/agent-definition.ts +++ b/agents/types/agent-definition.ts @@ -345,7 +345,7 @@ export type TerminalTools = 'run_terminal_command' | 'code_search' /** * Web and browser tools */ -export type WebTools = 'web_search' | 'read_docs' +export type WebTools = 'web_search' | 'read_docs' | 'read_url' /** * Agent management tools diff --git a/agents/types/tools.ts b/agents/types/tools.ts index c3b627859e..b330950757 100644 --- a/agents/types/tools.ts +++ b/agents/types/tools.ts @@ -17,6 +17,7 @@ export type ToolName = | 'read_docs' | 'read_files' | 'read_subtree' + | 'read_url' | 'render_ui' | 'run_file_change_hooks' | 'run_terminal_command' @@ -51,6 +52,7 @@ export interface ToolParamsMap { read_docs: ReadDocsParams read_files: ReadFilesParams read_subtree: ReadSubtreeParams + read_url: ReadUrlParams render_ui: RenderUiParams run_file_change_hooks: RunFileChangeHooksParams run_terminal_command: RunTerminalCommandParams @@ -276,6 +278,16 @@ export interface ReadSubtreeParams { maxTokens?: number } +/** + * Fetch a URL and extract readable text from the page. + */ +export interface ReadUrlParams { + /** The full http:// or https:// URL to fetch and extract readable text from. */ + url: string + /** Maximum number of extracted text characters to return. Defaults to 20000. */ + max_chars?: number +} + /** * Render a small interactive UI widget in the Codebuff CLI. Currently supports a button that opens a link. */ diff --git a/common/src/templates/initial-agents-dir/README.md b/common/src/templates/initial-agents-dir/README.md index c02ddab90a..43053980d3 100644 --- a/common/src/templates/initial-agents-dir/README.md +++ b/common/src/templates/initial-agents-dir/README.md @@ -132,6 +132,7 @@ export default { ### Web & Research - **`web_search`**: Search the internet for information +- **`read_url`**: Fetch a URL and extract readable page text - **`read_docs`**: Read technical documentation - **`browser_logs`**: Navigate and inspect web pages diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index 030de3a14f..b2b157ab09 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -345,7 +345,7 @@ export type TerminalTools = 'run_terminal_command' | 'code_search' /** * Web and browser tools */ -export type WebTools = 'web_search' | 'read_docs' +export type WebTools = 'web_search' | 'read_docs' | 'read_url' /** * Agent management tools diff --git a/common/src/templates/initial-agents-dir/types/tools.ts b/common/src/templates/initial-agents-dir/types/tools.ts index c3b627859e..b330950757 100644 --- a/common/src/templates/initial-agents-dir/types/tools.ts +++ b/common/src/templates/initial-agents-dir/types/tools.ts @@ -17,6 +17,7 @@ export type ToolName = | 'read_docs' | 'read_files' | 'read_subtree' + | 'read_url' | 'render_ui' | 'run_file_change_hooks' | 'run_terminal_command' @@ -51,6 +52,7 @@ export interface ToolParamsMap { read_docs: ReadDocsParams read_files: ReadFilesParams read_subtree: ReadSubtreeParams + read_url: ReadUrlParams render_ui: RenderUiParams run_file_change_hooks: RunFileChangeHooksParams run_terminal_command: RunTerminalCommandParams @@ -276,6 +278,16 @@ export interface ReadSubtreeParams { maxTokens?: number } +/** + * Fetch a URL and extract readable text from the page. + */ +export interface ReadUrlParams { + /** The full http:// or https:// URL to fetch and extract readable text from. */ + url: string + /** Maximum number of extracted text characters to return. Defaults to 20000. */ + max_chars?: number +} + /** * Render a small interactive UI widget in the Codebuff CLI. Currently supports a button that opens a link. */ diff --git a/common/src/tools/constants.ts b/common/src/tools/constants.ts index b34f890bcd..5fe789eb76 100644 --- a/common/src/tools/constants.ts +++ b/common/src/tools/constants.ts @@ -39,6 +39,7 @@ export const toolNames = [ 'read_docs', 'read_files', 'read_subtree', + 'read_url', 'render_ui', 'run_file_change_hooks', 'run_terminal_command', @@ -73,6 +74,7 @@ export const publishedTools = [ 'read_docs', 'read_files', 'read_subtree', + 'read_url', 'render_ui', 'run_file_change_hooks', 'run_terminal_command', diff --git a/common/src/tools/list.ts b/common/src/tools/list.ts index 9b3d3ba687..4f40570d0e 100644 --- a/common/src/tools/list.ts +++ b/common/src/tools/list.ts @@ -19,6 +19,7 @@ import { proposeWriteFileParams } from './params/tool/propose-write-file' import { readDocsParams } from './params/tool/read-docs' import { readFilesParams } from './params/tool/read-files' import { readSubtreeParams } from './params/tool/read-subtree' +import { readUrlParams } from './params/tool/read-url' import { renderUIParams } from './params/tool/render-ui' import { runFileChangeHooksParams } from './params/tool/run-file-change-hooks' import { runTerminalCommandParams } from './params/tool/run-terminal-command' @@ -59,6 +60,7 @@ export const toolParams = { read_docs: readDocsParams, read_files: readFilesParams, read_subtree: readSubtreeParams, + read_url: readUrlParams, render_ui: renderUIParams, run_file_change_hooks: runFileChangeHooksParams, run_terminal_command: runTerminalCommandParams, @@ -131,6 +133,10 @@ export const clientToolCallSchema = z.discriminatedUnion('toolName', [ toolName: z.literal('run_file_change_hooks'), input: toolParams.run_file_change_hooks.inputSchema, }), + z.object({ + toolName: z.literal('read_url'), + input: toolParams.read_url.inputSchema, + }), z.object({ toolName: z.literal('run_terminal_command'), input: toolParams.run_terminal_command.inputSchema.and( diff --git a/common/src/tools/params/tool/read-url.ts b/common/src/tools/params/tool/read-url.ts new file mode 100644 index 0000000000..fc7069d65a --- /dev/null +++ b/common/src/tools/params/tool/read-url.ts @@ -0,0 +1,81 @@ +import z from 'zod/v4' + +import { $getNativeToolCallExampleString, jsonToolResultSchema } from '../utils' + +import type { $ToolParams } from '../../constants' + +const toolName = 'read_url' +const endsAgentStep = true +const inputSchema = z + .object({ + url: z + .url() + .refine((value) => { + try { + const parsedUrl = new URL(value) + return ( + parsedUrl.protocol === 'http:' || parsedUrl.protocol === 'https:' + ) + } catch { + return false + } + }, 'URL must use http:// or https://') + .describe( + 'The full http:// or https:// URL to fetch and extract readable text from.', + ), + max_chars: z + .number() + .int() + .min(1_000) + .max(50_000) + .default(20_000) + .optional() + .describe( + 'Maximum number of extracted text characters to return. Defaults to 20000.', + ), + }) + .describe('Fetch a URL and extract readable text from the page.') + +const description = ` +Purpose: Fetch a URL returned by web_search and extract the readable page text so you can answer with source-backed evidence. + +Use this after web_search when snippets are not enough. Prefer authoritative, relevant pages from the search results. The tool follows redirects, extracts titles and metadata, strips scripts/styles/navigation boilerplate from HTML, and returns normalized readable text. + +Do not use run_terminal_command with curl just to inspect web pages; use read_url instead. If read_url reports unsupported content or extraction failure, then choose a different search result or explain the limitation. + +Example: +${$getNativeToolCallExampleString({ + toolName, + inputSchema, + input: { + url: 'https://react.dev/reference/react/useActionState', + max_chars: 12000, + }, + endsAgentStep, +})} +`.trim() + +export const readUrlParams = { + toolName, + endsAgentStep, + description, + inputSchema, + outputSchema: jsonToolResultSchema( + z.union([ + z.object({ + url: z.string(), + finalUrl: z.string(), + status: z.number(), + contentType: z.string().optional(), + title: z.string().optional(), + description: z.string().optional(), + text: z.string(), + truncated: z.boolean(), + }), + z.object({ + url: z.string().optional(), + errorMessage: z.string(), + }), + ]), + ), +} satisfies $ToolParams diff --git a/packages/agent-runtime/src/tools/handlers/list.ts b/packages/agent-runtime/src/tools/handlers/list.ts index 32df1f6784..abb7c340db 100644 --- a/packages/agent-runtime/src/tools/handlers/list.ts +++ b/packages/agent-runtime/src/tools/handlers/list.ts @@ -16,6 +16,7 @@ import { handleProposeWriteFile } from './tool/propose-write-file' import { handleReadDocs } from './tool/read-docs' import { handleReadFiles } from './tool/read-files' import { handleReadSubtree } from './tool/read-subtree' +import { handleReadUrl } from './tool/read-url' import { handleRenderUI } from './tool/render-ui' import { handleRunFileChangeHooks } from './tool/run-file-change-hooks' import { handleRunTerminalCommand } from './tool/run-terminal-command' @@ -64,6 +65,7 @@ export const codebuffToolHandlers = { read_docs: handleReadDocs, read_files: handleReadFiles, read_subtree: handleReadSubtree, + read_url: handleReadUrl, render_ui: handleRenderUI, run_file_change_hooks: handleRunFileChangeHooks, run_terminal_command: handleRunTerminalCommand, diff --git a/packages/agent-runtime/src/tools/handlers/tool/read-url.ts b/packages/agent-runtime/src/tools/handlers/tool/read-url.ts new file mode 100644 index 0000000000..032d39612c --- /dev/null +++ b/packages/agent-runtime/src/tools/handlers/tool/read-url.ts @@ -0,0 +1,21 @@ +import type { CodebuffToolHandlerFunction } from '../handler-function-type' +import type { + ClientToolCall, + CodebuffToolCall, + CodebuffToolOutput, +} from '@codebuff/common/tools/list' + +export const handleReadUrl = (async (params: { + previousToolCallFinished: Promise + toolCall: CodebuffToolCall<'read_url'> + requestClientToolCall: ( + toolCall: ClientToolCall<'read_url'>, + ) => Promise> +}): Promise<{ + output: CodebuffToolOutput<'read_url'> +}> => { + const { previousToolCallFinished, toolCall, requestClientToolCall } = params + + await previousToolCallFinished + return { output: await requestClientToolCall(toolCall) } +}) satisfies CodebuffToolHandlerFunction<'read_url'> diff --git a/packages/agent-runtime/src/tools/tool-executor.ts b/packages/agent-runtime/src/tools/tool-executor.ts index 8fd7130bf5..e6342f405d 100644 --- a/packages/agent-runtime/src/tools/tool-executor.ts +++ b/packages/agent-runtime/src/tools/tool-executor.ts @@ -62,6 +62,7 @@ const bareStringFieldRepairAllowlist: Partial< lookup_agent_info: ['agentId'], read_files: ['paths'], read_subtree: ['paths'], + read_url: ['url'], skill: ['name'], web_search: ['query'], } diff --git a/sdk/src/__tests__/read-url.test.ts b/sdk/src/__tests__/read-url.test.ts new file mode 100644 index 0000000000..4f86aff24f --- /dev/null +++ b/sdk/src/__tests__/read-url.test.ts @@ -0,0 +1,229 @@ +import { describe, expect, it } from 'bun:test' + +import { clientToolCallSchema } from '@codebuff/common/tools/list' + +import { readUrl } from '../tools/read-url' + +const successValue = async ( + html: string, + init?: { + contentType?: string + url?: string + }, +) => { + const fetch = async () => + new Response(html, { + status: 200, + headers: { + 'content-type': init?.contentType ?? 'text/html; charset=utf-8', + }, + }) + + const result = await readUrl({ + url: init?.url ?? 'https://example.com/article', + fetch, + }) + return result[0].value +} + +describe('readUrl', () => { + it('extracts readable HTML text beyond front-loaded boilerplate', async () => { + const boilerplate = Array.from( + { length: 80 }, + (_, index) => `.unused-${index} { color: red; }`, + ).join('\n') + const result = await successValue(` + + + + Research Source + + + + + +
Top navigation should disappear
+
+
+

Important Answer

+

The web researcher should see this useful paragraph.

+

React 19 useActionState returns state, a form action, and pending state.

+
+
+
Footer boilerplate should disappear
+ + + `) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.title).toBe('Research Source') + expect(result.description).toBe('A concise source description.') + expect(result.text).toContain('Important Answer') + expect(result.text).toContain('useActionState returns state') + expect(result.text).not.toContain('.unused-') + expect(result.text).not.toContain('Top navigation') + }) + + it('prefers article content over a larger page main area', async () => { + const result = await successValue(` + + Repository Page + +
+
+

Folders and files

+ ${Array.from( + { length: 40 }, + (_, index) => `file-${index}.ts`, + ).join('')} +
+
+

Project README

+

This is the source content the researcher needs.

+
+
+ + + `) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.text).toContain('Project README') + expect(result.text).toContain('source content') + expect(result.text).not.toContain('Folders and files') + expect(result.text).not.toContain('file-39.ts') + }) + + it('does not add spaces between syntax-highlighted code tokens', async () => { + const result = await successValue(` +
+
const answer=42;
+
+ `) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.text).toContain('const answer=42;') + }) + + it('leaves invalid numeric HTML entities unchanged', async () => { + const result = await successValue( + '

Bad entity: �

', + ) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.text).toContain('Bad entity: �') + }) + + it('rejects non-http URLs', async () => { + const result = await readUrl({ + url: 'file:///etc/passwd', + fetch: async () => { + throw new Error('fetch should not be called') + }, + }) + + expect(result[0].value).toEqual({ + url: 'file:///etc/passwd', + errorMessage: 'Only http:// and https:// URLs are supported', + }) + }) + + it('rejects non-http URLs at the tool schema boundary', () => { + expect(() => + clientToolCallSchema.parse({ + toolName: 'read_url', + input: { url: 'file:///etc/passwd' }, + }), + ).toThrow() + }) + + it('truncates extracted text to max_chars', async () => { + const result = await readUrl({ + url: 'https://example.com/long', + max_chars: 1_000, + fetch: async () => + new Response(`

${'word '.repeat(1_000)}

`, { + status: 200, + headers: { 'content-type': 'text/html' }, + }), + }) + const value = result[0].value + + expect('errorMessage' in value).toBe(false) + if ('errorMessage' in value) return + + expect(value.truncated).toBe(true) + expect(value.text.length).toBeLessThanOrEqual(1_030) + expect(value.text).toContain('[Content truncated]') + }) + + it('returns pretty-printed JSON for JSON responses', async () => { + const result = await successValue('{"name":"Codebuff","answer":42}', { + contentType: 'application/json', + }) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.text).toContain('"name": "Codebuff"') + expect(result.text).toContain('"answer": 42') + }) + + it('supports vendor JSON content types', async () => { + const result = await successValue('{"type":"metadata"}', { + contentType: 'application/ld+json', + }) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.text).toContain('"type": "metadata"') + }) + + it('extracts markdown frontmatter into metadata and omits it from text', async () => { + const result = await successValue( + [ + '---', + 'title: "Readable Docs"', + "description: 'A useful docs page'", + '---', + '# First Heading', + 'Body with · entity.', + ].join('\n'), + { + contentType: 'text/markdown; charset=utf-8', + }, + ) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.title).toBe('Readable Docs') + expect(result.description).toBe('A useful docs page') + expect(result.text.startsWith('# First Heading')).toBe(true) + expect(result.text).toContain('Body with * entity.') + expect(result.text).not.toContain('title:') + }) + + it('supports CRLF markdown frontmatter', async () => { + const result = await successValue( + '---\r\ntitle: CRLF Docs\r\n---\r\n# Body', + { + contentType: 'text/markdown; charset=utf-8', + }, + ) + + expect('errorMessage' in result).toBe(false) + if ('errorMessage' in result) return + + expect(result.title).toBe('CRLF Docs') + expect(result.text).toBe('# Body') + }) +}) diff --git a/sdk/src/__tests__/researcher-web.integration.test.ts b/sdk/src/__tests__/researcher-web.integration.test.ts index d35498bec4..a5e981654a 100644 --- a/sdk/src/__tests__/researcher-web.integration.test.ts +++ b/sdk/src/__tests__/researcher-web.integration.test.ts @@ -63,6 +63,72 @@ function extractOutputText(output: AgentOutput): string { return assistantText.join('\n') } +function summarizeToolTrace(events: PrintModeEvent[]): { + readUrlCount: number + lines: string[] +} { + const lines: string[] = [] + let readUrlCount = 0 + + for (const event of events) { + if (event.type === 'tool_call') { + if (event.toolName === 'web_search') { + lines.push(`tool_call web_search query=${event.input.query}`) + } else if (event.toolName === 'read_url') { + readUrlCount += 1 + lines.push(`tool_call read_url url=${event.input.url}`) + } else { + lines.push(`tool_call ${event.toolName}`) + } + continue + } + + if (event.type !== 'tool_result') continue + + const output = event.output[0] + const value = output?.type === 'json' ? output.value : undefined + if (!value || typeof value !== 'object') { + lines.push(`tool_result ${event.toolName} empty`) + continue + } + + if (event.toolName === 'read_url') { + const result = value as { + url?: string + finalUrl?: string + status?: number + title?: string + text?: string + truncated?: boolean + errorMessage?: string + } + if (result.errorMessage) { + lines.push(`tool_result read_url error=${result.errorMessage}`) + } else { + lines.push( + [ + 'tool_result read_url', + `status=${result.status}`, + `finalUrl=${result.finalUrl}`, + `title=${JSON.stringify(result.title ?? '')}`, + `textChars=${result.text?.length ?? 0}`, + `truncated=${result.truncated ?? false}`, + ].join(' '), + ) + } + } else if (event.toolName === 'web_search') { + const result = value as { result?: string; errorMessage?: string } + lines.push( + result.errorMessage + ? `tool_result web_search error=${result.errorMessage}` + : `tool_result web_search chars=${result.result?.length ?? 0}`, + ) + } + } + + return { readUrlCount, lines } +} + describe('researcher-web SDK integration', () => { it( `runs researcher-web through the SDK and answers with ${EXPECTED_KEYWORD}`, @@ -98,13 +164,21 @@ describe('researcher-web SDK integration', () => { }, prompt: [ 'Use web search to answer this React docs question.', - 'After searching, fetch the most relevant React docs page with run_terminal_command before answering.', + 'After searching, fetch the most relevant React docs page with read_url before answering.', 'In React 19, which hook returns state, a form action, and an isPending value for form actions?', 'Answer with the exact hook name and one short sentence.', ].join(' '), }) const outputText = extractOutputText(result.output) + const trace = summarizeToolTrace(events) + console.log( + [ + 'researcher-web SDK trace:', + ...trace.lines.map((line) => ` ${line}`), + `read_url fetch count: ${trace.readUrlCount}`, + ].join('\n'), + ) console.log('researcher-web SDK output:', outputText) expect(result.output.type).not.toBe('error') @@ -119,8 +193,7 @@ describe('researcher-web SDK integration', () => { expect( events.some( (event) => - event.type === 'tool_call' && - event.toolName === 'run_terminal_command', + event.type === 'tool_call' && event.toolName === 'read_url', ), ).toBe(true) }, diff --git a/sdk/src/run.ts b/sdk/src/run.ts index b492443c39..4014e85449 100644 --- a/sdk/src/run.ts +++ b/sdk/src/run.ts @@ -29,6 +29,7 @@ import { glob } from './tools/glob' import { listDirectory } from './tools/list-directory' import { getProjectPathLookupKeys } from './tools/path-utils' import { getFiles } from './tools/read-files' +import { readUrl } from './tools/read-url' import { runTerminalCommand } from './tools/run-terminal-command' import type { CustomToolDefinition } from './custom-tool' @@ -704,6 +705,8 @@ async function handleToolCall({ cwd: path.resolve(resolvedCwd, input.cwd ?? '.'), env, } as Parameters[0]) + } else if (toolName === 'read_url') { + result = await readUrl(input as Parameters[0]) } else if (toolName === 'code_search') { result = await codeSearch({ projectPath: requireCwd(cwd, 'code_search'), diff --git a/sdk/src/tools/read-url.ts b/sdk/src/tools/read-url.ts new file mode 100644 index 0000000000..9bd5c89f86 --- /dev/null +++ b/sdk/src/tools/read-url.ts @@ -0,0 +1,413 @@ +import type { CodebuffToolOutput } from '../../../common/src/tools/list' + +const DEFAULT_MAX_CHARS = 20_000 +const MAX_RESPONSE_BYTES = 2_000_000 +const FETCH_TIMEOUT_MS = 20_000 +const USER_AGENT = + 'Mozilla/5.0 (compatible; CodebuffResearchBot/1.0; +https://codebuff.com)' + +type ReadUrlOutput = CodebuffToolOutput<'read_url'> +type FetchLike = ( + input: string | URL | Request, + init?: RequestInit, +) => Promise + +function errorResult( + url: string | undefined, + errorMessage: string, +): ReadUrlOutput { + return [{ type: 'json', value: { ...(url ? { url } : {}), errorMessage } }] +} + +function isAllowedUrl(url: URL): boolean { + return url.protocol === 'http:' || url.protocol === 'https:' +} + +function getHeader(headers: Headers, name: string): string | undefined { + return headers.get(name) ?? undefined +} + +async function readResponseBody( + response: Response, + maxBytes: number, +): Promise { + const contentLength = getHeader(response.headers, 'content-length') + if (contentLength && Number(contentLength) > maxBytes) { + throw new Error(`Response is too large (${contentLength} bytes)`) + } + + if (!response.body) { + const buffer = await response.arrayBuffer() + if (buffer.byteLength > maxBytes) { + throw new Error(`Response is too large (${buffer.byteLength} bytes)`) + } + return new TextDecoder().decode(buffer) + } + + const reader = response.body.getReader() + const chunks: Uint8Array[] = [] + let totalBytes = 0 + + while (true) { + const { done, value } = await reader.read() + if (done) break + if (!value) continue + + totalBytes += value.byteLength + if (totalBytes > maxBytes) { + await reader.cancel() + throw new Error(`Response exceeded ${maxBytes} bytes`) + } + chunks.push(value) + } + + const body = new Uint8Array(totalBytes) + let offset = 0 + for (const chunk of chunks) { + body.set(chunk, offset) + offset += chunk.byteLength + } + + return new TextDecoder().decode(body) +} + +function decodeHtmlEntities(text: string): string { + const namedEntities: Record = { + amp: '&', + apos: "'", + copy: '(c)', + hellip: '...', + gt: '>', + lt: '<', + mdash: '-', + middot: '*', + nbsp: ' ', + ndash: '-', + quot: '"', + rsquo: "'", + } + + return text.replace(/&(#x?[0-9a-fA-F]+|[a-zA-Z]+);/g, (entity, body) => { + if (body[0] === '#') { + const isHex = body[1]?.toLowerCase() === 'x' + const value = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10) + return Number.isFinite(value) && value >= 0 && value <= 0x10ffff + ? String.fromCodePoint(value) + : entity + } + return namedEntities[body] ?? entity + }) +} + +function normalizeText(text: string): string { + return text + .replace(/\r/g, '') + .replace(/[ \t\f\v]+/g, ' ') + .replace(/ *\n */g, '\n') + .replace(/\n{3,}/g, '\n\n') + .split('\n') + .map((line) => line.trim()) + .filter(Boolean) + .join('\n') + .trim() +} + +function extractFirstMatch(html: string, pattern: RegExp): string | undefined { + const match = html.match(pattern) + if (!match?.[1]) return undefined + return normalizeText(decodeHtmlEntities(stripTags(match[1]))) +} + +function stripTags(html: string): string { + return html.replace(/<[^>]*>/g, ' ') +} + +function removeElement(html: string, tagName: string): string { + return html.replace( + new RegExp(`<${tagName}\\b[^>]*>[\\s\\S]*?<\\/${tagName}>`, 'gi'), + '\n', + ) +} + +function extractElementContents(html: string, tagName: string): string[] { + const matches = html.matchAll( + new RegExp(`<${tagName}\\b[^>]*>([\\s\\S]*?)<\\/${tagName}>`, 'gi'), + ) + return Array.from(matches, (match) => match[1]).filter(Boolean) +} + +function selectReadableHtml(html: string): string { + const articleCandidates = extractElementContents(html, 'article') + if (articleCandidates.length > 0) { + return articleCandidates.reduce((best, candidate) => + stripTags(candidate).length > stripTags(best).length ? candidate : best, + ) + } + + const mainCandidates = extractElementContents(html, 'main') + if (mainCandidates.length > 0) { + return mainCandidates.reduce((best, candidate) => + stripTags(candidate).length > stripTags(best).length ? candidate : best, + ) + } + + return html +} + +function extractMetaContent(html: string, name: string): string | undefined { + const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + const patterns = [ + new RegExp( + `]*(?:name|property)=["']${escapedName}["'])(?=[^>]*content=["']([^"']*)["'])[^>]*>`, + 'i', + ), + new RegExp( + `]*content=["']([^"']*)["'])(?=[^>]*(?:name|property)=["']${escapedName}["'])[^>]*>`, + 'i', + ), + ] + + for (const pattern of patterns) { + const match = html.match(pattern) + if (match?.[1]) return normalizeText(decodeHtmlEntities(match[1])) + } + return undefined +} + +function extractHtml(html: string): { + title?: string + description?: string + text: string +} { + const title = extractFirstMatch(html, /]*>([\s\S]*?)<\/title>/i) + const description = + extractMetaContent(html, 'description') ?? + extractMetaContent(html, 'og:description') + + let readable = html + .replace(//g, '\n') + .replace(/]*>/gi, '\n') + + for (const tagName of [ + 'script', + 'style', + 'svg', + 'canvas', + 'iframe', + 'noscript', + 'nav', + 'header', + 'footer', + 'form', + 'button', + 'select', + ]) { + readable = removeElement(readable, tagName) + } + + readable = selectReadableHtml(readable) + + readable = readable + .replace(//gi, '\n') + .replace( + /<\/(p|div|section|article|main|aside|li|tr|td|th|h[1-6]|blockquote|pre)>/gi, + '\n', + ) + .replace(/<(li|tr|h[1-6])\b[^>]*>/gi, '\n') + .replace(/<[^>]*>/g, '') + + const text = normalizeText(decodeHtmlEntities(readable)) + return { title, description, text } +} + +function extractMarkdownFrontmatter(body: string): { + title?: string + description?: string + text: string +} { + const match = body.match(/^---\s*\r?\n([\s\S]*?)\r?\n---\s*\r?\n?/) + if (!match) { + return { text: normalizeText(decodeHtmlEntities(body)) } + } + + const frontmatter = match[1] + const getValue = (key: 'title' | 'description') => { + const valueMatch = frontmatter.match( + new RegExp(`^${key}:\\s*(?:"([^"]*)"|'([^']*)'|(.+))\\s*$`, 'm'), + ) + return normalizeText( + decodeHtmlEntities( + valueMatch?.[1] ?? valueMatch?.[2] ?? valueMatch?.[3] ?? '', + ), + ) + } + + return { + title: getValue('title') || undefined, + description: getValue('description') || undefined, + text: normalizeText(decodeHtmlEntities(body.slice(match[0].length))), + } +} + +function isJsonContentType(contentType: string): boolean { + return ( + contentType.includes('application/json') || contentType.includes('+json') + ) +} + +function isMarkdownContentType(contentType: string): boolean { + return contentType.includes('text/markdown') +} + +function isSupportedContentType(contentType: string): boolean { + return /^(text\/|application\/(json|[^;\s/]+\+json|xhtml\+xml|xml|rss\+xml|atom\+xml)\b)/i.test( + contentType, + ) +} + +function extractTextByContentType( + contentType: string, + body: string, +): { + title?: string + description?: string + text: string +} { + const lowerContentType = contentType.toLowerCase() + + if ( + lowerContentType.includes('text/html') || + lowerContentType.includes('application/xhtml') + ) { + return extractHtml(body) + } + + if (isJsonContentType(lowerContentType)) { + try { + return { text: JSON.stringify(JSON.parse(body), null, 2) } + } catch { + return { text: normalizeText(body) } + } + } + + if (isMarkdownContentType(lowerContentType)) { + return extractMarkdownFrontmatter(body) + } + + if ( + lowerContentType.startsWith('text/') || + lowerContentType.includes('application/xml') || + lowerContentType.includes('application/rss+xml') || + lowerContentType.includes('application/atom+xml') + ) { + return { text: normalizeText(body) } + } + + return { text: normalizeText(body) } +} + +function truncateText( + text: string, + maxChars: number, +): { + text: string + truncated: boolean +} { + if (text.length <= maxChars) { + return { text, truncated: false } + } + return { + text: `${text.slice(0, maxChars).trimEnd()}\n\n[Content truncated]`, + truncated: true, + } +} + +export async function readUrl({ + url, + max_chars = DEFAULT_MAX_CHARS, + fetch: fetchImpl = globalThis.fetch, +}: { + url: string + max_chars?: number + fetch?: FetchLike +}): Promise { + let parsedUrl: URL + try { + parsedUrl = new URL(url) + } catch { + return errorResult(url, 'Invalid URL') + } + + if (!isAllowedUrl(parsedUrl)) { + return errorResult(url, 'Only http:// and https:// URLs are supported') + } + + const controller = new AbortController() + const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS) + + try { + const response = await fetchImpl(parsedUrl.toString(), { + redirect: 'follow', + signal: controller.signal, + headers: { + accept: + 'text/html,application/xhtml+xml,application/json,text/plain;q=0.9,*/*;q=0.8', + 'accept-language': 'en-US,en;q=0.9', + 'user-agent': USER_AGENT, + }, + }) + + if (!response.ok) { + return errorResult( + url, + `Failed to fetch URL: ${response.status} ${response.statusText}`, + ) + } + + const contentType = getHeader(response.headers, 'content-type') ?? '' + if (contentType && !isSupportedContentType(contentType)) { + return errorResult( + url, + `Unsupported content type: ${contentType || 'unknown'}`, + ) + } + + const body = await readResponseBody(response, MAX_RESPONSE_BYTES) + const extracted = extractTextByContentType(contentType, body) + const truncated = truncateText(extracted.text, max_chars) + + if (!truncated.text) { + return errorResult(url, 'No readable text found at URL') + } + + return [ + { + type: 'json', + value: { + url, + finalUrl: response.url || parsedUrl.toString(), + status: response.status, + ...(contentType ? { contentType } : {}), + ...(extracted.title ? { title: extracted.title } : {}), + ...(extracted.description + ? { description: extracted.description } + : {}), + text: truncated.text, + truncated: truncated.truncated, + }, + }, + ] + } catch (error) { + const isAbort = error instanceof Error && error.name === 'AbortError' + return errorResult( + url, + isAbort + ? `Timed out after ${FETCH_TIMEOUT_MS} ms` + : error instanceof Error + ? error.message + : 'Unknown error', + ) + } finally { + clearTimeout(timeout) + } +}