Skip to content

Commit 10a82aa

Browse files
committed
Add support for readability and more optimization
1 parent 2b67206 commit 10a82aa

File tree

11 files changed

+3182
-104
lines changed

11 files changed

+3182
-104
lines changed

front_end/panels/ai_chat/BUILD.gn

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ devtools_module("ai_chat") {
6969
"ui/ConversationHistoryList.ts",
7070
"ui/conversationHistoryStyles.ts",
7171
"ui/CustomProviderDialog.ts",
72+
"ui/customProviderStyles.ts",
7273
"ai_chat_impl.ts",
7374
"models/ChatTypes.ts",
7475
"persistence/ConversationTypes.ts",
@@ -120,6 +121,7 @@ devtools_module("ai_chat") {
120121
"tools/FinalizeWithCritiqueTool.ts",
121122
"tools/VisitHistoryManager.ts",
122123
"tools/HTMLToMarkdownTool.ts",
124+
"tools/ReadabilityExtractorTool.ts",
123125
"tools/SchemaBasedExtractorTool.ts",
124126
"tools/StreamlinedSchemaExtractorTool.ts",
125127
"tools/CombinedExtractionTool.ts",
@@ -174,6 +176,7 @@ devtools_module("ai_chat") {
174176
"evaluation/test-cases/research-agent-tests.ts",
175177
"evaluation/test-cases/action-agent-tests.ts",
176178
"evaluation/test-cases/web-task-agent-tests.ts",
179+
"evaluation/test-cases/html-to-markdown-tests.ts",
177180
"evaluation/runner/EvaluationRunner.ts",
178181
"evaluation/runner/VisionAgentEvaluationRunner.ts",
179182
"common/MarkdownViewerUtil.ts",
@@ -183,6 +186,8 @@ devtools_module("ai_chat") {
183186
"common/page.ts",
184187
"common/WebSocketRPCClient.ts",
185188
"common/EvaluationConfig.ts",
189+
"utils/ContentChunker.ts",
190+
"vendor/readability-source.ts",
186191
"evaluation/remote/EvaluationProtocol.ts",
187192
"evaluation/remote/EvaluationAgent.ts",
188193
"tracing/TracingProvider.ts",
@@ -319,6 +324,7 @@ _ai_chat_sources = [
319324
"tools/FinalizeWithCritiqueTool.ts",
320325
"tools/VisitHistoryManager.ts",
321326
"tools/HTMLToMarkdownTool.ts",
327+
"tools/ReadabilityExtractorTool.ts",
322328
"tools/SchemaBasedExtractorTool.ts",
323329
"tools/StreamlinedSchemaExtractorTool.ts",
324330
"tools/CombinedExtractionTool.ts",
@@ -373,6 +379,7 @@ _ai_chat_sources = [
373379
"evaluation/test-cases/research-agent-tests.ts",
374380
"evaluation/test-cases/action-agent-tests.ts",
375381
"evaluation/test-cases/web-task-agent-tests.ts",
382+
"evaluation/test-cases/html-to-markdown-tests.ts",
376383
"evaluation/runner/EvaluationRunner.ts",
377384
"evaluation/runner/VisionAgentEvaluationRunner.ts",
378385
"common/MarkdownViewerUtil.ts",
@@ -382,6 +389,8 @@ _ai_chat_sources = [
382389
"common/page.ts",
383390
"common/WebSocketRPCClient.ts",
384391
"common/EvaluationConfig.ts",
392+
"utils/ContentChunker.ts",
393+
"vendor/readability-source.ts",
385394
"evaluation/remote/EvaluationProtocol.ts",
386395
"evaluation/remote/EvaluationAgent.ts",
387396
"tracing/TracingProvider.ts",

front_end/panels/ai_chat/agent_framework/implementation/ConfiguredAgents.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { NavigateURLTool, PerformActionTool, GetAccessibilityTreeTool, SearchCon
1212
import { UpdateTodoTool } from '../../tools/UpdateTodoTool.js';
1313
import { ExecuteCodeTool } from '../../tools/ExecuteCodeTool.js';
1414
import { HTMLToMarkdownTool } from '../../tools/HTMLToMarkdownTool.js';
15+
import { ReadabilityExtractorTool } from '../../tools/ReadabilityExtractorTool.js';
1516
import { ConfigurableAgentTool, ToolRegistry } from '../ConfigurableAgentTool.js';
1617
import { ThinkingTool } from '../../tools/ThinkingTool.js';
1718
import { registerMCPMetaTools } from '../../mcp/MCPMetaTools.js';
@@ -48,6 +49,7 @@ export function initializeConfiguredAgents(): void {
4849
ToolRegistry.registerToolFactory('search_content', () => new SearchContentTool());
4950
ToolRegistry.registerToolFactory('take_screenshot', () => new TakeScreenshotTool());
5051
ToolRegistry.registerToolFactory('html_to_markdown', () => new HTMLToMarkdownTool());
52+
ToolRegistry.registerToolFactory('readability_extractor', () => new ReadabilityExtractorTool());
5153
ToolRegistry.registerToolFactory('scroll_page', () => new ScrollPageTool());
5254
ToolRegistry.registerToolFactory('wait_for_page_load', () => new WaitTool());
5355
ToolRegistry.registerToolFactory('thinking', () => new ThinkingTool());

front_end/panels/ai_chat/agent_framework/implementation/agents/ResearchAgent.ts

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export function createResearchAgentConfig(): AgentToolConfig {
4545
## Key Tools
4646
- **navigate_url + fetcher_tool**: Primary research loop
4747
- **extract_data**: Structured data extraction with JSON schema
48-
- **html_to_markdown**: Clean page text extraction
48+
- **readability_extractor**: Fast plain text extraction
4949
- **create_file/update_file/read_file/list_files**: Persist and track findings across iterations
5050
5151
## Quality Standards
@@ -97,7 +97,7 @@ Example for "AI trends in 2025": ai-trends-2025_research.md, ai-trends-2025_sour
9797
'fetcher_tool',
9898
'extract_data',
9999
'node_ids_to_urls',
100-
'html_to_markdown',
100+
'readability_extractor',
101101
'create_file',
102102
'update_file',
103103
'read_file',
@@ -173,8 +173,8 @@ ${args.scope ? `The scope of research expected: ${args.scope}` : ''}
173173
// Only save successful fetches with content
174174
if (source.success && source.markdownContent && source.markdownContent.trim().length > 0) {
175175
try {
176-
// Create a sanitized filename from the URL
177-
const filename = sanitizeUrlToFilename(source.url);
176+
// Create a sanitized filename from the URL and title
177+
const filename = sanitizeUrlToFilename(source.url, source.title);
178178

179179
// Create file content with metadata header
180180
const fileContent = `# ${source.title || 'Untitled'}
@@ -232,31 +232,46 @@ ${source.markdownContent}`;
232232
}
233233

234234
/**
235-
* Sanitize a URL to create a safe filename
235+
* Sanitize a URL and optional title to create a safe filename
236+
* Prefers title-based names for readability, falls back to URL-based names
236237
*/
237-
function sanitizeUrlToFilename(url: string): string {
238+
function sanitizeUrlToFilename(url: string, title?: string): string {
238239
try {
239-
const urlObj = new URL(url);
240-
241-
// Extract domain and path
242-
let domain = urlObj.hostname.replace(/^www\./, '');
243-
let path = urlObj.pathname.replace(/^\//, '').replace(/\/$/, '');
244-
245-
// Create a base name from domain and path
246-
let baseName = domain;
247-
if (path) {
248-
// Take first 2 path segments for readability
249-
const pathParts = path.split('/').filter(p => p.length > 0);
250-
if (pathParts.length > 0) {
251-
baseName += '-' + pathParts.slice(0, 2).join('-');
252-
}
240+
let baseName = '';
241+
242+
// Prefer title if available
243+
if (title && title.trim()) {
244+
baseName = title
245+
.trim()
246+
.toLowerCase()
247+
.replace(/[^a-zA-Z0-9\s-]/g, '') // Remove special characters
248+
.replace(/\s+/g, '-') // Convert spaces to dashes
249+
.replace(/-+/g, '-') // Collapse multiple dashes
250+
.replace(/^-|-$/g, '') // Remove leading/trailing dashes
251+
.substring(0, 60); // Limit length for readability
253252
}
254253

255-
// Remove special characters and limit length
256-
baseName = baseName
257-
.replace(/[^a-zA-Z0-9-_]/g, '-')
258-
.replace(/-+/g, '-')
259-
.substring(0, 80);
254+
// Fallback to URL-based name if no title or title is empty after sanitization
255+
if (!baseName) {
256+
const urlObj = new URL(url);
257+
let domain = urlObj.hostname.replace(/^www\./, '');
258+
let path = urlObj.pathname.replace(/^\//, '').replace(/\/$/, '');
259+
260+
baseName = domain;
261+
if (path) {
262+
// Take first 2 path segments for readability
263+
const pathParts = path.split('/').filter(p => p.length > 0);
264+
if (pathParts.length > 0) {
265+
baseName += '-' + pathParts.slice(0, 2).join('-');
266+
}
267+
}
268+
269+
// Remove special characters and limit length
270+
baseName = baseName
271+
.replace(/[^a-zA-Z0-9-_]/g, '-')
272+
.replace(/-+/g, '-')
273+
.substring(0, 60);
274+
}
260275

261276
// Add a short hash of the full URL to prevent collisions
262277
const hash = simpleHash(url).substring(0, 8);

front_end/panels/ai_chat/agent_framework/implementation/agents/SearchAgent.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ export function createSearchAgentConfig(): AgentToolConfig {
3232
3. **Collect leads**:
3333
- Use navigate_url to reach the most relevant search entry point (search engines, directories, LinkedIn public results, company pages, press releases).
3434
- Use extract_data with an explicit JSON schema every time you capture structured search results. Prefer capturing multiple leads in one call.
35-
- Batch follow-up pages with fetcher_tool, and use html_to_markdown when you need to confirm context inside long documents.
35+
- Batch follow-up pages with fetcher_tool, and use readability_extractor when you need to confirm context inside long documents.
3636
- After each significant batch of new leads or fetcher_tool response, immediately persist the harvested candidates (including query, timestamp, and confidence notes) by appending to a coordination file via 'create_file'/'update_file'. This keeps other subtasks aligned and prevents redundant scraping.
3737
4. **Mandatory Pagination Loop (ENFORCED)**:
3838
- Harvest target per task: collect 30–50 unique candidates before enrichment (unless the user specifies otherwise). Absolute minimum 25 when the request requires it.
@@ -58,7 +58,7 @@ export function createSearchAgentConfig(): AgentToolConfig {
5858
"name": "extract_data",
5959
"arguments": "{\"instruction\":\"From the currently loaded Google News results page for query 'OpenAI September 2025 news', extract the top 15 news items visible in the search results. For each item extract: title (string), snippet (string), url (string, format:url), source (string), and publishDate (string). Return a JSON object with property 'results' which is an array of these items.\",\"reasoning\":\"Collect structured list of recent news articles about OpenAI in September 2025 so we can batch-fetch the full content for comprehensive research.\",\"schema\":{\"type\":\"object\",\"properties\":{\"results\":{\"type\":\"array\",\"items\":{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"},\"snippet\":{\"type\":\"string\"},\"url\":{\"type\":\"string\",\"format\":\"url\"},\"source\":{\"type\":\"string\"},\"publishDate\":{\"type\":\"string\"}},\"required\":[\"title\",\"url\",\"source\"]}}},\"required\":[\"results\"]}}"
6060
})
61-
- Use html_to_markdown when you need high-quality page text in addition to (not instead of) structured extractions.
61+
- Use readability_extractor when you need fast plain text extraction in addition to (not instead of) structured extractions.
6262
- Never call extract_data or fetcher_tool without a clear plan for how the results will fill gaps in the objective.
6363
- Before starting new queries, call 'list_files'/'read_file' to review previous batches and avoid duplicating work; always append incremental findings to the existing coordination file for the current objective.
6464
@@ -132,7 +132,7 @@ If you absolutely cannot find any reliable leads, return status "failed" with ga
132132
'extract_data',
133133
'scroll_page',
134134
'action_agent',
135-
'html_to_markdown',
135+
'readability_extractor',
136136
'create_file',
137137
'update_file',
138138
'delete_file',
@@ -273,7 +273,7 @@ If you absolutely cannot find any reliable leads, return status "failed" with ga
273273
],
274274
next_actions: [
275275
'Continue pagination on current queries (Next/numeric page or query params).',
276-
'Batch fetcher_tool on shortlisted URLs; use html_to_markdown + document_search to extract location, availability, portfolio, and contact.',
276+
'Batch fetcher_tool on shortlisted URLs; use readability_extractor + document_search to extract location, availability, portfolio, and contact.',
277277
'Deduplicate by normalized name + hostname and canonical URL.'
278278
]
279279
};

front_end/panels/ai_chat/evaluation/test-cases/html-to-markdown-tests.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,9 @@ export function getTestsByDuration(
327327
* CommonJS export for Node.js compatibility
328328
* Allows backend evaluation runner to import test cases
329329
*/
330+
// @ts-ignore - module is not defined in browser context
330331
if (typeof module !== 'undefined' && module.exports) {
332+
// @ts-ignore
331333
module.exports = {
332334
simpleArticleTest,
333335
largeArticleChunkingTest,

front_end/panels/ai_chat/tools/FetcherTool.ts

Lines changed: 17 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// found in the LICENSE file.
44

55
import { createLogger } from '../core/Logger.js';
6-
import { HTMLToMarkdownTool, type HTMLToMarkdownResult } from './HTMLToMarkdownTool.js';
6+
import { ReadabilityExtractorTool, type ReadabilityExtractorResult } from './ReadabilityExtractorTool.js';
77
import { NavigateURLTool, type Tool, type LLMContext } from './Tools.js';
88

99
const logger = createLogger('Tool:Fetcher');
@@ -14,7 +14,7 @@ const logger = createLogger('Tool:Fetcher');
1414
export interface FetchedContent {
1515
url: string;
1616
title: string;
17-
markdownContent: string;
17+
markdownContent: string; // Plain text content; the field keeps the name markdownContent for backwards compatibility
1818
success: boolean;
1919
error?: string;
2020
}
@@ -40,15 +40,15 @@ export interface FetcherToolResult {
4040
* Agent that fetches and extracts content from URLs
4141
*
4242
* This agent takes a list of URLs, navigates to each one, and extracts
43-
* the main content as markdown. It uses NavigateURLTool for navigation
44-
* and HTMLToMarkdownTool for content extraction.
43+
* the main content as plain text. It uses NavigateURLTool for navigation
44+
* and ReadabilityExtractorTool for fast content extraction.
4545
*
46-
* Content extraction is handled by HTMLToMarkdownTool, which
47-
* automatically chunks large pages for efficient processing.
46+
* Content extraction is handled by ReadabilityExtractorTool, which uses
47+
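* Mozilla Readability for deterministic extraction without LLM calls on the primary path (the Step 2 comments further down describe an LLM fallback, for which ctx is passed).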
4848
*/
4949
export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
5050
name = 'fetcher_tool';
51-
description = 'Navigates to URLs, extracts and cleans the main content, returning markdown for each source.';
51+
description = 'Navigates to URLs, extracts and cleans the main content, returning plain text for each source';
5252

5353

5454
schema = {
@@ -70,7 +70,7 @@ export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
7070
};
7171

7272
private navigateURLTool = new NavigateURLTool();
73-
private htmlToMarkdownTool = new HTMLToMarkdownTool();
73+
private readabilityExtractorTool = new ReadabilityExtractorTool();
7474

7575
/**
7676
* Execute the fetcher agent to process multiple URLs
@@ -138,29 +138,6 @@ export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
138138
throw new DOMException('The operation was aborted', 'AbortError');
139139
}
140140
};
141-
const sleep = (ms: number) => new Promise<void>((resolve, reject) => {
142-
if (!ms) return resolve();
143-
const timer = setTimeout(() => {
144-
cleanup();
145-
resolve();
146-
}, ms);
147-
const onAbort = () => {
148-
clearTimeout(timer);
149-
cleanup();
150-
reject(new DOMException('The operation was aborted', 'AbortError'));
151-
};
152-
const cleanup = () => {
153-
signal?.removeEventListener('abort', onAbort);
154-
};
155-
if (signal) {
156-
if (signal.aborted) {
157-
clearTimeout(timer);
158-
cleanup();
159-
return reject(new DOMException('The operation was aborted', 'AbortError'));
160-
}
161-
signal.addEventListener('abort', onAbort, { once: true });
162-
}
163-
});
164141
try {
165142
// Step 1: Navigate to the URL
166143
logger.info('Navigating to URL', { url });
@@ -182,37 +159,34 @@ export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
182159
};
183160
}
184161

185-
// Wait for 1 second to ensure the page has time to load
186-
await sleep(1000);
187-
throwIfAborted();
188-
189162
// Get metadata from navigation result
190163
const metadata = navigationResult.metadata ? navigationResult.metadata : { url: '', title: '' };
191164

192-
// Step 2: Extract markdown content using HTMLToMarkdownTool
165+
// Step 2: Extract content using ReadabilityExtractorTool (with automatic LLM fallback)
193166
logger.info('Extracting content from URL', { url });
194167
throwIfAborted();
195-
const extractionResult = await this.htmlToMarkdownTool.execute({
196-
instruction: 'Extract the main content focusing on article text, headings, and important information. Remove ads, navigation, and distracting elements.',
168+
169+
// Always pass ctx for LLM fallback capability
170+
const extractionResult = await this.readabilityExtractorTool.execute({
197171
reasoning
198172
}, ctx);
199173

200174
// Check for extraction errors
201-
if (!extractionResult.success || !extractionResult.markdownContent) {
175+
if (!extractionResult.success || !extractionResult.textContent) {
202176
return {
203177
url,
204-
title: metadata?.title || '',
178+
title: metadata?.title || extractionResult.title || '',
205179
markdownContent: '',
206180
success: false,
207181
error: extractionResult.error || 'Failed to extract content'
208182
};
209183
}
210184

211-
// Return the fetched content (HTMLToMarkdownTool handles chunking)
185+
// Return the fetched content (plain text from Readability)
212186
return {
213187
url: metadata?.url || url,
214-
title: metadata?.title || '',
215-
markdownContent: extractionResult.markdownContent,
188+
title: extractionResult.title || metadata?.title || '',
189+
markdownContent: extractionResult.textContent, // Plain text content
216190
success: true
217191
};
218192
} catch (error: any) {

front_end/panels/ai_chat/tools/HTMLToMarkdownTool.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ export interface HTMLToMarkdownArgs {
3636
*/
3737
export class HTMLToMarkdownTool implements Tool<HTMLToMarkdownArgs, HTMLToMarkdownResult> {
3838
// Chunking configuration
39-
private readonly TOKEN_LIMIT_FOR_CHUNKING = 10000; // Auto-chunk if tree exceeds this (40k chars)
40-
private readonly CHUNK_TOKEN_LIMIT = 8000; // Max tokens per chunk (32k chars)
39+
private readonly TOKEN_LIMIT_FOR_CHUNKING = 65000; // Auto-chunk if tree exceeds this (~260k chars)
40+
private readonly CHUNK_TOKEN_LIMIT = 40000; // Max tokens per chunk (~160k chars)
4141
private readonly CHARS_PER_TOKEN = 4; // Conservative estimate
4242

4343
private contentChunker = new ContentChunker();

0 commit comments

Comments
 (0)