Skip to content

Commit 2b67206

Browse files
committed
Update styles and add chunking
1 parent 01f5b54 commit 2b67206

File tree

7 files changed

+1512
-77
lines changed

7 files changed

+1512
-77
lines changed
Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
// Copyright 2025 The Chromium Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file.
4+
5+
import type { TestCase } from '../framework/types.js';
6+
import type { HTMLToMarkdownArgs } from '../../tools/HTMLToMarkdownTool.js';
7+
8+
/**
9+
* Test cases for HTMLToMarkdownTool evaluation
10+
*
11+
* These tests validate:
12+
* - HTML-to-Markdown conversion quality
13+
* - Content extraction and filtering
14+
* - Accessibility tree chunking for large pages
15+
* - Chunk boundary handling (new accessibility-tree strategy)
16+
*/
17+
18+
/**
 * Baseline case: a small, stable Wikipedia article that, per its description,
 * is converted without going through the chunking path.
 */
export const simpleArticleTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-simple-001',
  name: 'Extract Simple Article',
  description: 'Extract markdown from a simple, well-structured article page without chunking',
  url: 'https://en.wikipedia.org/wiki/Markdown',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Convert the main article content to clean, well-formatted Markdown',
    reasoning: 'Testing extraction from a stable Wikipedia page with clear structure',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Markdown output is well-formatted and readable',
        'Main article content is preserved completely',
        'Navigation and ads are removed',
        'Heading hierarchy (H1, H2, H3) is maintained',
        'Links are properly formatted as [text](url)',
        'Images are formatted as ![alt](src)',
        'No HTML artifacts or tags remain',
        'Code blocks and formatting are preserved',
      ],
      // Temperature 0 keeps the judge's evaluation deterministic.
      temperature: 0,
    },
  },
  metadata: {
    tags: ['simple', 'wikipedia', 'stable', 'baseline'],
    timeout: 45000,
    retries: 2,
    flaky: false,
  },
};
54+
55+
/**
 * PRIMARY CHUNKING TEST.
 *
 * Targets the Wikipedia "Australia" article (the 100k+ token page that Tyson
 * tested). The judge criteria focus on what can go wrong at chunk boundaries:
 * truncation, duplicated paragraphs, and nodes split in the middle.
 */
export const largeArticleChunkingTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-chunking-001',
  name: 'Extract Large Article with Accessibility Tree Chunking',
  description: 'Extract markdown from Wikipedia Australia page (100k+ tokens) using new accessibility-tree chunking strategy that splits on [nodeId] boundaries',
  url: 'https://en.wikipedia.org/wiki/Australia',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Convert the complete article to Markdown, ensuring all sections are captured without loss at chunk boundaries',
    reasoning: 'Testing new accessibility-tree chunking strategy on confirmed 100k+ token page',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'All major sections are included (Geography, History, Demographics, Culture, Economy, etc.)',
        'No content truncation occurs at chunk boundaries',
        'Chunking boundaries respect [nodeId] patterns without splitting mid-node',
        'Heading hierarchy is consistent across entire output',
        'No duplicate paragraphs from chunk overlaps',
        'Cross-references and internal links between sections are preserved',
        'Final markdown is coherent and reads as a complete article',
        'Section transitions are smooth (no jarring breaks between chunks)',
        'Lists and tables that span multiple nodes are complete',
        'Images and captions are properly associated',
      ],
      temperature: 0,
    },
  },
  metadata: {
    tags: ['large', 'chunking', 'accessibility-tree', 'wikipedia', '100k-tokens'],
    // 90 s budget: chunked processing makes 13+ LLM calls.
    timeout: 90000,
    retries: 2,
    flaky: false,
  },
};
94+
95+
/**
 * Boundary case for the chunking trigger: a page near (just over) the
 * 10k-token threshold, used to check that the decision to chunk is made
 * correctly and does not affect output quality.
 */
export const chunkingThresholdTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-threshold-001',
  name: 'Test Chunking Threshold Detection',
  description: 'Test with page near 10k token threshold to validate chunking trigger logic',
  url: 'https://en.wikipedia.org/wiki/History_of_the_Internet',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the complete article ensuring threshold detection works correctly',
    reasoning: 'Validating that pages just over 10k tokens trigger chunking appropriately',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Complete article content is extracted',
        'All timeline sections are captured',
        'Historical events are in chronological order',
        'Technical details are preserved',
        'Output quality is consistent regardless of chunking decision',
      ],
      temperature: 0,
    },
  },
  metadata: {
    tags: ['threshold', 'chunking', 'wikipedia', 'boundary-test'],
    timeout: 60000,
    retries: 2,
    flaky: false,
  },
};
128+
129+
/**
 * Real-world news front page with ads, sidebars, navigation and dynamic
 * content. Exercises content filtering rather than chunking, and is the only
 * case in this file marked as flaky.
 */
export const complexPageTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-complex-001',
  name: 'Extract Content from Complex Page',
  description: 'Extract main content from page with sidebars, ads, navigation, and complex layout',
  url: 'https://www.theguardian.com/technology',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the main news articles and headlines, filtering out sidebars, ads, and navigation',
    reasoning: 'Testing content filtering on real-world complex page layout',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Main article headlines are extracted correctly',
        'Article summaries/previews are included',
        'Related articles sidebar is filtered out',
        'Advertisement content is completely removed',
        'Navigation menus are excluded',
        'Recommended content sections are filtered',
        'Links to full articles are preserved',
        'Bylines and publication dates are captured',
      ],
      temperature: 0,
    },
  },
  metadata: {
    tags: ['complex', 'real-world', 'filtering', 'dynamic', 'news'],
    timeout: 60000,
    retries: 3,
    // News sites change their content constantly, so results may vary per run.
    flaky: true,
  },
};
165+
166+
/**
 * Technical documentation case (MDN Fetch API page): checks that code blocks,
 * API signatures and parameter descriptions survive the conversion.
 */
export const technicalDocsTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-docs-001',
  name: 'Extract Technical Documentation',
  description: 'Extract documentation with code blocks, API references, and technical content',
  url: 'https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Convert the documentation to Markdown preserving code examples and syntax highlighting',
    reasoning: 'Testing code block preservation and technical content extraction',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Code blocks are properly formatted with triple backticks',
        'Code syntax and indentation is preserved',
        'API method signatures are accurate',
        'Parameter descriptions are complete',
        'Example code is runnable and correct',
        'Technical terminology is preserved',
        'Navigation breadcrumbs are removed',
        'Related API links are included',
      ],
      temperature: 0,
    },
  },
  metadata: {
    tags: ['technical', 'documentation', 'code-blocks', 'mdn'],
    timeout: 45000,
    retries: 2,
    flaky: false,
  },
};
202+
203+
/**
 * Chunking stress test: an extremely large, table-heavy page that, per its
 * description, is meant to produce 20+ chunks. It has the longest timeout of
 * all cases in this file.
 */
export const massiveArticleTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-massive-001',
  name: 'Extract Massive Article (Stress Test)',
  description: 'Extract extremely long article to stress-test chunking with 20+ chunks',
  url: 'https://en.wikipedia.org/wiki/List_of_countries_by_population',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the complete list maintaining table structure and country data',
    reasoning: 'Stress testing chunking system with very large tabular data',
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'All countries in the list are included',
        'Table structure is preserved in markdown format',
        'Population data is accurate and aligned',
        'No countries are duplicated from chunk boundaries',
        'Headers and footers are included once',
        'References and notes section is complete',
      ],
      temperature: 0,
    },
  },
  metadata: {
    tags: ['stress-test', 'massive', 'chunking', 'tables', 'wikipedia'],
    // Two minutes, to leave room for very large content.
    timeout: 120000,
    retries: 2,
    flaky: false,
  },
};
237+
238+
/**
 * Every HTMLToMarkdownTool test case defined in this file, in declaration
 * order. The lookup and filter helpers below all operate on this list.
 */
export const htmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  largeArticleChunkingTest,
  chunkingThresholdTest,
  complexPageTest,
  technicalDocsTest,
  massiveArticleTest,
];
249+
250+
/**
 * Quick validation subset: the simple article and the technical docs case,
 * i.e. the cases that are not aimed at chunking.
 */
export const basicHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  technicalDocsTest,
];
257+
258+
/**
 * Cases that specifically target the chunking path: the large article, the
 * threshold boundary case and the massive stress test.
 */
export const chunkingTests: TestCase<HTMLToMarkdownArgs>[] = [
  largeArticleChunkingTest,
  chunkingThresholdTest,
  massiveArticleTest,
];
266+
267+
/**
 * Comprehensive suite, including the dynamic-content (flaky) news page.
 * Note that it contains every case except `massiveArticleTest`, the stress
 * test.
 */
export const comprehensiveHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  largeArticleChunkingTest,
  chunkingThresholdTest,
  complexPageTest,
  technicalDocsTest,
];
277+
278+
/**
 * Subset of `htmlToMarkdownTests` whose metadata does not mark them as flaky
 * (i.e. cases without dynamic content). Computed once at module load.
 */
export const stableHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] =
  htmlToMarkdownTests.filter(({ metadata }) => !metadata.flaky);
283+
284+
/**
 * Look up a single test case by its `id`.
 *
 * @param id Identifier to match exactly against each test case's `id`.
 * @returns The first matching entry of `htmlToMarkdownTests`, or `undefined`
 *     when no test case has that id.
 */
export function getHtmlToMarkdownTestById(
  id: string
): TestCase<HTMLToMarkdownArgs> | undefined {
  for (const candidate of htmlToMarkdownTests) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
292+
293+
/**
 * Collect all test cases carrying a given tag.
 *
 * @param tag Tag to look for in each test case's `metadata.tags` (exact match).
 * @returns The matching entries of `htmlToMarkdownTests`, in their original
 *     order; an empty array when nothing matches.
 */
export function getHtmlToMarkdownTestsByTag(
  tag: string
): TestCase<HTMLToMarkdownArgs>[] {
  const hasTag = ({ metadata }: TestCase<HTMLToMarkdownArgs>): boolean =>
    metadata.tags.includes(tag);
  return htmlToMarkdownTests.filter(hasTag);
}
303+
304+
/**
 * Collect the chunking-related test cases, identified by tag.
 *
 * @returns Every entry of `htmlToMarkdownTests` tagged with at least one of
 *     'chunking', 'large' or 'massive', in the original order.
 */
export function getChunkingSpecificTests(): TestCase<HTMLToMarkdownArgs>[] {
  // Any one of these tags is enough to qualify a test as chunking-related.
  const chunkingTags = ['chunking', 'large', 'massive'];
  return htmlToMarkdownTests.filter(({ metadata }) =>
    chunkingTags.some(tag => metadata.tags.includes(tag))
  );
}
314+
315+
/**
316+
* Get tests by expected duration (for CI optimization)
317+
*/
318+
export function getTestsByDuration(
319+
maxTimeout: number
320+
): TestCase<HTMLToMarkdownArgs>[] {
321+
return htmlToMarkdownTests.filter(test =>
322+
(test.metadata.timeout || 45000) <= maxTimeout
323+
);
324+
}
325+
326+
/**
 * CommonJS bridge for Node.js consumers (per the original note, the backend
 * evaluation runner imports the test cases this way).
 *
 * When a CommonJS `module` object with a truthy `exports` is present,
 * `module.exports` is replaced by an object exposing every test case, suite
 * and helper declared in this file. In environments without `module` this
 * block does nothing.
 */
if (typeof module !== 'undefined' && module.exports) {
  const commonJsExports = {
    // Individual test cases
    simpleArticleTest,
    largeArticleChunkingTest,
    chunkingThresholdTest,
    complexPageTest,
    technicalDocsTest,
    massiveArticleTest,
    // Suites
    htmlToMarkdownTests,
    basicHtmlToMarkdownTests,
    chunkingTests,
    comprehensiveHtmlToMarkdownTests,
    stableHtmlToMarkdownTests,
    // Lookup and filter helpers
    getHtmlToMarkdownTestById,
    getHtmlToMarkdownTestsByTag,
    getChunkingSpecificTests,
    getTestsByDuration,
  };
  module.exports = commonJsExports;
}

front_end/panels/ai_chat/tools/FetcherTool.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ export interface FetcherToolResult {
4242
* This agent takes a list of URLs, navigates to each one, and extracts
4343
* the main content as markdown. It uses NavigateURLTool for navigation
4444
* and HTMLToMarkdownTool for content extraction.
45+
*
46+
* Content extraction is handled by HTMLToMarkdownTool, which
47+
* automatically chunks large pages for efficient processing.
4548
*/
4649
export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
4750
name = 'fetcher_tool';
48-
description = 'Navigates to URLs, extracts and cleans the main content, returning markdown for each source';
51+
description = 'Navigates to URLs, extracts and cleans the main content, returning markdown for each source.';
4952

5053

5154
schema = {
@@ -124,7 +127,11 @@ export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
124127
/**
125128
* Fetch and extract content from a single URL
126129
*/
127-
private async fetchContentFromUrl(url: string, reasoning: string, ctx?: LLMContext): Promise<FetchedContent> {
130+
private async fetchContentFromUrl(
131+
url: string,
132+
reasoning: string,
133+
ctx?: LLMContext
134+
): Promise<FetchedContent> {
128135
const signal = ctx?.abortSignal;
129136
const throwIfAborted = () => {
130137
if (signal?.aborted) {
@@ -201,7 +208,7 @@ export class FetcherTool implements Tool<FetcherToolArgs, FetcherToolResult> {
201208
};
202209
}
203210

204-
// Return the fetched content
211+
// Return the fetched content (HTMLToMarkdownTool handles chunking)
205212
return {
206213
url: metadata?.url || url,
207214
title: metadata?.title || '',

0 commit comments

Comments
 (0)