|
| 1 | +// Copyright 2025 The Chromium Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style license that can be |
| 3 | +// found in the LICENSE file. |
| 4 | + |
| 5 | +import type { TestCase } from '../framework/types.js'; |
| 6 | +import type { HTMLToMarkdownArgs } from '../../tools/HTMLToMarkdownTool.js'; |
| 7 | + |
| 8 | +/** |
| 9 | + * Test cases for HTMLToMarkdownTool evaluation |
| 10 | + * |
| 11 | + * These tests validate: |
| 12 | + * - HTML-to-Markdown conversion quality |
| 13 | + * - Content extraction and filtering |
| 14 | + * - Accessibility tree chunking for large pages |
| 15 | + * - Chunk boundary handling (new accessibility-tree strategy) |
| 16 | + */ |
| 17 | + |
| 18 | +/** |
| 19 | + * Simple stable page - baseline test without chunking |
| 20 | + */ |
| 21 | +export const simpleArticleTest: TestCase<HTMLToMarkdownArgs> = { |
| 22 | + id: 'html-to-markdown-simple-001', |
| 23 | + name: 'Extract Simple Article', |
| 24 | + description: 'Extract markdown from a simple, well-structured article page without chunking', |
| 25 | + url: 'https://en.wikipedia.org/wiki/Markdown', |
| 26 | + tool: 'html_to_markdown', |
| 27 | + input: { |
| 28 | + instruction: 'Convert the main article content to clean, well-formatted Markdown', |
| 29 | + reasoning: 'Testing extraction from a stable Wikipedia page with clear structure' |
| 30 | + }, |
| 31 | + validation: { |
| 32 | + type: 'llm-judge', |
| 33 | + llmJudge: { |
| 34 | + criteria: [ |
| 35 | + 'Markdown output is well-formatted and readable', |
| 36 | + 'Main article content is preserved completely', |
| 37 | + 'Navigation and ads are removed', |
| 38 | + 'Heading hierarchy (H1, H2, H3) is maintained', |
| 39 | + 'Links are properly formatted as [text](url)', |
| 40 | + 'Images are formatted as ', |
| 41 | + 'No HTML artifacts or tags remain', |
| 42 | + 'Code blocks and formatting are preserved' |
| 43 | + ], |
| 44 | + temperature: 0 // Deterministic evaluation |
| 45 | + } |
| 46 | + }, |
| 47 | + metadata: { |
| 48 | + tags: ['simple', 'wikipedia', 'stable', 'baseline'], |
| 49 | + timeout: 45000, |
| 50 | + retries: 2, |
| 51 | + flaky: false |
| 52 | + } |
| 53 | +}; |
| 54 | + |
/**
 * Large page requiring chunking - PRIMARY CHUNKING TEST.
 *
 * This is the Wikipedia Australia page (100k+ tokens) that Tyson tested.
 * Exercises the accessibility-tree chunking strategy that splits the page on
 * [nodeId] boundaries before conversion.
 */
export const largeArticleChunkingTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-chunking-001',
  name: 'Extract Large Article with Accessibility Tree Chunking',
  description: 'Extract markdown from Wikipedia Australia page (100k+ tokens) using new accessibility-tree chunking strategy that splits on [nodeId] boundaries',
  url: 'https://en.wikipedia.org/wiki/Australia',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Convert the complete article to Markdown, ensuring all sections are captured without loss at chunk boundaries',
    reasoning: 'Testing new accessibility-tree chunking strategy on confirmed 100k+ token page'
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      // Criteria focus on chunk-boundary artifacts: truncation, duplication,
      // and broken structures that span more than one chunk.
      criteria: [
        'All major sections are included (Geography, History, Demographics, Culture, Economy, etc.)',
        'No content truncation occurs at chunk boundaries',
        'Chunking boundaries respect [nodeId] patterns without splitting mid-node',
        'Heading hierarchy is consistent across entire output',
        'No duplicate paragraphs from chunk overlaps',
        'Cross-references and internal links between sections are preserved',
        'Final markdown is coherent and reads as a complete article',
        'Section transitions are smooth (no jarring breaks between chunks)',
        'Lists and tables that span multiple nodes are complete',
        'Images and captions are properly associated'
      ],
      temperature: 0 // Deterministic evaluation
    }
  },
  metadata: {
    tags: ['large', 'chunking', 'accessibility-tree', 'wikipedia', '100k-tokens'],
    timeout: 90000, // 90s for chunked processing (13+ LLM calls)
    retries: 2,
    flaky: false
  }
};
| 94 | + |
/**
 * Test at chunking threshold boundary (near 10k tokens).
 *
 * Validates the trigger logic: output quality should be the same whether or
 * not the page tips over the threshold and gets chunked.
 */
export const chunkingThresholdTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-threshold-001',
  name: 'Test Chunking Threshold Detection',
  description: 'Test with page near 10k token threshold to validate chunking trigger logic',
  url: 'https://en.wikipedia.org/wiki/History_of_the_Internet',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the complete article ensuring threshold detection works correctly',
    reasoning: 'Validating that pages just over 10k tokens trigger chunking appropriately'
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Complete article content is extracted',
        'All timeline sections are captured',
        'Historical events are in chronological order',
        'Technical details are preserved',
        // Key criterion: the chunk/no-chunk decision must be invisible
        // in the final output.
        'Output quality is consistent regardless of chunking decision'
      ],
      temperature: 0 // Deterministic evaluation
    }
  },
  metadata: {
    tags: ['threshold', 'chunking', 'wikipedia', 'boundary-test'],
    timeout: 60000, // ms
    retries: 2,
    flaky: false
  }
};
| 128 | + |
/**
 * Complex real-world page with ads, sidebars, and dynamic content.
 *
 * Marked flaky (with an extra retry) because the target is a live news
 * front page whose content changes between runs.
 */
export const complexPageTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-complex-001',
  name: 'Extract Content from Complex Page',
  description: 'Extract main content from page with sidebars, ads, navigation, and complex layout',
  url: 'https://www.theguardian.com/technology',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the main news articles and headlines, filtering out sidebars, ads, and navigation',
    reasoning: 'Testing content filtering on real-world complex page layout'
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      // Criteria emphasize filtering: what must be kept vs. stripped.
      criteria: [
        'Main article headlines are extracted correctly',
        'Article summaries/previews are included',
        'Related articles sidebar is filtered out',
        'Advertisement content is completely removed',
        'Navigation menus are excluded',
        'Recommended content sections are filtered',
        'Links to full articles are preserved',
        'Bylines and publication dates are captured'
      ],
      temperature: 0 // Deterministic evaluation
    }
  },
  metadata: {
    tags: ['complex', 'real-world', 'filtering', 'dynamic', 'news'],
    timeout: 60000, // ms
    retries: 3, // extra retry relative to stable tests
    flaky: true // News sites have dynamic content
  }
};
| 165 | + |
/**
 * Technical documentation page with code blocks.
 *
 * MDN is used as a stable source of code-heavy content; verifies that code
 * fences, indentation, and API signatures survive conversion.
 */
export const technicalDocsTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-docs-001',
  name: 'Extract Technical Documentation',
  description: 'Extract documentation with code blocks, API references, and technical content',
  url: 'https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Convert the documentation to Markdown preserving code examples and syntax highlighting',
    reasoning: 'Testing code block preservation and technical content extraction'
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'Code blocks are properly formatted with triple backticks',
        'Code syntax and indentation is preserved',
        'API method signatures are accurate',
        'Parameter descriptions are complete',
        'Example code is runnable and correct',
        'Technical terminology is preserved',
        'Navigation breadcrumbs are removed',
        'Related API links are included'
      ],
      temperature: 0 // Deterministic evaluation
    }
  },
  metadata: {
    tags: ['technical', 'documentation', 'code-blocks', 'mdn'],
    timeout: 45000, // ms
    retries: 2,
    flaky: false
  }
};
| 202 | + |
/**
 * Chunking stress test - extremely large page.
 *
 * Large tabular data is the hardest case for chunking: a table row split
 * across chunks can be dropped or duplicated, which the criteria check for.
 */
export const massiveArticleTest: TestCase<HTMLToMarkdownArgs> = {
  id: 'html-to-markdown-massive-001',
  name: 'Extract Massive Article (Stress Test)',
  description: 'Extract extremely long article to stress-test chunking with 20+ chunks',
  url: 'https://en.wikipedia.org/wiki/List_of_countries_by_population',
  tool: 'html_to_markdown',
  input: {
    instruction: 'Extract the complete list maintaining table structure and country data',
    reasoning: 'Stress testing chunking system with very large tabular data'
  },
  validation: {
    type: 'llm-judge',
    llmJudge: {
      criteria: [
        'All countries in the list are included',
        'Table structure is preserved in markdown format',
        'Population data is accurate and aligned',
        'No countries are duplicated from chunk boundaries',
        'Headers and footers are included once',
        'References and notes section is complete'
      ],
      temperature: 0 // Deterministic evaluation
    }
  },
  metadata: {
    tags: ['stress-test', 'massive', 'chunking', 'tables', 'wikipedia'],
    timeout: 120000, // 2 minutes for very large content
    retries: 2,
    flaky: false
  }
};
| 237 | + |
/**
 * All HTMLToMarkdownTool test cases.
 *
 * Master list; the suites and lookup helpers below are all derived from it,
 * so a new test case only needs to be added here (plus any curated suites).
 */
export const htmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  largeArticleChunkingTest,
  chunkingThresholdTest,
  complexPageTest,
  technicalDocsTest,
  massiveArticleTest,
];
| 249 | + |
/**
 * Basic tests for quick validation (no chunking).
 *
 * Both members are tagged 'stable'/non-flaky and avoid the chunking path,
 * making this the cheapest smoke suite.
 */
export const basicHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  technicalDocsTest,
];
| 257 | + |
/**
 * Chunking-specific tests.
 *
 * Covers the primary chunking case, the threshold boundary, and the
 * stress-test page.
 */
export const chunkingTests: TestCase<HTMLToMarkdownArgs>[] = [
  largeArticleChunkingTest,
  chunkingThresholdTest,
  massiveArticleTest,
];
| 266 | + |
/**
 * Comprehensive test suite including dynamic content.
 *
 * Includes the flaky complexPageTest; omits massiveArticleTest (the 2-minute
 * stress test), so it is not the same as htmlToMarkdownTests.
 */
export const comprehensiveHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = [
  simpleArticleTest,
  largeArticleChunkingTest,
  chunkingThresholdTest,
  complexPageTest,
  technicalDocsTest,
];
| 277 | + |
| 278 | +/** |
| 279 | + * Stable tests only (no flaky dynamic content) |
| 280 | + */ |
| 281 | +export const stableHtmlToMarkdownTests: TestCase<HTMLToMarkdownArgs>[] = |
| 282 | + htmlToMarkdownTests.filter(test => !test.metadata.flaky); |
| 283 | + |
| 284 | +/** |
| 285 | + * Get a specific test by ID |
| 286 | + */ |
| 287 | +export function getHtmlToMarkdownTestById( |
| 288 | + id: string |
| 289 | +): TestCase<HTMLToMarkdownArgs> | undefined { |
| 290 | + return htmlToMarkdownTests.find(test => test.id === id); |
| 291 | +} |
| 292 | + |
| 293 | +/** |
| 294 | + * Get tests by tag |
| 295 | + */ |
| 296 | +export function getHtmlToMarkdownTestsByTag( |
| 297 | + tag: string |
| 298 | +): TestCase<HTMLToMarkdownArgs>[] { |
| 299 | + return htmlToMarkdownTests.filter(test => |
| 300 | + test.metadata.tags.includes(tag) |
| 301 | + ); |
| 302 | +} |
| 303 | + |
| 304 | +/** |
| 305 | + * Get only chunking-related tests |
| 306 | + */ |
| 307 | +export function getChunkingSpecificTests(): TestCase<HTMLToMarkdownArgs>[] { |
| 308 | + return htmlToMarkdownTests.filter(test => |
| 309 | + test.metadata.tags.includes('chunking') || |
| 310 | + test.metadata.tags.includes('large') || |
| 311 | + test.metadata.tags.includes('massive') |
| 312 | + ); |
| 313 | +} |
| 314 | + |
| 315 | +/** |
| 316 | + * Get tests by expected duration (for CI optimization) |
| 317 | + */ |
| 318 | +export function getTestsByDuration( |
| 319 | + maxTimeout: number |
| 320 | +): TestCase<HTMLToMarkdownArgs>[] { |
| 321 | + return htmlToMarkdownTests.filter(test => |
| 322 | + (test.metadata.timeout || 45000) <= maxTimeout |
| 323 | + ); |
| 324 | +} |
| 325 | + |
/**
 * CommonJS export for Node.js compatibility.
 * Allows backend evaluation runner to import test cases.
 *
 * NOTE(review): this file is an ES module (it uses `import` above), so under
 * a native ESM runtime `module` is undefined and this branch never executes;
 * it can only take effect if the file is transpiled to CommonJS. In that case
 * a wholesale `module.exports = {...}` assignment replaces whatever export
 * object the transpiler already built (including interop markers) — confirm
 * the backend runner actually consumes this before relying on it, and
 * consider removing it if everything imports via ESM.
 */
if (typeof module !== 'undefined' && module.exports) {
  module.exports = {
    // Individual test cases
    simpleArticleTest,
    largeArticleChunkingTest,
    chunkingThresholdTest,
    complexPageTest,
    technicalDocsTest,
    massiveArticleTest,
    // Curated suites
    htmlToMarkdownTests,
    basicHtmlToMarkdownTests,
    chunkingTests,
    comprehensiveHtmlToMarkdownTests,
    stableHtmlToMarkdownTests,
    // Lookup helpers
    getHtmlToMarkdownTestById,
    getHtmlToMarkdownTestsByTag,
    getChunkingSpecificTests,
    getTestsByDuration,
  };
}
0 commit comments