From eedd127bf4f27f34f686127fdd26d4ade7499e55 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 17 Apr 2026 13:28:59 +0000 Subject: [PATCH 1/3] feat(core): auto-discover test cases from directory structure (#1141) When `tests:` points to a directory, scan subdirectories for `case.yaml` files. Directory name becomes the test `id` unless overridden. A `workspace/` subdirectory auto-sets the workspace template. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../docs/docs/evaluation/eval-files.mdx | 42 +++- .../showcase/directory-discovery/EVAL.yaml | 4 + .../cases/add-greeting/case.yaml | 4 + .../cases/fix-null-check/case.yaml | 5 + .../evaluation/loaders/case-file-loader.ts | 82 ++++++- packages/core/src/evaluation/yaml-parser.ts | 24 +- .../loaders/case-file-loader.test.ts | 228 ++++++++++++++++++ 7 files changed, 382 insertions(+), 7 deletions(-) create mode 100644 examples/showcase/directory-discovery/EVAL.yaml create mode 100644 examples/showcase/directory-discovery/cases/add-greeting/case.yaml create mode 100644 examples/showcase/directory-discovery/cases/fix-null-check/case.yaml diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 9b68cfaf1..dac72a078 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -40,7 +40,7 @@ tests: | `suite` | Optional suite identifier | | `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) 
| | `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config) | -| `tests` | Array of individual tests, or a string path to an external file | +| `tests` | Array of individual tests, or a string path to an external file or directory | | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test | | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test | @@ -178,6 +178,46 @@ tests: ./cases.yaml The path is resolved relative to the eval file's directory. The external file should contain a YAML array of test objects or a JSONL file with one test per line. +### Tests as Directory Path + +When `tests` points to a directory, AgentV auto-discovers test cases from subdirectories. Each subdirectory containing a `case.yaml` (or `case.yml`) becomes a test case: + +``` +my-eval/ + EVAL.yaml + cases/ + fix-null-check/ + case.yaml + add-greeting/ + case.yaml + workspace/ # optional per-case workspace template + setup-files... 
+``` +```yaml +# EVAL.yaml +name: my-benchmark +tests: ./cases/ +``` + +Each `case.yaml` is a single YAML object (not an array) with the same fields as an inline test: + +```yaml +# cases/fix-null-check/case.yaml +criteria: Fixes the null reference bug in the parser module +input: Fix the null check bug in parser.ts +``` + +**Behavior:** + +- **Directory name as `id`:** If `case.yaml` doesn't specify an `id`, the directory name is used (e.g., `fix-null-check`) +- **Alphabetical ordering:** Subdirectories are sorted alphabetically for deterministic order +- **Per-case workspace:** A `workspace/` subdirectory inside the case directory automatically sets `workspace.template` to that path, unless the case already defines a `workspace` field +- **Skipped directories:** Subdirectories without a `case.yaml` (or `case.yml`) file are skipped with a warning +- **Suite-level config applies:** Suite-level `assertions`, `input`, `workspace`, and `execution` still apply to directory-discovered cases + +This pattern is useful for benchmarks with many cases, where each case benefits from its own directory for workspace templates, supporting files, or documentation. + ## Environment Variable Interpolation All string fields in eval files support `${{ VAR }}` syntax for environment variable interpolation. This enables portable eval configs that work across machines and CI environments without hardcoded paths. 
diff --git a/examples/showcase/directory-discovery/EVAL.yaml b/examples/showcase/directory-discovery/EVAL.yaml new file mode 100644 index 000000000..193b8a2dd --- /dev/null +++ b/examples/showcase/directory-discovery/EVAL.yaml @@ -0,0 +1,4 @@ +name: directory-discovery +description: Demonstrates auto-discovering test cases from a directory structure + +tests: ./cases/ diff --git a/examples/showcase/directory-discovery/cases/add-greeting/case.yaml b/examples/showcase/directory-discovery/cases/add-greeting/case.yaml new file mode 100644 index 000000000..95c57b6b3 --- /dev/null +++ b/examples/showcase/directory-discovery/cases/add-greeting/case.yaml @@ -0,0 +1,4 @@ +criteria: Adds a greeting message that displays the user's name +input: | + Add a greeting feature to the homepage. When a user logs in, + display "Welcome back, {name}!" at the top of the page. diff --git a/examples/showcase/directory-discovery/cases/fix-null-check/case.yaml b/examples/showcase/directory-discovery/cases/fix-null-check/case.yaml new file mode 100644 index 000000000..e57f874d7 --- /dev/null +++ b/examples/showcase/directory-discovery/cases/fix-null-check/case.yaml @@ -0,0 +1,5 @@ +criteria: Identifies and fixes the null reference bug in the parser module +input: | + Fix the null check bug in parser.ts. The function `parseToken` crashes + when given an empty string because it doesn't check for null before + accessing `.length`. 
diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts index e40b92625..7beebd09f 100644 --- a/packages/core/src/evaluation/loaders/case-file-loader.ts +++ b/packages/core/src/evaluation/loaders/case-file-loader.ts @@ -1,4 +1,4 @@ -import { readFile } from 'node:fs/promises'; +import { readFile, readdir, stat } from 'node:fs/promises'; import path from 'node:path'; import fg from 'fast-glob'; import { parse as parseYaml } from 'yaml'; @@ -158,6 +158,86 @@ export async function resolveFileReference( return loadCasesFromFile(absolutePattern); } +/** + * Load test cases from a directory structure. + * Scans immediate subdirectories for case.yaml/case.yml files. + * Each subdirectory becomes a test case, with the directory name used as `id` + * if the case file doesn't specify one. A `workspace/` subdirectory in the + * case directory sets the workspace template automatically. + */ +export async function loadCasesFromDirectory(dirPath: string): Promise { + const entries = await readdir(dirPath, { withFileTypes: true }); + const subdirs = entries.filter((e) => e.isDirectory()).sort((a, b) => a.name.localeCompare(b.name)); + + const results: JsonObject[] = []; + for (const subdir of subdirs) { + const subdirPath = path.join(dirPath, subdir.name); + + // Look for case.yaml or case.yml + let caseFilePath: string | undefined; + for (const filename of ['case.yaml', 'case.yml']) { + const candidate = path.join(subdirPath, filename); + try { + const s = await stat(candidate); + if (s.isFile()) { + caseFilePath = candidate; + break; + } + } catch { + // File doesn't exist, try next + } + } + + if (!caseFilePath) { + console.warn( + `${ANSI_YELLOW}Warning: Skipping directory '${subdir.name}' — no case.yaml found${ANSI_RESET}`, + ); + continue; + } + + // Parse case.yaml as a single object (not array) + let content: string; + try { + content = await readFile(caseFilePath, 'utf8'); + } catch (error) { + 
const message = error instanceof Error ? error.message : String(error); + throw new Error(`Cannot read case file: ${caseFilePath}\n ${message}`); + } + + const raw = parseYaml(content) as unknown; + const parsed = interpolateEnv(raw, process.env); + if (!isJsonObject(parsed)) { + throw new Error( + `Case file must contain a YAML object, got ${typeof parsed}: ${caseFilePath}`, + ); + } + + const caseObj = { ...parsed }; + + // Inject id from directory name if not specified + if (!caseObj.id) { + caseObj.id = subdir.name; + } + + // Check for workspace/ subdirectory + if (!caseObj.workspace) { + const workspaceDirPath = path.join(subdirPath, 'workspace'); + try { + const s = await stat(workspaceDirPath); + if (s.isDirectory()) { + caseObj.workspace = { template: workspaceDirPath }; + } + } catch { + // No workspace directory, that's fine + } + } + + results.push(caseObj); + } + + return results; +} + /** * Process a tests array, expanding any file:// references into inline test objects. * Returns a flat array of JsonValue where all file:// strings are replaced diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 4d8cf057d..88ae9bc47 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -1,4 +1,4 @@ -import { readFile } from 'node:fs/promises'; +import { readFile, stat } from 'node:fs/promises'; import path from 'node:path'; import micromatch from 'micromatch'; import { parse } from 'yaml'; @@ -6,7 +6,11 @@ import { parse } from 'yaml'; import { collectResolvedInputFilePaths } from './input-message-utils.js'; import { interpolateEnv } from './interpolation.js'; import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js'; -import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js'; +import { + expandFileReferences, + loadCasesFromDirectory, + loadCasesFromFile, +} from './loaders/case-file-loader.js'; import { 
extractBudgetUsd, extractCacheConfig, @@ -332,12 +336,22 @@ async function loadTestsFromYaml( // Parse suite-level workspace config (default for all cases) const evalFileDir = path.dirname(absoluteTestPath); - // Resolve tests: string path to external file, inline array, or error + // Resolve tests: string path to external file/directory, inline array, or error let expandedTestCases: readonly JsonValue[]; if (typeof rawTestCases === 'string') { - // String path: load tests from external file (YAML, JSONL) const externalPath = path.resolve(evalFileDir, rawTestCases); - expandedTestCases = await loadCasesFromFile(externalPath); + let isDir = false; + try { + const pathStat = await stat(externalPath); + isDir = pathStat.isDirectory(); + } catch { + // Path doesn't exist — fall through to loadCasesFromFile for its error message + } + if (isDir) { + expandedTestCases = await loadCasesFromDirectory(externalPath); + } else { + expandedTestCases = await loadCasesFromFile(externalPath); + } } else if (Array.isArray(rawTestCases)) { // Inline array: expand any file:// references expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir); diff --git a/packages/core/test/evaluation/loaders/case-file-loader.test.ts b/packages/core/test/evaluation/loaders/case-file-loader.test.ts index a51e65e99..d580745f5 100644 --- a/packages/core/test/evaluation/loaders/case-file-loader.test.ts +++ b/packages/core/test/evaluation/loaders/case-file-loader.test.ts @@ -6,6 +6,7 @@ import path from 'node:path'; import { expandFileReferences, isFileReference, + loadCasesFromDirectory, resolveFileReference, } from '../../../src/evaluation/loaders/case-file-loader.js'; import { loadTestSuite, loadTests } from '../../../src/evaluation/yaml-parser.js'; @@ -442,3 +443,230 @@ tests: bare-cases.yaml expect(tests[0].id).toBe('bare-path-test'); }); }); + +describe('loadCasesFromDirectory', () => { + let tempDir: string; + + beforeAll(async () => { + tempDir = path.join(os.tmpdir(), 
`agentv-dir-discovery-${Date.now()}`); + await mkdir(tempDir, { recursive: true }); + }); + + afterAll(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('discovers cases from subdirectories with case.yaml', async () => { + const casesDir = path.join(tempDir, 'happy-path'); + await mkdir(path.join(casesDir, 'fix-bug'), { recursive: true }); + await mkdir(path.join(casesDir, 'add-feature'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'fix-bug', 'case.yaml'), + `criteria: "Fixes the null check bug" +input: "Fix the null check" +`, + ); + await writeFile( + path.join(casesDir, 'add-feature', 'case.yaml'), + `criteria: "Adds greeting feature" +input: "Add a greeting" +`, + ); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(2); + // Alphabetical order: add-feature before fix-bug + expect(cases[0].id).toBe('add-feature'); + expect(cases[0].criteria).toBe('Adds greeting feature'); + expect(cases[1].id).toBe('fix-bug'); + expect(cases[1].criteria).toBe('Fixes the null check bug'); + }); + + it('uses directory name as id when not specified in case.yaml', async () => { + const casesDir = path.join(tempDir, 'no-id'); + await mkdir(path.join(casesDir, 'my-case'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'my-case', 'case.yaml'), + `criteria: "Some goal" +input: "Do something" +`, + ); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('my-case'); + }); + + it('id in case.yaml takes precedence over directory name', async () => { + const casesDir = path.join(tempDir, 'explicit-id'); + await mkdir(path.join(casesDir, 'dir-name'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'dir-name', 'case.yaml'), + `id: custom-id +criteria: "Some goal" +input: "Do something" +`, + ); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + 
expect(cases[0].id).toBe('custom-id'); + }); + + it('skips subdirectories without case.yaml with warning', async () => { + const casesDir = path.join(tempDir, 'skip-warning'); + await mkdir(path.join(casesDir, 'has-case'), { recursive: true }); + await mkdir(path.join(casesDir, 'no-case'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'has-case', 'case.yaml'), + `criteria: "Goal" +input: "Input" +`, + ); + // no-case directory has no case.yaml + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('has-case'); + }); + + it('returns cases in alphabetical order', async () => { + const casesDir = path.join(tempDir, 'alpha-order'); + for (const name of ['charlie', 'alpha', 'bravo']) { + await mkdir(path.join(casesDir, name), { recursive: true }); + await writeFile( + path.join(casesDir, name, 'case.yaml'), + `criteria: "${name}" +input: "${name}" +`, + ); + } + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases.map((c) => c.id)).toEqual(['alpha', 'bravo', 'charlie']); + }); + + it('sets workspace template from workspace/ subdirectory', async () => { + const casesDir = path.join(tempDir, 'workspace-dir'); + await mkdir(path.join(casesDir, 'my-case', 'workspace'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'my-case', 'case.yaml'), + `criteria: "Goal" +input: "Input" +`, + ); + await writeFile(path.join(casesDir, 'my-case', 'workspace', 'file.txt'), 'content'); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + const ws = cases[0].workspace as { template: string }; + expect(ws.template).toBe(path.join(casesDir, 'my-case', 'workspace')); + }); + + it('does not override explicit workspace in case.yaml', async () => { + const casesDir = path.join(tempDir, 'ws-explicit'); + await mkdir(path.join(casesDir, 'my-case', 'workspace'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'my-case', 
'case.yaml'), + `criteria: "Goal" +input: "Input" +workspace: + template: /custom/path +`, + ); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + const ws = cases[0].workspace as { template: string }; + expect(ws.template).toBe('/custom/path'); + }); + + it('returns empty array for empty directory', async () => { + const casesDir = path.join(tempDir, 'empty-dir'); + await mkdir(casesDir, { recursive: true }); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(0); + }); + + it('supports case.yml extension', async () => { + const casesDir = path.join(tempDir, 'yml-ext'); + await mkdir(path.join(casesDir, 'my-case'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'my-case', 'case.yml'), + `criteria: "YML goal" +input: "YML input" +`, + ); + + const cases = await loadCasesFromDirectory(casesDir); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('my-case'); + expect(cases[0].criteria).toBe('YML goal'); + }); +}); + +describe('tests as directory path (integration)', () => { + let tempDir: string; + + beforeAll(async () => { + tempDir = path.join(os.tmpdir(), `agentv-dir-integration-${Date.now()}`); + await mkdir(tempDir, { recursive: true }); + }); + + afterAll(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('loads tests from directory via tests: string path in eval YAML', async () => { + // Create directory structure + const casesDir = path.join(tempDir, 'cases'); + await mkdir(path.join(casesDir, 'fix-null-check'), { recursive: true }); + await mkdir(path.join(casesDir, 'add-greeting'), { recursive: true }); + + await writeFile( + path.join(casesDir, 'fix-null-check', 'case.yaml'), + `criteria: "Fixes the null check bug" +input: "Fix the null check in parser.ts" +`, + ); + await writeFile( + path.join(casesDir, 'add-greeting', 'case.yaml'), + `criteria: "Adds a greeting message" +input: "Add a greeting to the homepage" +`, 
+ ); + + // Create eval YAML pointing to the directory + await writeFile( + path.join(tempDir, 'suite.eval.yaml'), + `name: dir-discovery-suite +description: Tests loaded from directory +tests: ./cases/ +`, + ); + + const result = await loadTestSuite(path.join(tempDir, 'suite.eval.yaml'), tempDir); + + expect(result.tests).toHaveLength(2); + expect(result.tests[0].id).toBe('add-greeting'); + expect(result.tests[1].id).toBe('fix-null-check'); + expect(result.metadata?.name).toBe('dir-discovery-suite'); + }); +}); From 208cfe6dfd69c55fd2480f0fc32cd736d05bc494 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 17 Apr 2026 13:30:35 +0000 Subject: [PATCH 2/3] style: fix biome formatting for chained method calls Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/loaders/case-file-loader.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts index 7beebd09f..711500dd3 100644 --- a/packages/core/src/evaluation/loaders/case-file-loader.ts +++ b/packages/core/src/evaluation/loaders/case-file-loader.ts @@ -167,7 +167,9 @@ export async function resolveFileReference( */ export async function loadCasesFromDirectory(dirPath: string): Promise { const entries = await readdir(dirPath, { withFileTypes: true }); - const subdirs = entries.filter((e) => e.isDirectory()).sort((a, b) => a.name.localeCompare(b.name)); + const subdirs = entries + .filter((e) => e.isDirectory()) + .sort((a, b) => a.name.localeCompare(b.name)); const results: JsonObject[] = []; for (const subdir of subdirs) { From d103697ea57576b4b406b5b995a59c791f61b790 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 17 Apr 2026 13:38:11 +0000 Subject: [PATCH 3/3] fix(core): address code review findings for directory discovery - Update eval-validator to recognize directory paths (no false warning) - Use lexicographic sort instead of locale-dependent 
localeCompare - Use strict null check for id injection (not falsy check) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../evaluation/loaders/case-file-loader.ts | 4 +- .../evaluation/validation/eval-validator.ts | 55 +++++++++++++++---- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts index 711500dd3..0568d1320 100644 --- a/packages/core/src/evaluation/loaders/case-file-loader.ts +++ b/packages/core/src/evaluation/loaders/case-file-loader.ts @@ -169,7 +169,7 @@ export async function loadCasesFromDirectory(dirPath: string): Promise<JsonObject[]> { const entries = await readdir(dirPath, { withFileTypes: true }); const subdirs = entries .filter((e) => e.isDirectory()) - .sort((a, b) => a.name.localeCompare(b.name)); + .sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0)); const results: JsonObject[] = []; for (const subdir of subdirs) { @@ -217,7 +217,7 @@ export async function loadCasesFromDirectory(dirPath: string): Promise