Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion apps/web/src/content/docs/docs/evaluation/eval-files.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ tests:
| `suite` | Optional suite identifier |
| `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) |
| `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config) |
| `tests` | Array of individual tests, or a string path to an external file |
| `tests` | Array of individual tests, or a string path to an external file or directory |
| `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test |
| `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test |

Expand Down Expand Up @@ -178,6 +178,46 @@ tests: ./cases.yaml

The path is resolved relative to the eval file's directory. The external file must be either a YAML file containing an array of test objects or a JSONL file with one test object per line.

### Tests as Directory Path

When `tests` points to a directory, AgentV auto-discovers test cases from subdirectories. Each subdirectory containing a `case.yaml` (or `case.yml`) becomes a test case:

```
my-eval/
  EVAL.yaml
  cases/
    fix-null-check/
      case.yaml
    add-greeting/
      case.yaml
      workspace/          # optional per-case workspace template
        setup-files...
```

```yaml
# EVAL.yaml
name: my-benchmark
tests: ./cases/
```

Each `case.yaml` is a single YAML object (not an array) with the same fields as an inline test:

```yaml
# cases/fix-null-check/case.yaml
criteria: Fixes the null reference bug in the parser module
input: Fix the null check bug in parser.ts
```

**Behavior:**

- **Directory name as `id`:** If `case.yaml` doesn't specify an `id`, the directory name is used (e.g., `fix-null-check`)
- **Alphabetical ordering:** Subdirectories are sorted alphabetically for deterministic order
- **Per-case workspace:** A `workspace/` subdirectory inside the case directory automatically sets `workspace.template` to that path, unless the case already defines a `workspace` field
- **Skipped directories:** Subdirectories that contain neither `case.yaml` nor `case.yml` are skipped with a warning
- **Suite-level config applies:** Suite-level `assertions`, `input`, `workspace`, and `execution` still apply to directory-discovered cases

This pattern is useful for benchmarks with many cases, where each case benefits from its own directory for workspace templates, supporting files, or documentation.

## Environment Variable Interpolation

All string fields in eval files support `${{ VAR }}` syntax for environment variable interpolation. This enables portable eval configs that work across machines and CI environments without hardcoded paths.
Expand Down
4 changes: 4 additions & 0 deletions examples/showcase/directory-discovery/EVAL.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name: directory-discovery
description: Demonstrates auto-discovering test cases from a directory structure

tests: ./cases/
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
criteria: Adds a greeting message that displays the user's name
input: |
Add a greeting feature to the homepage. When a user logs in,
display "Welcome back, {name}!" at the top of the page.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
criteria: Identifies and fixes the null reference bug in the parser module
input: |
Fix the null check bug in parser.ts. The function `parseToken` crashes
when given an empty string because it doesn't check for null before
accessing `.length`.
84 changes: 83 additions & 1 deletion packages/core/src/evaluation/loaders/case-file-loader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { readFile } from 'node:fs/promises';
import { readFile, readdir, stat } from 'node:fs/promises';
import path from 'node:path';
import fg from 'fast-glob';
import { parse as parseYaml } from 'yaml';
Expand Down Expand Up @@ -158,6 +158,88 @@ export async function resolveFileReference(
return loadCasesFromFile(absolutePattern);
}

/**
 * Load test cases from a directory structure.
 *
 * Scans the immediate subdirectories of `dirPath` (ordered by plain string
 * comparison of their names, so the result order is deterministic) and loads
 * `case.yaml` — or `case.yml` if the former is absent — from each one.
 * Subdirectories with neither file are skipped with a console warning.
 *
 * For every loaded case:
 * - the subdirectory name is used as `id` when the case file leaves `id`
 *   undefined or null;
 * - if the case has no truthy `workspace` value and the subdirectory contains
 *   a `workspace/` directory, `workspace` is set to `{ template: <that path> }`.
 *
 * @param dirPath - Directory whose immediate subdirectories hold the cases.
 * @returns One object per discovered case, in subdirectory-name order.
 * @throws Error if `dirPath` cannot be listed, or if a case file cannot be
 *   read, is not valid YAML, or does not contain a single YAML object.
 */
export async function loadCasesFromDirectory(dirPath: string): Promise<JsonObject[]> {
  const entries = await readdir(dirPath, { withFileTypes: true });
  const subdirs = entries
    .filter((e) => e.isDirectory())
    .sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const results: JsonObject[] = [];
  for (const subdir of subdirs) {
    const subdirPath = path.join(dirPath, subdir.name);

    // Look for case.yaml first, then case.yml.
    let caseFilePath: string | undefined;
    for (const filename of ['case.yaml', 'case.yml']) {
      const candidate = path.join(subdirPath, filename);
      try {
        const s = await stat(candidate);
        if (s.isFile()) {
          caseFilePath = candidate;
          break;
        }
      } catch {
        // Candidate could not be stat'ed (typically: does not exist); try the next name.
      }
    }

    if (!caseFilePath) {
      console.warn(
        `${ANSI_YELLOW}Warning: Skipping directory '${subdir.name}' — no case.yaml found${ANSI_RESET}`,
      );
      continue;
    }

    let content: string;
    try {
      content = await readFile(caseFilePath, 'utf8');
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Cannot read case file: ${caseFilePath}\n  ${message}`);
    }

    // Attach the file path to YAML syntax errors; the parser only sees the
    // text and cannot say which of the many case files was malformed.
    let raw: unknown;
    try {
      raw = parseYaml(content) as unknown;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Cannot parse case file: ${caseFilePath}\n  ${message}`);
    }

    // A case file is a single YAML object (not an array of cases).
    const parsed = interpolateEnv(raw, process.env);
    if (!isJsonObject(parsed)) {
      // `typeof` reports "object" for both null (e.g. an empty file) and
      // arrays, which would make the message self-contradictory.
      const actual = Array.isArray(parsed) ? 'array' : parsed === null ? 'null' : typeof parsed;
      throw new Error(`Case file must contain a YAML object, got ${actual}: ${caseFilePath}`);
    }

    // Shallow copy so the injected defaults never touch the parsed value.
    const caseObj = { ...parsed };

    // Inject id from directory name if not specified.
    if (caseObj.id === undefined || caseObj.id === null) {
      caseObj.id = subdir.name;
    }

    // Use a sibling workspace/ directory as the template unless the case
    // already carries its own workspace setting.
    if (!caseObj.workspace) {
      const workspaceDirPath = path.join(subdirPath, 'workspace');
      try {
        const s = await stat(workspaceDirPath);
        if (s.isDirectory()) {
          caseObj.workspace = { template: workspaceDirPath };
        }
      } catch {
        // No workspace directory, that's fine.
      }
    }

    results.push(caseObj);
  }

  return results;
}

/**
* Process a tests array, expanding any file:// references into inline test objects.
* Returns a flat array of JsonValue where all file:// strings are replaced
Expand Down
55 changes: 44 additions & 11 deletions packages/core/src/evaluation/validation/eval-validator.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { readFile, readdir } from 'node:fs/promises';
import { readFile, readdir, stat } from 'node:fs/promises';
import path from 'node:path';
import { parse } from 'yaml';

import { interpolateEnv } from '../interpolation.js';
import { loadCasesFromFile } from '../loaders/case-file-loader.js';
import { loadCasesFromDirectory, loadCasesFromFile } from '../loaders/case-file-loader.js';
import { isGraderKind } from '../types.js';
import type { ValidationError, ValidationResult } from './types.js';

Expand Down Expand Up @@ -234,20 +234,27 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu

const cases: JsonValue | undefined = parsed.tests;

// tests can be a string path (external file reference) or an array
// tests can be a string path (external file/directory reference) or an array
if (typeof cases === 'string') {
validateTestsStringPath(cases, absolutePath, errors);
await validateWorkspaceConfig(parsed.workspace, absolutePath, errors, 'workspace');

const ext = path.extname(cases).toLowerCase();
if (VALID_TEST_FILE_EXTENSIONS.has(ext)) {
const externalCasesPath = path.resolve(path.dirname(absolutePath), cases);
const externalCasesPath = path.resolve(path.dirname(absolutePath), cases);
let isDir = false;
try {
const pathStat = await stat(externalCasesPath);
isDir = pathStat.isDirectory();
} catch {
// Path doesn't exist — fall through to file validation
}

if (isDir) {
// Directory path: load and validate discovered cases
try {
const externalCases = await loadCasesFromFile(externalCasesPath);
for (let i = 0; i < externalCases.length; i++) {
const externalCase = externalCases[i];
const dirCases = await loadCasesFromDirectory(externalCasesPath);
for (let i = 0; i < dirCases.length; i++) {
const dirCase = dirCases[i];
await validateWorkspaceConfig(
externalCase.workspace,
dirCase.workspace,
absolutePath,
errors,
`tests[${i}].workspace`,
Expand All @@ -262,6 +269,32 @@ export async function validateEvalFile(filePath: string): Promise<ValidationResu
message,
});
}
} else {
// File path: validate extension and load
validateTestsStringPath(cases, absolutePath, errors);
const ext = path.extname(cases).toLowerCase();
if (VALID_TEST_FILE_EXTENSIONS.has(ext)) {
try {
const externalCases = await loadCasesFromFile(externalCasesPath);
for (let i = 0; i < externalCases.length; i++) {
const externalCase = externalCases[i];
await validateWorkspaceConfig(
externalCase.workspace,
absolutePath,
errors,
`tests[${i}].workspace`,
);
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
errors.push({
severity: 'error',
filePath: absolutePath,
location: 'tests',
message,
});
}
}
}

return {
Expand Down
24 changes: 19 additions & 5 deletions packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import { readFile } from 'node:fs/promises';
import { readFile, stat } from 'node:fs/promises';
import path from 'node:path';
import micromatch from 'micromatch';
import { parse } from 'yaml';

import { collectResolvedInputFilePaths } from './input-message-utils.js';
import { interpolateEnv } from './interpolation.js';
import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js';
import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js';
import {
expandFileReferences,
loadCasesFromDirectory,
loadCasesFromFile,
} from './loaders/case-file-loader.js';
import {
extractBudgetUsd,
extractCacheConfig,
Expand Down Expand Up @@ -332,12 +336,22 @@ async function loadTestsFromYaml(
// Parse suite-level workspace config (default for all cases)
const evalFileDir = path.dirname(absoluteTestPath);

// Resolve tests: string path to external file, inline array, or error
// Resolve tests: string path to external file/directory, inline array, or error
let expandedTestCases: readonly JsonValue[];
if (typeof rawTestCases === 'string') {
// String path: load tests from external file (YAML, JSONL)
const externalPath = path.resolve(evalFileDir, rawTestCases);
expandedTestCases = await loadCasesFromFile(externalPath);
let isDir = false;
try {
const pathStat = await stat(externalPath);
isDir = pathStat.isDirectory();
} catch {
// Path doesn't exist — fall through to loadCasesFromFile for its error message
}
if (isDir) {
expandedTestCases = await loadCasesFromDirectory(externalPath);
} else {
expandedTestCases = await loadCasesFromFile(externalPath);
}
} else if (Array.isArray(rawTestCases)) {
// Inline array: expand any file:// references
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
Expand Down
Loading
Loading