Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions apps/cli/src/commands/validate/validate-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
type ValidationResult,
type ValidationSummary,
detectFileType,
validateCasesFile,
validateConfigFile,
validateEvalFile,
validateFileReferences,
Expand All @@ -17,12 +18,7 @@ import fg from 'fast-glob';
*/
export async function validateFiles(paths: readonly string[]): Promise<ValidationSummary> {
const filePaths = await expandPaths(paths);
const results: ValidationResult[] = [];

for (const filePath of filePaths) {
const result = await validateSingleFile(filePath);
results.push(result);
}
const results = await Promise.all(filePaths.map((filePath) => validateSingleFile(filePath)));

const validFiles = results.filter((r) => r.valid).length;
const invalidFiles = results.filter((r) => !r.valid).length;
Expand Down Expand Up @@ -58,10 +54,27 @@ async function validateSingleFile(filePath: string): Promise<ValidationResult> {
};
}
}
} else if (fileType === 'cases') {
result = await validateCasesFile(absolutePath);
} else if (fileType === 'targets') {
result = await validateTargetsFile(absolutePath);
} else {
} else if (fileType === 'config') {
result = await validateConfigFile(absolutePath);
} else {
// Unknown file type — skip validation, report as skipped
result = {
valid: true,
filePath: absolutePath,
fileType: 'unknown',
errors: [
{
severity: 'warning',
filePath: absolutePath,
message:
'File type not recognized. Eval files must end in .eval.yaml. Skipping validation.',
},
],
};
}

return result;
Expand Down Expand Up @@ -130,7 +143,7 @@ async function findYamlFiles(dirPath: string): Promise<readonly string[]> {
}
const subFiles = await findYamlFiles(fullPath);
results.push(...subFiles);
} else if (entry.isFile() && isYamlFile(entry.name)) {
} else if (entry.isFile() && isEvalYamlFile(entry.name)) {
results.push(fullPath);
}
}
Expand All @@ -145,3 +158,9 @@ function isYamlFile(filePath: string): boolean {
const ext = path.extname(filePath).toLowerCase();
return ext === '.yaml' || ext === '.yml';
}

/**
 * Directory-scan filter: accepts a path only when its base name ends with
 * `.eval.yaml` or `.eval.yml`. The comparison is case-insensitive.
 */
function isEvalYamlFile(filePath: string): boolean {
  const baseName = path.basename(filePath).toLowerCase();
  return ['.eval.yaml', '.eval.yml'].some((suffix) => baseName.endsWith(suffix));
}
2 changes: 0 additions & 2 deletions examples/features/basic-jsonl/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,4 @@ name: basic-jsonl
execution:
target: llm

evaluator: llm_grader

tests: ./dataset.jsonl
10 changes: 6 additions & 4 deletions examples/features/prompt-template-sdk/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ tests:
- type: text
value: What are the main benefits of TypeScript over JavaScript?

reference_answer: |-
TypeScript provides static type checking, better IDE support, and improved maintainability.
expected_output:
- role: assistant
content: TypeScript provides static type checking, better IDE support, and improved maintainability.

assertions:
- name: custom-prompt-eval
Expand All @@ -37,8 +38,9 @@ tests:
- type: text
value: Explain async/await in JavaScript.

reference_answer: |-
Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous.
expected_output:
- role: assistant
content: Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous.

assertions:
- name: strict-eval
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ tests:
# Expected score: 1.0 (webSearch before fetchPage is satisfied)
# =============================================================================
- id: in-order-validation
description: |-
Validates that the agent performs web search before fetching page details.
Mode 'in_order' allows other tool calls between expected tools.

# Validates that the agent performs web search before fetching page details.
# Mode 'in_order' allows other tool calls between expected tools.
criteria: |-
Agent searches for product information, then fetches detailed specs.

Expand All @@ -49,10 +47,8 @@ tests:
# Expected score: 1.0 (matches full trace exactly)
# =============================================================================
- id: exact-sequence-validation
description: |-
Validates the exact sequence of all tool calls in the trace.
Mode 'exact' requires the trace to match precisely.

# Validates the exact sequence of all tool calls in the trace.
# Mode 'exact' requires the trace to match precisely.
criteria: |-
Agent follows the exact research workflow: search, fetch, search reviews, summarize.

Expand All @@ -76,10 +72,8 @@ tests:
# Expected score: 1.0 (meets all minimums)
# =============================================================================
- id: any-order-with-minimums
description: |-
Validates that the agent performs adequate research by checking minimum
tool call counts. Mode 'any_order' with minimums is flexible on sequence.

# Validates that the agent performs adequate research by checking minimum
# tool call counts. Mode 'any_order' with minimums is flexible on sequence.
criteria: |-
Agent performs at least 2 web searches and 1 page fetch for thorough research.

Expand All @@ -101,10 +95,8 @@ tests:
# Expected score: 1.0 (inputs match expected patterns)
# =============================================================================
- id: tool-input-validation
description: |-
Validates that tool calls include appropriate input parameters.
Useful for ensuring the agent provides correct context to tools.

# Validates that tool calls include appropriate input parameters.
# Useful for ensuring the agent provides correct context to tools.
criteria: |-
Agent searches with relevant product keywords and fetches from authoritative source.

Expand All @@ -130,10 +122,8 @@ tests:
# Expected score: 1.0 (outputs contain expected fields)
# =============================================================================
- id: tool-output-validation
description: |-
Validates that tool outputs contain expected data.
Useful for regression testing when tool behavior changes.

# Validates that tool outputs contain expected data.
# Useful for regression testing when tool behavior changes.
criteria: |-
Web search returns results with links and snippets; fetch returns product specs.

Expand Down Expand Up @@ -161,10 +151,8 @@ tests:
# This mirrors patterns used in complex agent pipelines
# =============================================================================
- id: combined-validation
description: |-
Production-style evaluation combining sequence validation with input/output
checks. Demonstrates a realistic multi-turn agent workflow.

# Production-style evaluation combining sequence validation with input/output
# checks. Demonstrates a realistic multi-turn agent workflow.
criteria: |-
Agent performs comprehensive product research:
1. Initial web search for product specs
Expand Down
2 changes: 1 addition & 1 deletion examples/showcase/tool-evaluation-plugins/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ evaluators:
export TOOL_EVAL_PLUGINS_DIR=$(pwd)/examples/showcase/tool-evaluation-plugins

# Run the demo
npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml
npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml
```

## Input Contract
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# semantic evaluation capabilities that require domain-specific logic.
#
# Run: cd examples/showcase/tool-evaluation-plugins
# npx agentv eval tool-eval-demo.yaml --target mock_agent
# npx agentv eval tool-eval-demo.eval.yaml --target mock_agent

description: Showcase of tool evaluation plugin patterns

Expand Down
98 changes: 98 additions & 0 deletions packages/core/src/evaluation/validation/cases-validator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import { readFile } from 'node:fs/promises';
import path from 'node:path';
import { parse } from 'yaml';

import type { ValidationError, ValidationResult } from './types.js';

/** A JSON-compatible value: a primitive, `null`, an object, or an array. */
type JsonValue = string | number | boolean | null | JsonObject | JsonArray;
/** A read-only, string-keyed mapping of JSON values. */
type JsonObject = { readonly [key: string]: JsonValue };
/** A read-only list of JSON values. */
type JsonArray = readonly JsonValue[];

/**
 * Type guard that accepts only non-null, non-array values whose `typeof`
 * is `'object'`.
 */
function isObject(value: unknown): value is JsonObject {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}

/**
 * Validate a cases file — a YAML file whose root is an array of test case objects.
 *
 * Cases files are referenced from eval files via `tests: path/to/cases.yaml` or
 * `file://cases/accuracy.yaml` entries in the tests array. Each item must have
 * at least an `id` (non-empty string) and an `input` (string or array).
 *
 * @param filePath - Path to the cases file; it is resolved to an absolute path
 *   before being read.
 * @returns A result with `fileType: 'cases'`. `valid` is false when the file
 *   cannot be read or parsed, when the root is not an array, or when any item
 *   produces an error-severity entry. This function does not throw for read or
 *   parse failures; they are reported in `errors` instead.
 */
export async function validateCasesFile(filePath: string): Promise<ValidationResult> {
  const errors: ValidationError[] = [];
  const absolutePath = path.resolve(filePath);

  let parsed: unknown;
  try {
    const content = await readFile(absolutePath, 'utf8');
    parsed = parse(content);
  } catch (error) {
    // The try block covers both the read and the parse, so I/O failures are
    // reported under the same "Failed to parse YAML" prefix as syntax errors.
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message` instead of asserting the type.
    const reason = error instanceof Error ? error.message : String(error);
    errors.push({
      severity: 'error',
      filePath: absolutePath,
      message: `Failed to parse YAML: ${reason}`,
    });
    return { valid: false, filePath: absolutePath, fileType: 'cases', errors };
  }

  if (!Array.isArray(parsed)) {
    errors.push({
      severity: 'error',
      filePath: absolutePath,
      message: 'Cases file must contain a YAML array of test case objects',
    });
    return { valid: false, filePath: absolutePath, fileType: 'cases', errors };
  }

  for (let i = 0; i < parsed.length; i++) {
    // Keep the element as `unknown` so every access below is narrowed first.
    const item: unknown = parsed[i];
    const location = `[${i}]`;

    if (!isObject(item)) {
      errors.push({
        severity: 'error',
        filePath: absolutePath,
        location,
        message: 'Each test case must be an object',
      });
      // Nothing further can be checked on a non-object entry.
      continue;
    }

    // Required: id (whitespace-only strings are rejected as empty)
    const id = item.id;
    if (typeof id !== 'string' || id.trim().length === 0) {
      errors.push({
        severity: 'error',
        filePath: absolutePath,
        location: `${location}.id`,
        message: "Missing or invalid 'id' field (must be a non-empty string)",
      });
    }

    // Required: input (absent and wrongly-typed get distinct messages)
    const input = item.input;
    if (input === undefined) {
      errors.push({
        severity: 'error',
        filePath: absolutePath,
        location: `${location}.input`,
        message: "Missing 'input' field (must be a string or array of messages)",
      });
    } else if (typeof input !== 'string' && !Array.isArray(input)) {
      errors.push({
        severity: 'error',
        filePath: absolutePath,
        location: `${location}.input`,
        message: "Invalid 'input' field (must be a string or array of messages)",
      });
    }
  }

  return {
    // Only error-severity entries invalidate the file.
    valid: !errors.some((e) => e.severity === 'error'),
    filePath: absolutePath,
    fileType: 'cases',
    errors,
  };
}
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,6 @@ const EvalTestSchema = z.object({
metadata: z.record(z.unknown()).optional(),
conversation_id: z.string().optional(),
suite: z.string().optional(),
note: z.string().optional(),
depends_on: z.array(z.string()).optional(),
on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(),
mode: z.enum(['conversation']).optional(),
Expand Down
Loading
Loading