EntityProcess · christso · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/apps/cli/src/commands/validate/validate-files.ts b/apps/cli/src/commands/validate/validate-files.ts
@@ -5,6 +5,7 @@ import {
   type ValidationResult,
   type ValidationSummary,
   detectFileType,
+  validateCasesFile,
   validateConfigFile,
   validateEvalFile,
   validateFileReferences,
@@ -17,12 +18,7 @@ import fg from 'fast-glob';
  */
 export async function validateFiles(paths: readonly string[]): Promise<ValidationSummary> {
   const filePaths = await expandPaths(paths);
-  const results: ValidationResult[] = [];
-
-  for (const filePath of filePaths) {
-    const result = await validateSingleFile(filePath);
-    results.push(result);
-  }
+  const results = await Promise.all(filePaths.map((filePath) => validateSingleFile(filePath)));
 
   const validFiles = results.filter((r) => r.valid).length;
   const invalidFiles = results.filter((r) => !r.valid).length;
@@ -58,10 +54,27 @@ async function validateSingleFile(filePath: string): Promise<ValidationResult> {
         };
       }
     }
+  } else if (fileType === 'cases') {
+    result = await validateCasesFile(absolutePath);
   } else if (fileType === 'targets') {
     result = await validateTargetsFile(absolutePath);
-  } else {
+  } else if (fileType === 'config') {
     result = await validateConfigFile(absolutePath);
+  } else {
+    // Unknown file type — skip validation; the result stays valid and carries a warning
+    result = {
+      valid: true,
+      filePath: absolutePath,
+      fileType: 'unknown',
+      errors: [
+        {
+          severity: 'warning',
+          filePath: absolutePath,
+          message:
+            'File type not recognized. Eval files must end in .eval.yaml. Skipping validation.',
+        },
+      ],
+    };
   }
 
   return result;
@@ -130,7 +143,7 @@ async function findYamlFiles(dirPath: string): Promise<readonly string[]> {
         }
         const subFiles = await findYamlFiles(fullPath);
         results.push(...subFiles);
-      } else if (entry.isFile() && isYamlFile(entry.name)) {
+      } else if (entry.isFile() && isEvalYamlFile(entry.name)) {
         results.push(fullPath);
       }
     }
@@ -145,3 +158,9 @@ function isYamlFile(filePath: string): boolean {
   const ext = path.extname(filePath).toLowerCase();
   return ext === '.yaml' || ext === '.yml';
 }
+
+/** Returns true only when the basename ends in .eval.yaml or .eval.yml, ignoring case (used for directory scanning). */
+function isEvalYamlFile(filePath: string): boolean {
+  const lower = path.basename(filePath).toLowerCase();
+  return lower.endsWith('.eval.yaml') || lower.endsWith('.eval.yml');
+}
diff --git a/examples/features/basic-jsonl/evals/dataset.eval.yaml b/examples/features/basic-jsonl/evals/dataset.eval.yaml
@@ -7,6 +7,4 @@ name: basic-jsonl
 execution:
   target: llm
 
-evaluator: llm_grader
-
 tests: ./dataset.jsonl
diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml
@@ -18,8 +18,9 @@ tests:
           - type: text
             value: What are the main benefits of TypeScript over JavaScript?
 
-    reference_answer: |-
-      TypeScript provides static type checking, better IDE support, and improved maintainability.
+    expected_output:
+      - role: assistant
+        content: TypeScript provides static type checking, better IDE support, and improved maintainability.
 
     assertions:
       - name: custom-prompt-eval
@@ -37,8 +38,9 @@ tests:
           - type: text
             value: Explain async/await in JavaScript.
 
-    reference_answer: |-
-      Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous.
+    expected_output:
+      - role: assistant
+        content: Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous.
 
     assertions:
       - name: strict-eval

diff --git a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml
@@ -23,10 +23,8 @@ tests:
   # Expected score: 1.0 (webSearch before fetchPage is satisfied)
   # =============================================================================
   - id: in-order-validation
-    description: |-
-      Validates that the agent performs web search before fetching page details.
-      Mode 'in_order' allows other tool calls between expected tools.
-
+    # Validates that the agent performs web search before fetching page details.
+    # Mode 'in_order' allows other tool calls between expected tools.
     criteria: |-
       Agent searches for product information, then fetches detailed specs.
 
@@ -49,10 +47,8 @@ tests:
   # Expected score: 1.0 (matches full trace exactly)
   # =============================================================================
   - id: exact-sequence-validation
-    description: |-
-      Validates the exact sequence of all tool calls in the trace.
-      Mode 'exact' requires the trace to match precisely.
-
+    # Validates the exact sequence of all tool calls in the trace.
+    # Mode 'exact' requires the trace to match precisely.
     criteria: |-
       Agent follows the exact research workflow: search, fetch, search reviews, summarize.
 
@@ -76,10 +72,8 @@ tests:
   # Expected score: 1.0 (meets all minimums)
   # =============================================================================
   - id: any-order-with-minimums
-    description: |-
-      Validates that the agent performs adequate research by checking minimum
-      tool call counts. Mode 'any_order' with minimums is flexible on sequence.
-
+    # Validates that the agent performs adequate research by checking minimum
+    # tool call counts. Mode 'any_order' with minimums is flexible on sequence.
     criteria: |-
       Agent performs at least 2 web searches and 1 page fetch for thorough research.
 
@@ -101,10 +95,8 @@ tests:
   # Expected score: 1.0 (inputs match expected patterns)
   # =============================================================================
   - id: tool-input-validation
-    description: |-
-      Validates that tool calls include appropriate input parameters.
-      Useful for ensuring the agent provides correct context to tools.
-
+    # Validates that tool calls include appropriate input parameters.
+    # Useful for ensuring the agent provides correct context to tools.
     criteria: |-
       Agent searches with relevant product keywords and fetches from authoritative source.
 
@@ -130,10 +122,8 @@ tests:
   # Expected score: 1.0 (outputs contain expected fields)
   # =============================================================================
   - id: tool-output-validation
-    description: |-
-      Validates that tool outputs contain expected data.
-      Useful for regression testing when tool behavior changes.
-
+    # Validates that tool outputs contain expected data.
+    # Useful for regression testing when tool behavior changes.
     criteria: |-
       Web search returns results with links and snippets; fetch returns product specs.
 
@@ -161,10 +151,8 @@ tests:
   # This mirrors patterns used in complex agent pipelines
   # =============================================================================
   - id: combined-validation
-    description: |-
-      Production-style evaluation combining sequence validation with input/output
-      checks. Demonstrates a realistic multi-turn agent workflow.
-
+    # Production-style evaluation combining sequence validation with input/output
+    # checks. Demonstrates a realistic multi-turn agent workflow.
     criteria: |-
       Agent performs comprehensive product research:
       1. Initial web search for product specs

diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md
@@ -56,7 +56,7 @@ evaluators:
 export TOOL_EVAL_PLUGINS_DIR=$(pwd)/examples/showcase/tool-evaluation-plugins
 
 # Run the demo
-npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml
+npx agentv eval examples/showcase/tool-evaluation-plugins/tool-eval-demo.eval.yaml
 ```
 
 ## Input Contract

diff --git a/...ol-evaluation-plugins/tool-eval-demo.yaml → ...aluation-plugins/tool-eval-demo.eval.yaml b/...ol-evaluation-plugins/tool-eval-demo.yaml → ...aluation-plugins/tool-eval-demo.eval.yaml
@@ -5,7 +5,7 @@
 # semantic evaluation capabilities that require domain-specific logic.
 #
 # Run: cd examples/showcase/tool-evaluation-plugins
-#      npx agentv eval tool-eval-demo.yaml --target mock_agent
+#      npx agentv eval tool-eval-demo.eval.yaml --target mock_agent
 
 description: Showcase of tool evaluation plugin patterns
 

diff --git a/packages/core/src/evaluation/validation/cases-validator.ts b/packages/core/src/evaluation/validation/cases-validator.ts
@@ -0,0 +1,98 @@
+import { readFile } from 'node:fs/promises';
+import path from 'node:path';
+import { parse } from 'yaml';
+
+import type { ValidationError, ValidationResult } from './types.js';
+
+type JsonValue = string | number | boolean | null | JsonObject | JsonArray;
+type JsonObject = { readonly [key: string]: JsonValue };
+type JsonArray = readonly JsonValue[];
+
+function isObject(value: unknown): value is JsonObject { // true only for non-null, non-array objects
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
+
+/**
+ * Validate a cases file — a YAML file whose root is an array of test case objects.
+ * Cases files are referenced from eval files via `tests: path/to/cases.yaml` or
+ * `file://cases/accuracy.yaml` entries in the tests array. Each item must be an object with an
+ * `id` (non-empty string) and an `input` (string or array); every violation is collected with
+ * an `[index]` location. Read and YAML parse failures are both reported as a parse error.
+ */
+export async function validateCasesFile(filePath: string): Promise<ValidationResult> {
+  const errors: ValidationError[] = [];
+  const absolutePath = path.resolve(filePath);
+
+  let parsed: unknown;
+  try {
+    const content = await readFile(absolutePath, 'utf8');
+    parsed = parse(content);
+  } catch (error) { // reached for readFile failures as well as YAML syntax errors
+    errors.push({
+      severity: 'error',
+      filePath: absolutePath,
+      message: `Failed to parse YAML: ${(error as Error).message}`,
+    });
+    return { valid: false, filePath: absolutePath, fileType: 'cases', errors };
+  }
+
+  if (!Array.isArray(parsed)) {
+    errors.push({
+      severity: 'error',
+      filePath: absolutePath,
+      message: 'Cases file must contain a YAML array of test case objects',
+    });
+    return { valid: false, filePath: absolutePath, fileType: 'cases', errors };
+  }
+
+  for (let i = 0; i < parsed.length; i++) {
+    const item = parsed[i];
+    const location = `[${i}]`;
+
+    if (!isObject(item)) {
+      errors.push({
+        severity: 'error',
+        filePath: absolutePath,
+        location,
+        message: 'Each test case must be an object',
+      });
+      continue; // the field checks below only apply to objects
+    }
+
+    // Required: id — must be a string that is non-empty after trimming whitespace
+    const id = item.id;
+    if (typeof id !== 'string' || id.trim().length === 0) {
+      errors.push({
+        severity: 'error',
+        filePath: absolutePath,
+        location: `${location}.id`,
+        message: "Missing or invalid 'id' field (must be a non-empty string)",
+      });
+    }
+
+    // Required: input — an absent key and a wrong-typed value (including null) get distinct messages
+    const input = item.input;
+    if (input === undefined) {
+      errors.push({
+        severity: 'error',
+        filePath: absolutePath,
+        location: `${location}.input`,
+        message: "Missing 'input' field (must be a string or array of messages)",
+      });
+    } else if (typeof input !== 'string' && !Array.isArray(input)) {
+      errors.push({
+        severity: 'error',
+        filePath: absolutePath,
+        location: `${location}.input`,
+        message: "Invalid 'input' field (must be a string or array of messages)",
+      });
+    }
+  }
+
+  return {
+    valid: errors.filter((e) => e.severity === 'error').length === 0, // only error-severity entries invalidate
+    filePath: absolutePath,
+    fileType: 'cases',
+    errors,
+  };
+}
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -382,7 +382,6 @@ const EvalTestSchema = z.object({
   metadata: z.record(z.unknown()).optional(),
   conversation_id: z.string().optional(),
   suite: z.string().optional(),
-  note: z.string().optional(),
   depends_on: z.array(z.string()).optional(),
   on_dependency_failure: z.enum(['skip', 'fail', 'run']).optional(),
   mode: z.enum(['conversation']).optional(),