From 1513e7965f83779715528db0a7bd238a3021d2fc Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 16 Apr 2026 00:20:39 +0000 Subject: [PATCH 1/2] feat(cli): add *.eval.ts auto-discovery (#1116) Add TypeScript eval file support to `agentv run`. TS eval files export an EvalConfig (default, `config`, or `evalConfig` named export) and are discovered alongside YAML files via the same glob/path resolution. Changes: - shared.ts: Include .ts in file extension regex and directory auto-glob - config-loader.ts: Add **/evals/**/*.eval.ts to DEFAULT_EVAL_PATTERNS - jsonl-parser.ts: Add typescript format detection in detectFormat() - ts-eval-loader.ts: New loader that imports TS modules and extracts EvalConfig - run-eval.ts: Integrate TS files through evaluate() with CLI overrides, feeding results through the same artifact/reporting pipeline - run.ts: Update CLI description to mention .ts files - index.ts: Export loadTsEvalFile and TsEvalResult from @agentv/core Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/eval/commands/run.ts | 2 +- apps/cli/src/commands/eval/run-eval.ts | 71 ++++++++++++++----- apps/cli/src/commands/eval/shared.ts | 11 +-- apps/cli/test/commands/eval/shared.test.ts | 37 ++++++++++ .../src/evaluation/loaders/config-loader.ts | 1 + .../src/evaluation/loaders/jsonl-parser.ts | 7 +- .../src/evaluation/loaders/ts-eval-loader.ts | 58 +++++++++++++++ packages/core/src/index.ts | 4 ++ .../loaders/fixtures/default-export.eval.ts | 14 ++++ .../fixtures/eval-config-named.eval.ts | 12 ++++ .../loaders/fixtures/named-config.eval.ts | 12 ++++ .../loaders/fixtures/no-config.eval.ts | 2 + .../evaluation/loaders/jsonl-parser.test.ts | 9 +++ .../evaluation/loaders/ts-eval-loader.test.ts | 39 ++++++++++ 14 files changed, 255 insertions(+), 24 deletions(-) create mode 100644 packages/core/src/evaluation/loaders/ts-eval-loader.ts create mode 100644 packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts create mode 100644 packages/core/test/evaluation/loaders/fixtures/eval-config-named.eval.ts create mode 100644 packages/core/test/evaluation/loaders/fixtures/named-config.eval.ts create mode 100644 packages/core/test/evaluation/loaders/fixtures/no-config.eval.ts create mode 100644 packages/core/test/evaluation/loaders/ts-eval-loader.test.ts diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 18668aa53..3c4631f27 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -20,7 +20,7 @@ export const evalRunCommand = command({ evalPaths: restPositionals({ type: string, displayName: 'eval-paths', - description: 'Path(s) or glob(s) to evaluation .yaml file(s)', + description: 'Path(s) or glob(s) to evaluation files (.yaml, .eval.ts)', }), target: multioption({ type: array(string), diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a69078be0..5e84d40df 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -17,9 +17,11 @@ import { runEvaluation as defaultRunEvaluation, deriveCategory, ensureVSCodeSubagents, + evaluate, loadConfig, loadTestSuite, loadTsConfig, + loadTsEvalFile, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToCodexLogEntries, @@ -1172,30 +1174,18 @@ export async function runEvalCommand( readonly tags?: readonly string[]; } >(); - // Separate TypeScript/JS eval files from YAML files. - // TS files are self-contained scripts that call evaluate() directly. + // Separate TypeScript eval files from YAML/JSONL files. + // TS files export an EvalConfig and run through evaluate(). const tsFiles: string[] = []; const yamlFiles: string[] = []; for (const testFilePath of resolvedTestFiles) { - if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) { + if (/\.(ts|mts)$/.test(testFilePath)) { tsFiles.push(testFilePath); } else { yamlFiles.push(testFilePath); } } - // Run TypeScript eval files by importing them. - // evaluate() runs during import via top-level await and handles its own output. - for (const tsFile of tsFiles) { - await ensureFileExists(tsFile, 'TypeScript eval file'); - await import(pathToFileURL(tsFile).href); - } - - // If only TS files were provided, we're done — evaluate() handled everything. - if (yamlFiles.length === 0 && tsFiles.length > 0) { - return; - } - for (const testFilePath of yamlFiles) { const meta = await prepareFileMetadata({ testFilePath, @@ -1287,7 +1277,7 @@ export async function runEvalCommand( } } - if (totalEvalCount === 0) { + if (totalEvalCount === 0 && tsFiles.length === 0) { // When using --retry-errors, all tests being filtered means no errors or missing cases remain if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) { console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); @@ -1355,7 +1345,7 @@ export async function runEvalCommand( } } - // Use only files that survived tag filtering (fileMetadata keys) + // Use only files that survived tag filtering (fileMetadata keys) — TS files are processed separately above const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); // --transcript: create a shared TranscriptProvider and validate entry count @@ -1387,6 +1377,53 @@ export async function runEvalCommand( // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file // workspace races without any grouping complexity. try { + // Process TypeScript eval files through evaluate() with CLI overrides. + // Results flow through the same output/artifact pipeline as YAML evals. + // Note: TS eval files don't carry tags; they're skipped when --tag/--exclude-tag is active. + const tsFilesToRun = hasTagFilters + ? (() => { + if (tsFiles.length > 0 && options.verbose) { + console.log( + `Skipped ${tsFiles.length} TS eval file(s) — tag filters don't apply to *.eval.ts files.`, + ); + } + return [] as string[]; + })() + : tsFiles; + + for (const tsFile of tsFilesToRun) { + await ensureFileExists(tsFile, 'TypeScript eval file'); + const { config: tsConfig } = await loadTsEvalFile(tsFile); + + const cliOverrides: Record = {}; + if (options.workers !== undefined) cliOverrides.workers = options.workers; + if (options.filter) cliOverrides.filter = options.filter; + if (resolvedThreshold !== undefined) cliOverrides.threshold = resolvedThreshold; + if (options.cache !== undefined) cliOverrides.cache = options.cache; + if (options.verbose !== undefined) cliOverrides.verbose = options.verbose; + if (options.maxRetries !== 2) cliOverrides.maxRetries = options.maxRetries; + if (options.agentTimeoutSeconds !== undefined) { + cliOverrides.agentTimeoutMs = options.agentTimeoutSeconds * 1000; + } + + console.log(`Running TS eval: ${path.relative(cwd, tsFile)}`); + + const evalResult = await evaluate({ + ...tsConfig, + ...cliOverrides, + onResult: (result: EvaluationResult) => { + outputWriter.append(result); + tsConfig.onResult?.(result); + }, + }); + + allResults.push(...evalResult.results); + remoteEvalSummaries.push({ + evalFile: path.relative(cwd, tsFile), + results: [...evalResult.results], + }); + } + for (const testFilePath of activeTestFiles) { const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts index 55decf920..7570a2e92 100644 --- a/apps/cli/src/commands/eval/shared.ts +++ b/apps/cli/src/commands/eval/shared.ts @@ -34,13 +34,16 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis : path.resolve(cwd, pattern); try { const stats = await stat(candidatePath); - if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) { + if (stats.isFile() && /\.(ya?ml|jsonl|json|ts)$/i.test(candidatePath)) { results.add(candidatePath); continue; } if (stats.isDirectory()) { // Auto-expand directory to recursive eval file glob - const dirGlob = path.posix.join(candidatePath.replace(/\\/g, '/'), '**/*.eval.{yaml,yml}'); + const dirGlob = path.posix.join( + candidatePath.replace(/\\/g, '/'), + '**/*.eval.{yaml,yml,ts}', + ); const dirMatches = await fg(dirGlob, { absolute: true, onlyFiles: true, @@ -69,7 +72,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis ignore: ignorePatterns, }); - const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath)); + const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json|ts)$/i.test(filePath)); for (const filePath of yamlMatches) { results.add(path.normalize(filePath)); } @@ -94,7 +97,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis throw new Error( `No eval files matched any provided paths or globs: ${includePatterns.join( ', ', - )}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`, + )}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.eval.ts").`, ); } diff --git a/apps/cli/test/commands/eval/shared.test.ts b/apps/cli/test/commands/eval/shared.test.ts index 4eb8c07ff..ed0e5e494 100644 --- a/apps/cli/test/commands/eval/shared.test.ts +++ b/apps/cli/test/commands/eval/shared.test.ts @@ -64,4 +64,41 @@ describe('resolveEvalPaths', () => { resolveEvalPaths(['evals/**/*.eval.yaml', 'evals/**/eval.yaml'], tempDir), ).rejects.toThrow('No eval files matched any provided paths or globs'); }); + + it('discovers *.eval.ts files from directory auto-expansion', async () => { + const evalDir = path.join(tempDir, 'evals'); + mkdirSync(evalDir, { recursive: true }); + + const tsFile = path.join(evalDir, 'greeting.eval.ts'); + writeFileSync(tsFile, 'export default { tests: [] }'); + + const resolved = await resolveEvalPaths([tempDir], tempDir); + + expect(resolved).toEqual([path.normalize(tsFile)]); + }); + + it('accepts a direct .ts file path', async () => { + const tsFile = path.join(tempDir, 'custom.eval.ts'); + writeFileSync(tsFile, 'export default { tests: [] }'); + + const resolved = await resolveEvalPaths([tsFile], tempDir); + + expect(resolved).toEqual([path.normalize(tsFile)]); + }); + + it('discovers both .yaml and .ts files from directory', async () => { + const evalDir = path.join(tempDir, 'evals'); + mkdirSync(evalDir, { recursive: true }); + + const yamlFile = path.join(evalDir, 'suite.eval.yaml'); + const tsFile = path.join(evalDir, 'suite.eval.ts'); + writeFileSync(yamlFile, 'tests:\n - id: sample\n input: test\n'); + writeFileSync(tsFile, 'export default { tests: [] }'); + + const resolved = await resolveEvalPaths([tempDir], tempDir); + + expect(resolved).toContain(path.normalize(yamlFile)); + expect(resolved).toContain(path.normalize(tsFile)); + expect(resolved).toHaveLength(2); + }); }); diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index 7aede85f9..377222123 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -21,6 +21,7 @@ const ANSI_RESET = '\u001b[0m'; export const DEFAULT_EVAL_PATTERNS: readonly string[] = [ '**/evals/**/*.eval.yaml', '**/evals/**/eval.yaml', + '**/evals/**/*.eval.ts', ]; export type ExecutionDefaults = { diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 0887048fe..288d0bd22 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -62,13 +62,16 @@ type RawJsonlEvalCase = JsonObject & { /** * Detect file format by extension. */ -export function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json' { +export function detectFormat( + filePath: string, +): 'yaml' | 'jsonl' | 'agent-skills-json' | 'typescript' { const ext = path.extname(filePath).toLowerCase(); if (ext === '.jsonl') return 'jsonl'; if (ext === '.yaml' || ext === '.yml') return 'yaml'; if (ext === '.json') return 'agent-skills-json'; + if (ext === '.ts' || ext === '.mts') return 'typescript'; throw new Error( - `Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl, .json`, + `Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl, .json, .ts`, ); } diff --git a/packages/core/src/evaluation/loaders/ts-eval-loader.ts b/packages/core/src/evaluation/loaders/ts-eval-loader.ts new file mode 100644 index 000000000..eb4946e3b --- /dev/null +++ b/packages/core/src/evaluation/loaders/ts-eval-loader.ts @@ -0,0 +1,58 @@ +/** + * Loads an eval suite from a TypeScript *.eval.ts file. + * + * Each TS eval file must export an EvalConfig as its default export or + * as a named export called `config` or `evalConfig`. + * + * The file is loaded via dynamic import() which works natively in Bun + * and requires tsx/jiti for Node.js. + * + * To add a new export convention: add the name to EXPORT_NAMES below. + */ +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; +import type { EvalConfig } from '../evaluate.js'; + +const EXPORT_NAMES = ['default', 'config', 'evalConfig'] as const; + +export interface TsEvalResult { + readonly config: EvalConfig; + readonly filePath: string; +} + +/** + * Import a *.eval.ts file and extract the EvalConfig export. + * Tries default, `config`, and `evalConfig` named exports in priority order. + */ +export async function loadTsEvalFile(filePath: string): Promise { + const absolutePath = path.resolve(filePath); + const moduleUrl = pathToFileURL(absolutePath).href; + const module = await import(moduleUrl); + + let config: EvalConfig | undefined; + for (const name of EXPORT_NAMES) { + const candidate = module[name]; + if (isEvalConfigLike(candidate)) { + config = candidate; + break; + } + } + + if (!config) { + throw new Error( + `${filePath}: no EvalConfig export found. Export an EvalConfig as default, 'config', or 'evalConfig'.`, + ); + } + + return { config, filePath: absolutePath }; +} + +/** + * Duck-type check for EvalConfig-like objects. + * An EvalConfig must have at least one of: tests, specFile, or target. + */ +function isEvalConfigLike(value: unknown): value is EvalConfig { + if (!value || typeof value !== 'object') return false; + const obj = value as Record; + return 'tests' in obj || 'specFile' in obj || 'target' in obj || 'task' in obj; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 2719b941c..e809da5ea 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -11,6 +11,10 @@ export { type AgentVConfig as AgentVYamlConfig, type ResultsExportConfig, } from './evaluation/loaders/config-loader.js'; +export { + loadTsEvalFile, + type TsEvalResult, +} from './evaluation/loaders/ts-eval-loader.js'; export { transpileEvalYaml, transpileEvalYamlFile, diff --git a/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts b/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts new file mode 100644 index 000000000..13c449055 --- /dev/null +++ b/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts @@ -0,0 +1,14 @@ +import type { EvalConfig } from '../../../../src/evaluation/evaluate.js'; + +const config: EvalConfig = { + tests: [ + { + id: 'greeting', + input: 'Say hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { provider: 'mock_agent' }, +}; + +export default config; diff --git a/packages/core/test/evaluation/loaders/fixtures/eval-config-named.eval.ts b/packages/core/test/evaluation/loaders/fixtures/eval-config-named.eval.ts new file mode 100644 index 000000000..2c74e72e0 --- /dev/null +++ b/packages/core/test/evaluation/loaders/fixtures/eval-config-named.eval.ts @@ -0,0 +1,12 @@ +import type { EvalConfig } from '../../../../src/evaluation/evaluate.js'; + +export const evalConfig: EvalConfig = { + tests: [ + { + id: 'eval-config-named', + input: 'Say hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { provider: 'mock_agent' }, +}; diff --git a/packages/core/test/evaluation/loaders/fixtures/named-config.eval.ts b/packages/core/test/evaluation/loaders/fixtures/named-config.eval.ts new file mode 100644 index 000000000..8dfb9f81c --- /dev/null +++ b/packages/core/test/evaluation/loaders/fixtures/named-config.eval.ts @@ -0,0 +1,12 @@ +import type { EvalConfig } from '../../../../src/evaluation/evaluate.js'; + +export const config: EvalConfig = { + tests: [ + { + id: 'named-config', + input: 'Say hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { provider: 'mock_agent' }, +}; diff --git a/packages/core/test/evaluation/loaders/fixtures/no-config.eval.ts b/packages/core/test/evaluation/loaders/fixtures/no-config.eval.ts new file mode 100644 index 000000000..ee9eb65c0 --- /dev/null +++ b/packages/core/test/evaluation/loaders/fixtures/no-config.eval.ts @@ -0,0 +1,2 @@ +// This file has no EvalConfig export — should cause loadTsEvalFile to throw. +export const greeting = 'hello'; diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts index 1285d91b5..deb73ae68 100644 --- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts +++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts @@ -27,6 +27,15 @@ describe('detectFormat', () => { expect(detectFormat('/path/to/evals.json')).toBe('agent-skills-json'); }); + it('returns typescript for .ts extension', () => { + expect(detectFormat('greeting.eval.ts')).toBe('typescript'); + expect(detectFormat('/path/to/eval.ts')).toBe('typescript'); + }); + + it('returns typescript for .mts extension', () => { + expect(detectFormat('greeting.eval.mts')).toBe('typescript'); + }); + it('throws for unsupported extensions', () => { expect(() => detectFormat('test.txt')).toThrow('Unsupported file format'); expect(() => detectFormat('test')).toThrow('Unsupported file format'); diff --git a/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts new file mode 100644 index 000000000..0322cd495 --- /dev/null +++ b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from 'bun:test'; +import path from 'node:path'; + +import { loadTsEvalFile } from '../../../src/evaluation/loaders/ts-eval-loader.js'; + +const fixtureDir = path.join(import.meta.dir, 'fixtures'); + +describe('loadTsEvalFile', () => { + it('loads default export', async () => { + const result = await loadTsEvalFile(path.join(fixtureDir, 'default-export.eval.ts')); + expect(result.config).toBeDefined(); + expect(result.config.tests).toHaveLength(1); + expect(result.config.tests?.[0].id).toBe('greeting'); + }); + + it('loads named "config" export', async () => { + const result = await loadTsEvalFile(path.join(fixtureDir, 'named-config.eval.ts')); + expect(result.config).toBeDefined(); + expect(result.config.tests?.[0].id).toBe('named-config'); + }); + + it('loads named "evalConfig" export', async () => { + const result = await loadTsEvalFile(path.join(fixtureDir, 'eval-config-named.eval.ts')); + expect(result.config).toBeDefined(); + expect(result.config.tests?.[0].id).toBe('eval-config-named'); + }); + + it('throws when no EvalConfig export found', async () => { + await expect(loadTsEvalFile(path.join(fixtureDir, 'no-config.eval.ts'))).rejects.toThrow( + 'no EvalConfig export found', + ); + }); + + it('returns absolute file path', async () => { + const result = await loadTsEvalFile(path.join(fixtureDir, 'default-export.eval.ts')); + expect(path.isAbsolute(result.filePath)).toBe(true); + expect(result.filePath).toContain('default-export.eval.ts'); + }); +}); From 7864abe2f94a180e81596bdf66507b9a43caf137 Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 16 Apr 2026 03:27:27 +0000 Subject: [PATCH 2/2] refactor(cli): unify TypeScript eval loading with suite pipeline (#1116) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/eval/run-eval.ts | 123 ++++---- apps/cli/src/commands/eval/shared.ts | 10 +- apps/cli/test/commands/eval/shared.test.ts | 14 +- packages/core/src/evaluation/evaluate.ts | 284 ++++++++++++------ .../src/evaluation/loaders/ts-eval-loader.ts | 48 ++- packages/core/src/evaluation/yaml-parser.ts | 13 + .../evaluate-programmatic-api.test.ts | 12 + .../loaders/fixtures/default-export.eval.ts | 10 +- .../evaluation/loaders/ts-eval-loader.test.ts | 25 ++ 9 files changed, 384 insertions(+), 155 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 5e84d40df..a8d0a73e3 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -17,11 +17,10 @@ import { runEvaluation as defaultRunEvaluation, deriveCategory, ensureVSCodeSubagents, - evaluate, loadConfig, loadTestSuite, loadTsConfig, - loadTsEvalFile, + resolveTargetDefinition, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToCodexLogEntries, @@ -533,6 +532,9 @@ async function prepareFileMetadata(params: { readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; + readonly providerFactory?: ( + target: import('@agentv/core').ResolvedTarget, + ) => import('@agentv/core').Provider; }> { const { testFilePath, repoRoot, cwd, options } = params; @@ -576,6 +578,54 @@ async function prepareFileMetadata(params: { inlineTargetLabel: `transcript (${path.basename(options.transcript)})`, }, ]; + } else if (suite.inlineTarget && options.cliTargets.length === 0) { + const targetDefinition = suite.inlineTarget; + const resolvedTarget = options.dryRun + ? ({ + kind: 'mock', + name: `${targetDefinition.name}-dry-run`, + graderTarget: undefined, + config: { + response: '{"answer":"Mock dry-run response"}', + delayMs: options.dryRunDelay, + delayMinMs: options.dryRunDelayMin, + delayMaxMs: options.dryRunDelayMax, + }, + } satisfies ResolvedTarget) + : resolveTargetDefinition(targetDefinition, process.env, testFilePath, { + emitDeprecationWarnings: false, + }); + selections = [ + { + selection: { + definitions: [targetDefinition], + resolvedTarget, + targetName: targetDefinition.name, + targetSource: 'test-file', + targetsFilePath: testFilePath, + }, + inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name), + }, + ]; + } else if (suite.providerFactory && options.cliTargets.length === 0) { + const taskTarget: ResolvedTarget = { + kind: 'mock', + name: 'custom-task', + graderTarget: undefined, + config: {}, + }; + selections = [ + { + selection: { + definitions: [], + resolvedTarget: taskTarget, + targetName: 'custom-task', + targetSource: 'test-file', + targetsFilePath: testFilePath, + }, + inlineTargetLabel: 'custom-task', + }, + ]; } else { // Determine target names: CLI --target flags override YAML const cliTargets = options.cliTargets; @@ -660,6 +710,7 @@ async function prepareFileMetadata(params: { failOnError: suite.failOnError, threshold: suite.threshold, tags: suite.metadata?.tags, + providerFactory: suite.providerFactory, }; } @@ -1172,21 +1223,12 @@ export async function runEvalCommand( readonly failOnError?: FailOnError; readonly threshold?: number; readonly tags?: readonly string[]; + readonly providerFactory?: ( + target: import('@agentv/core').ResolvedTarget, + ) => import('@agentv/core').Provider; } >(); - // Separate TypeScript eval files from YAML/JSONL files. - // TS files export an EvalConfig and run through evaluate(). - const tsFiles: string[] = []; - const yamlFiles: string[] = []; for (const testFilePath of resolvedTestFiles) { - if (/\.(ts|mts)$/.test(testFilePath)) { - tsFiles.push(testFilePath); - } else { - yamlFiles.push(testFilePath); - } - } - - for (const testFilePath of yamlFiles) { const meta = await prepareFileMetadata({ testFilePath, repoRoot, @@ -1277,7 +1319,7 @@ export async function runEvalCommand( } } - if (totalEvalCount === 0 && tsFiles.length === 0) { + if (totalEvalCount === 0) { // When using --retry-errors, all tests being filtered means no errors or missing cases remain if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) { console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); @@ -1345,7 +1387,7 @@ export async function runEvalCommand( } } - // Use only files that survived tag filtering (fileMetadata keys) — TS files are processed separately above + // Use only files that survived tag filtering. const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); // --transcript: create a shared TranscriptProvider and validate entry count @@ -1377,53 +1419,6 @@ export async function runEvalCommand( // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file // workspace races without any grouping complexity. try { - // Process TypeScript eval files through evaluate() with CLI overrides. - // Results flow through the same output/artifact pipeline as YAML evals. - // Note: TS eval files don't carry tags; they're skipped when --tag/--exclude-tag is active. - const tsFilesToRun = hasTagFilters - ? (() => { - if (tsFiles.length > 0 && options.verbose) { - console.log( - `Skipped ${tsFiles.length} TS eval file(s) — tag filters don't apply to *.eval.ts files.`, - ); - } - return [] as string[]; - })() - : tsFiles; - - for (const tsFile of tsFilesToRun) { - await ensureFileExists(tsFile, 'TypeScript eval file'); - const { config: tsConfig } = await loadTsEvalFile(tsFile); - - const cliOverrides: Record = {}; - if (options.workers !== undefined) cliOverrides.workers = options.workers; - if (options.filter) cliOverrides.filter = options.filter; - if (resolvedThreshold !== undefined) cliOverrides.threshold = resolvedThreshold; - if (options.cache !== undefined) cliOverrides.cache = options.cache; - if (options.verbose !== undefined) cliOverrides.verbose = options.verbose; - if (options.maxRetries !== 2) cliOverrides.maxRetries = options.maxRetries; - if (options.agentTimeoutSeconds !== undefined) { - cliOverrides.agentTimeoutMs = options.agentTimeoutSeconds * 1000; - } - - console.log(`Running TS eval: ${path.relative(cwd, tsFile)}`); - - const evalResult = await evaluate({ - ...tsConfig, - ...cliOverrides, - onResult: (result: EvaluationResult) => { - outputWriter.append(result); - tsConfig.onResult?.(result); - }, - }); - - allResults.push(...evalResult.results); - remoteEvalSummaries.push({ - evalFile: path.relative(cwd, tsFile), - results: [...evalResult.results], - }); - } - for (const testFilePath of activeTestFiles) { const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { @@ -1479,7 +1474,7 @@ export async function runEvalCommand( budgetUsd: targetPrep.budgetUsd, failOnError: targetPrep.failOnError, threshold: resolvedThreshold, - providerFactory: transcriptProviderFactory, + providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, }); const evalFile = path.relative(cwd, testFilePath); const existingSummary = remoteEvalSummaries.find( diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts index 7570a2e92..3e1c7fc3d 100644 --- a/apps/cli/src/commands/eval/shared.ts +++ b/apps/cli/src/commands/eval/shared.ts @@ -34,7 +34,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis : path.resolve(cwd, pattern); try { const stats = await stat(candidatePath); - if (stats.isFile() && /\.(ya?ml|jsonl|json|ts)$/i.test(candidatePath)) { + if (stats.isFile() && /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(candidatePath)) { results.add(candidatePath); continue; } @@ -42,7 +42,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis // Auto-expand directory to recursive eval file glob const dirGlob = path.posix.join( candidatePath.replace(/\\/g, '/'), - '**/*.eval.{yaml,yml,ts}', + '**/{*.eval.yaml,*.eval.yml,eval.yaml,eval.yml,*.eval.ts,*.eval.mts}', ); const dirMatches = await fg(dirGlob, { absolute: true, @@ -72,7 +72,9 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis ignore: ignorePatterns, }); - const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json|ts)$/i.test(filePath)); + const yamlMatches = matches.filter((filePath) => + /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(filePath), + ); for (const filePath of yamlMatches) { results.add(path.normalize(filePath)); } @@ -97,7 +99,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis throw new Error( `No eval files matched any provided paths or globs: ${includePatterns.join( ', ', - )}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.eval.ts").`, + )}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/eval.yaml", "evals/**/*.eval.ts").`, ); } diff --git a/apps/cli/test/commands/eval/shared.test.ts b/apps/cli/test/commands/eval/shared.test.ts index ed0e5e494..52a20ce4a 100644 --- a/apps/cli/test/commands/eval/shared.test.ts +++ b/apps/cli/test/commands/eval/shared.test.ts @@ -77,6 +77,15 @@ describe('resolveEvalPaths', () => { expect(resolved).toEqual([path.normalize(tsFile)]); }); + it('accepts a direct .mts file path', async () => { + const tsFile = path.join(tempDir, 'custom.eval.mts'); + writeFileSync(tsFile, 'export default { tests: [] }'); + + const resolved = await resolveEvalPaths([tsFile], tempDir); + + expect(resolved).toEqual([path.normalize(tsFile)]); + }); + it('accepts a direct .ts file path', async () => { const tsFile = path.join(tempDir, 'custom.eval.ts'); writeFileSync(tsFile, 'export default { tests: [] }'); @@ -91,14 +100,17 @@ describe('resolveEvalPaths', () => { mkdirSync(evalDir, { recursive: true }); const yamlFile = path.join(evalDir, 'suite.eval.yaml'); + const evalYamlFile = path.join(evalDir, 'eval.yaml'); const tsFile = path.join(evalDir, 'suite.eval.ts'); writeFileSync(yamlFile, 'tests:\n - id: sample\n input: test\n'); + writeFileSync(evalYamlFile, 'tests:\n - id: sample2\n input: test\n'); writeFileSync(tsFile, 'export default { tests: [] }'); const resolved = await resolveEvalPaths([tempDir], tempDir); expect(resolved).toContain(path.normalize(yamlFile)); + expect(resolved).toContain(path.normalize(evalYamlFile)); expect(resolved).toContain(path.normalize(tsFile)); - expect(resolved).toHaveLength(2); + expect(resolved).toHaveLength(3); }); }); diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index ff59670d3..328930b77 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -58,12 +58,15 @@ import { existsSync } from 'node:fs'; import path from 'node:path'; +import micromatch from 'micromatch'; import { buildDirectoryChain, findGitRoot } from './file-utils.js'; import type { AssertFn } from './assertions.js'; import { DEFAULT_THRESHOLD } from './graders/scoring.js'; +import type { EvalMetadata } from './metadata.js'; import { runEvaluation } from './orchestrator.js'; import { createFunctionProvider } from './providers/function-provider.js'; +import type { ProviderFactoryFn } from './providers/provider-registry.js'; import { readTargetDefinitions } from './providers/targets-file.js'; import { type ResolvedTarget, resolveTargetDefinition } from './providers/targets.js'; import type { TargetDefinition } from './providers/types.js'; @@ -77,7 +80,7 @@ import type { InlineAssertEvaluatorConfig, WorkspaceHookConfig, } from './types.js'; -import { loadTests } from './yaml-parser.js'; +import { loadTestSuite } from './yaml-parser.js'; /** * Inline test definition for the programmatic API. @@ -170,6 +173,8 @@ export interface EvalConfig { readonly task?: (input: string) => string | Promise; /** Suite-level assertions applied to all tests */ readonly assert?: readonly AssertEntry[]; + /** Optional suite metadata used by CLI discovery, tagging, and reporting. */ + readonly metadata?: EvalMetadata; /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */ readonly filter?: string | readonly string[]; /** Maximum concurrent workers (default: 3) */ @@ -192,6 +197,19 @@ export interface EvalConfig { readonly budgetUsd?: number; } +export interface MaterializedEvalConfig { + readonly testFilePath: string; + readonly tests: readonly EvalTest[]; + readonly workers?: number; + readonly cache?: boolean; + readonly budgetUsd?: number; + readonly threshold?: number; + readonly metadata?: EvalMetadata; + readonly target?: TargetDefinition; + readonly task?: (input: string) => string | Promise; + readonly providerFactory?: ProviderFactoryFn; +} + /** * Summary statistics for an evaluation run. */ @@ -269,19 +287,22 @@ export async function evaluate(config: EvalConfig): Promise { const gitRoot = await findGitRoot(process.cwd()); const repoRoot = gitRoot ?? process.cwd(); - const testFilePath = config.specFile - ? path.resolve(config.specFile) - : path.join(process.cwd(), '__programmatic__.yaml'); + const materialized = await materializeEvalConfig(config, { + repoRoot, + baseDir: process.cwd(), + }); + const testFilePath = materialized.testFilePath; // Load .env files from the eval file hierarchy so nested eval-local .env // files participate even when the command is launched from a parent folder. await loadEnvHierarchy(repoRoot, testFilePath); let resolvedTarget: ResolvedTarget; - let taskProvider: ReturnType | undefined; - if (config.task) { - // Wrap task function as a Provider - taskProvider = createFunctionProvider(config.task); + let providerFactory: ProviderFactoryFn | undefined; + if (config.task || materialized.providerFactory) { + providerFactory = config.task + ? () => createFunctionProvider(config.task as (input: string) => string | Promise) + : materialized.providerFactory; resolvedTarget = { kind: 'mock', name: 'custom-task', @@ -292,100 +313,29 @@ export async function evaluate(config: EvalConfig): Promise { let targetDef: TargetDefinition; if (config.target) { targetDef = config.target; + } else if (materialized.target) { + targetDef = materialized.target; } else { targetDef = (await discoverDefaultTarget(repoRoot)) ?? { name: 'default', provider: 'mock' }; } resolvedTarget = resolveTargetDefinition(targetDef); } - let evalCases: readonly EvalTest[] | EvalTest[]; - - if (config.specFile) { - // File-based mode: load from YAML - evalCases = await loadTests(testFilePath, repoRoot, { - verbose: config.verbose, - filter: config.filter, - }); - } else { - // Build workspace config with before_all hook if beforeAll is provided - const suiteWorkspace = config.beforeAll - ? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } } - : undefined; - - // Inline mode: convert EvalTestInput[] to EvalTest[] - evalCases = (config.tests ?? []).map((test): EvalTest => { - // Conversation mode: use turns[] for input/question derivation - const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0); - - if (!isConversation && !test.input) { - throw new Error(`Test '${test.id}': input is required for non-conversation tests`); - } - - const input = isConversation - ? toMessageArray(test.turns?.[0]?.input ?? '') - : toMessageArray(test.input ?? ''); - - const question = isConversation - ? extractQuestion(test.turns?.[0]?.input ?? '') - : extractQuestion(test.input ?? ''); - - const expectedOutputValue = test.expectedOutput ?? test.expected_output; - const expectedOutput = expectedOutputValue - ? ([ - { role: 'assistant' as const, content: expectedOutputValue }, - ] as EvalTest['expected_output']) - : []; - - // Convert inline assertions to evaluator config format - const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])]; - const assertConfigs = convertAssertions(allAssertions); - - // Convert conversation turns if present — keep input/expected_output as - // TestMessageContent (matching YAML parser behavior), not wrapped in message arrays. - const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => { - const turnExpected = turn.expectedOutput ?? turn.expected_output; - return { - input: turn.input as ConversationTurn['input'], - ...(turnExpected !== undefined && { - expected_output: turnExpected as ConversationTurn['expected_output'], - }), - assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined, - }; - }); - - return { - id: test.id, - criteria: test.criteria ?? '', - question: String(question), - input, - expected_output: expectedOutput, - reference_answer: expectedOutputValue, - file_paths: [], - assertions: assertConfigs.length > 0 ? assertConfigs : undefined, - metadata: test.metadata, - ...(suiteWorkspace && { workspace: suiteWorkspace }), - ...(isConversation && { mode: 'conversation' as const }), - ...(turns && { turns }), - ...(test.aggregation && { aggregation: test.aggregation }), - }; - }); - } - const collectedResults: EvaluationResult[] = []; const results = await runEvaluation({ testFilePath, repoRoot, target: resolvedTarget, - ...(taskProvider ? { providerFactory: () => taskProvider } : {}), + ...(providerFactory ? { providerFactory } : {}), maxRetries: config.maxRetries ?? 2, agentTimeoutMs: config.agentTimeoutMs, verbose: config.verbose, maxConcurrency: config.workers ?? 3, filter: config.filter, threshold: config.threshold, - evalCases, - ...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }), + evalCases: materialized.tests, + ...(materialized.budgetUsd !== undefined && { budgetUsd: materialized.budgetUsd }), onResult: async (result) => { collectedResults.push(result); config.onResult?.(result); @@ -401,6 +351,62 @@ export async function evaluate(config: EvalConfig): Promise { }; } +export async function materializeEvalConfig( + config: EvalConfig, + options?: { + readonly repoRoot?: string; + readonly baseDir?: string; + readonly filter?: string | readonly string[]; + readonly category?: string; + }, +): Promise { + const baseDir = options?.baseDir ?? process.cwd(); + const repoRoot = options?.repoRoot ?? (await findGitRoot(baseDir)) ?? baseDir; + const testFilePath = config.specFile + ? path.resolve(baseDir, config.specFile) + : path.join(baseDir, '__programmatic__.yaml'); + const effectiveFilter = options?.filter ?? config.filter; + + if (config.specFile) { + const suite = await loadTestSuite(testFilePath, repoRoot, { + verbose: config.verbose, + filter: effectiveFilter, + category: options?.category, + }); + const tests = applyProgrammaticSuiteOverrides(suite.tests, config); + return { + testFilePath, + tests, + workers: config.workers ?? suite.workers, + cache: config.cache ?? suite.cacheConfig?.enabled, + budgetUsd: config.budgetUsd ?? suite.budgetUsd, + threshold: config.threshold ?? suite.threshold, + metadata: config.metadata ?? suite.metadata, + target: config.target ?? suite.inlineTarget, + task: config.task, + providerFactory: suite.providerFactory, + }; + } + + const tests = buildInlineEvalTests(config, { + filter: effectiveFilter, + category: options?.category, + testFilePath, + }); + + return { + testFilePath, + tests, + workers: config.workers, + cache: config.cache, + budgetUsd: config.budgetUsd, + threshold: config.threshold, + metadata: config.metadata, + target: config.target, + task: config.task, + }; +} + /** * Convert a flexible input (string or message array) to the internal TestMessage[] format. */ @@ -454,6 +460,116 @@ function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] { }); } +function buildInlineEvalTests( + config: EvalConfig, + options: { + readonly filter?: string | readonly string[]; + readonly category?: string; + readonly testFilePath: string; + }, +): readonly EvalTest[] { + const suiteWorkspace = config.beforeAll + ? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } } + : undefined; + const derivedSuiteName = path + .basename(options.testFilePath) + .replace(/\.eval\.[cm]?ts$/i, '') + .replace(/\.[cm]?ts$/i, ''); + const suiteName = config.metadata?.name ?? (derivedSuiteName || 'eval'); + + return (config.tests ?? []) + .filter((test) => !options.filter || matchesFilter(test.id, options.filter)) + .map((test): EvalTest => { + const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0); + + if (!isConversation && !test.input) { + throw new Error(`Test '${test.id}': input is required for non-conversation tests`); + } + + const input = isConversation + ? toMessageArray(test.turns?.[0]?.input ?? '') + : toMessageArray(test.input ?? ''); + + const question = isConversation + ? extractQuestion(test.turns?.[0]?.input ?? '') + : extractQuestion(test.input ?? ''); + + const expectedOutputValue = test.expectedOutput ?? test.expected_output; + const expectedOutput = expectedOutputValue + ? ([ + { role: 'assistant' as const, content: expectedOutputValue }, + ] as EvalTest['expected_output']) + : []; + + const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])]; + const assertConfigs = convertAssertions(allAssertions); + const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => { + const turnExpected = turn.expectedOutput ?? turn.expected_output; + return { + input: turn.input as ConversationTurn['input'], + ...(turnExpected !== undefined && { + expected_output: turnExpected as ConversationTurn['expected_output'], + }), + assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined, + }; + }); + + return { + id: test.id, + suite: suiteName, + category: options.category, + criteria: test.criteria ?? '', + question: String(question), + input, + expected_output: expectedOutput, + reference_answer: expectedOutputValue, + file_paths: [], + assertions: assertConfigs.length > 0 ? assertConfigs : undefined, + metadata: test.metadata, + ...(suiteWorkspace && { workspace: suiteWorkspace }), + ...(isConversation && { mode: 'conversation' as const }), + ...(turns && { turns }), + ...(test.aggregation && { aggregation: test.aggregation }), + }; + }); +} + +function applyProgrammaticSuiteOverrides( + tests: readonly EvalTest[], + config: EvalConfig, +): readonly EvalTest[] { + if (!config.beforeAll && (!config.assert || config.assert.length === 0)) { + return tests; + } + + const suiteWorkspace = config.beforeAll + ? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } } + : undefined; + const suiteAssertions = config.assert ? convertAssertions(config.assert) : []; + + return tests.map((test) => ({ + ...test, + ...(suiteAssertions.length > 0 && { + assertions: [...(test.assertions ?? []), ...suiteAssertions], + }), + ...(suiteWorkspace && { + workspace: { + ...test.workspace, + hooks: { + ...test.workspace?.hooks, + ...(test.workspace?.hooks?.before_all ? {} : suiteWorkspace.hooks), + }, + }, + }), + })); +} + +function matchesFilter(id: string, filter: string | readonly string[]): boolean { + return typeof filter === 'string' + ? micromatch.isMatch(id, filter) + : filter.some((pattern) => micromatch.isMatch(id, pattern)); +} + /** * Map user-facing assertion type names to internal grader type names. * Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader'). diff --git a/packages/core/src/evaluation/loaders/ts-eval-loader.ts b/packages/core/src/evaluation/loaders/ts-eval-loader.ts index eb4946e3b..d406b90ef 100644 --- a/packages/core/src/evaluation/loaders/ts-eval-loader.ts +++ b/packages/core/src/evaluation/loaders/ts-eval-loader.ts @@ -11,7 +11,11 @@ */ import path from 'node:path'; import { pathToFileURL } from 'node:url'; -import type { EvalConfig } from '../evaluate.js'; +import { type EvalConfig, materializeEvalConfig } from '../evaluate.js'; +import { createFunctionProvider } from '../providers/function-provider.js'; +import type { ProviderFactoryFn } from '../providers/provider-registry.js'; +import type { TargetDefinition } from '../providers/types.js'; +import type { EvalSuiteResult } from '../yaml-parser.js'; const EXPORT_NAMES = ['default', 'config', 'evalConfig'] as const; @@ -20,6 +24,11 @@ export interface TsEvalResult { readonly filePath: string; } +export interface TsEvalSuiteResult extends EvalSuiteResult { + readonly inlineTarget?: TargetDefinition; + readonly providerFactory?: ProviderFactoryFn; +} + /** * Import a *.eval.ts file and extract the EvalConfig export. * Tries default, `config`, and `evalConfig` named exports in priority order. @@ -47,6 +56,43 @@ export async function loadTsEvalFile(filePath: string): Promise { return { config, filePath: absolutePath }; } +export async function loadTsEvalSuite( + filePath: string, + repoRoot: string, + options?: { + readonly verbose?: boolean; + readonly filter?: string | readonly string[]; + readonly category?: string; + }, +): Promise { + const { config, filePath: absolutePath } = await loadTsEvalFile(filePath); + const materialized = await materializeEvalConfig(config, { + repoRoot, + baseDir: path.dirname(absolutePath), + filter: options?.filter, + category: options?.category, + }); + + return { + tests: materialized.tests, + ...(materialized.workers !== undefined && { workers: materialized.workers }), + ...(materialized.cache !== undefined && { cacheConfig: { enabled: materialized.cache } }), + ...(materialized.budgetUsd !== undefined && { budgetUsd: materialized.budgetUsd }), + ...(materialized.threshold !== undefined && { threshold: materialized.threshold }), + ...(materialized.metadata !== undefined && { metadata: materialized.metadata }), + ...(materialized.target !== undefined && { inlineTarget: materialized.target }), + ...(materialized.task !== undefined && { + providerFactory: (() => { + const task = materialized.task; + if (!task) { + throw new Error(`${filePath}: missing task function for providerFactory`); + } + return createFunctionProvider(task); + }) as ProviderFactoryFn, + }), + }; +} + /** * Duck-type check for EvalConfig-like objects. * An EvalConfig must have at least one of: tests, specFile, or target. diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 928e73d5e..4d8cf057d 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -210,6 +210,10 @@ export type EvalSuiteResult = { readonly threshold?: number; /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */ readonly workspacePath?: string; + /** Inline target definition from a TS eval config. */ + readonly inlineTarget?: import('./providers/types.js').TargetDefinition; + /** Custom provider factory from a TS eval config task(). */ + readonly providerFactory?: import('./providers/provider-registry.js').ProviderFactoryFn; }; /** @@ -228,6 +232,10 @@ export async function loadTestSuite( if (format === 'agent-skills-json') { return { tests: await loadTestsFromAgentSkills(evalFilePath) }; } + if (format === 'typescript') { + const { loadTsEvalSuite } = await import('./loaders/ts-eval-loader.js'); + return loadTsEvalSuite(evalFilePath, resolveToAbsolutePath(repoRoot), options); + } const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml( evalFilePath, repoRoot, @@ -267,6 +275,11 @@ export async function loadTests( if (format === 'agent-skills-json') { return loadTestsFromAgentSkills(evalFilePath); } + if (format === 'typescript') { + const { loadTsEvalSuite } = await import('./loaders/ts-eval-loader.js'); + const suite = await loadTsEvalSuite(evalFilePath, resolveToAbsolutePath(repoRoot), options); + return suite.tests; + } const { tests } = await loadTestsFromYaml(evalFilePath, repoRoot, options); return tests; } diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts index 9a91c9e6d..b8d32524d 100644 --- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -6,6 +6,7 @@ */ import { describe, expect, it } from 'bun:test'; +import path from 'node:path'; import { evaluate } from '../../src/evaluation/evaluate.js'; describe('evaluate() — programmatic API extensions', () => { @@ -225,6 +226,17 @@ describe('evaluate() — programmatic API extensions', () => { expect(summary.passed).toBe(1); }); + it('uses inline target from a TypeScript specFile', async () => { + const specFile = path.join(import.meta.dir, 'loaders', 'fixtures', 'default-export.eval.ts'); + + const { summary } = await evaluate({ + specFile, + }); + + expect(summary.total).toBe(1); + expect(summary.passed).toBe(1); + }); + // --------------------------------------------------------------------------- // Validation // --------------------------------------------------------------------------- diff --git a/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts b/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts index 13c449055..df6bdcafd 100644 --- a/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts +++ b/packages/core/test/evaluation/loaders/fixtures/default-export.eval.ts @@ -1,6 +1,10 @@ import type { EvalConfig } from '../../../../src/evaluation/evaluate.js'; const config: EvalConfig = { + metadata: { + name: 'default-export-suite', + tags: ['sdk', 'typescript'], + }, tests: [ { id: 'greeting', @@ -8,7 +12,11 @@ const config: EvalConfig = { assert: [{ type: 'contains', value: 'hello' }], }, ], - target: { provider: 'mock_agent' }, + workers: 2, + cache: false, + budgetUsd: 1.5, + threshold: 0.9, + target: { name: 'inline-target', provider: 'mock', response: 'hello there' }, }; export default config; diff --git a/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts index 0322cd495..8abfc74bd 100644 --- a/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts +++ b/packages/core/test/evaluation/loaders/ts-eval-loader.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from 'bun:test'; import path from 'node:path'; import { loadTsEvalFile } from '../../../src/evaluation/loaders/ts-eval-loader.js'; +import { loadTestSuite, loadTests } from '../../../src/evaluation/yaml-parser.js'; const fixtureDir = path.join(import.meta.dir, 'fixtures'); @@ -36,4 +37,28 @@ describe('loadTsEvalFile', () => { expect(path.isAbsolute(result.filePath)).toBe(true); expect(result.filePath).toContain('default-export.eval.ts'); }); + + it('materializes a TS eval through loadTestSuite', async () => { + const suite = await loadTestSuite(path.join(fixtureDir, 'default-export.eval.ts'), fixtureDir, { + category: 'sdk', + }); + expect(suite.tests).toHaveLength(1); + expect(suite.tests[0].suite).toBe('default-export-suite'); + expect(suite.tests[0].category).toBe('sdk'); + expect(suite.metadata?.tags).toEqual(['sdk', 'typescript']); + expect(suite.workers).toBe(2); + expect(suite.cacheConfig?.enabled).toBe(false); + expect(suite.budgetUsd).toBe(1.5); + expect(suite.threshold).toBe(0.9); + expect(suite.inlineTarget?.name).toBe('inline-target'); + }); + + it('routes TypeScript evals through loadTests', async () => { + const tests = await loadTests(path.join(fixtureDir, 'default-export.eval.ts'), fixtureDir, { + category: 'sdk', + }); + expect(tests).toHaveLength(1); + expect(tests[0].id).toBe('greeting'); + expect(tests[0].category).toBe('sdk'); + }); });