Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export const evalRunCommand = command({
evalPaths: restPositionals({
type: string,
displayName: 'eval-paths',
description: 'Path(s) or glob(s) to evaluation .yaml file(s)',
description: 'Path(s) or glob(s) to evaluation files (.yaml, .eval.ts)',
}),
target: multioption({
type: array(string),
Expand Down
84 changes: 58 additions & 26 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
loadConfig,
loadTestSuite,
loadTsConfig,
resolveTargetDefinition,
shouldEnableCache,
shouldSkipCacheForTemperature,
subscribeToCodexLogEntries,
Expand Down Expand Up @@ -531,6 +532,9 @@ async function prepareFileMetadata(params: {
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
readonly providerFactory?: (
target: import('@agentv/core').ResolvedTarget,
) => import('@agentv/core').Provider;
}> {
const { testFilePath, repoRoot, cwd, options } = params;

Expand Down Expand Up @@ -574,6 +578,54 @@ async function prepareFileMetadata(params: {
inlineTargetLabel: `transcript (${path.basename(options.transcript)})`,
},
];
} else if (suite.inlineTarget && options.cliTargets.length === 0) {
const targetDefinition = suite.inlineTarget;
const resolvedTarget = options.dryRun
? ({
kind: 'mock',
name: `${targetDefinition.name}-dry-run`,
graderTarget: undefined,
config: {
response: '{"answer":"Mock dry-run response"}',
delayMs: options.dryRunDelay,
delayMinMs: options.dryRunDelayMin,
delayMaxMs: options.dryRunDelayMax,
},
} satisfies ResolvedTarget)
: resolveTargetDefinition(targetDefinition, process.env, testFilePath, {
emitDeprecationWarnings: false,
});
selections = [
{
selection: {
definitions: [targetDefinition],
resolvedTarget,
targetName: targetDefinition.name,
targetSource: 'test-file',
targetsFilePath: testFilePath,
},
inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name),
},
];
} else if (suite.providerFactory && options.cliTargets.length === 0) {
const taskTarget: ResolvedTarget = {
kind: 'mock',
name: 'custom-task',
graderTarget: undefined,
config: {},
};
selections = [
{
selection: {
definitions: [],
resolvedTarget: taskTarget,
targetName: 'custom-task',
targetSource: 'test-file',
targetsFilePath: testFilePath,
},
inlineTargetLabel: 'custom-task',
},
];
} else {
// Determine target names: CLI --target flags override YAML
const cliTargets = options.cliTargets;
Expand Down Expand Up @@ -658,6 +710,7 @@ async function prepareFileMetadata(params: {
failOnError: suite.failOnError,
threshold: suite.threshold,
tags: suite.metadata?.tags,
providerFactory: suite.providerFactory,
};
}

Expand Down Expand Up @@ -1170,33 +1223,12 @@ export async function runEvalCommand(
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
readonly providerFactory?: (
target: import('@agentv/core').ResolvedTarget,
) => import('@agentv/core').Provider;
}
>();
// Separate TypeScript/JS eval files from YAML files.
// TS files are self-contained scripts that call evaluate() directly.
const tsFiles: string[] = [];
const yamlFiles: string[] = [];
for (const testFilePath of resolvedTestFiles) {
if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
tsFiles.push(testFilePath);
} else {
yamlFiles.push(testFilePath);
}
}

// Run TypeScript eval files by importing them.
// evaluate() runs during import via top-level await and handles its own output.
for (const tsFile of tsFiles) {
await ensureFileExists(tsFile, 'TypeScript eval file');
await import(pathToFileURL(tsFile).href);
}

// If only TS files were provided, we're done — evaluate() handled everything.
if (yamlFiles.length === 0 && tsFiles.length > 0) {
return;
}

for (const testFilePath of yamlFiles) {
const meta = await prepareFileMetadata({
testFilePath,
repoRoot,
Expand Down Expand Up @@ -1355,7 +1387,7 @@ export async function runEvalCommand(
}
}

// Use only files that survived tag filtering (fileMetadata keys)
// Use only files that survived tag filtering.
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));

// --transcript: create a shared TranscriptProvider and validate entry count
Expand Down Expand Up @@ -1442,7 +1474,7 @@ export async function runEvalCommand(
budgetUsd: targetPrep.budgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
});
const evalFile = path.relative(cwd, testFilePath);
const existingSummary = remoteEvalSummaries.find(
Expand Down
13 changes: 9 additions & 4 deletions apps/cli/src/commands/eval/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
: path.resolve(cwd, pattern);
try {
const stats = await stat(candidatePath);
if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) {
if (stats.isFile() && /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(candidatePath)) {
results.add(candidatePath);
continue;
}
if (stats.isDirectory()) {
// Auto-expand directory to recursive eval file glob
const dirGlob = path.posix.join(candidatePath.replace(/\\/g, '/'), '**/*.eval.{yaml,yml}');
const dirGlob = path.posix.join(
candidatePath.replace(/\\/g, '/'),
'**/{*.eval.yaml,*.eval.yml,eval.yaml,eval.yml,*.eval.ts,*.eval.mts}',
);
const dirMatches = await fg(dirGlob, {
absolute: true,
onlyFiles: true,
Expand Down Expand Up @@ -69,7 +72,9 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
ignore: ignorePatterns,
});

const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
const yamlMatches = matches.filter((filePath) =>
/\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(filePath),
);
for (const filePath of yamlMatches) {
results.add(path.normalize(filePath));
}
Expand All @@ -94,7 +99,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
throw new Error(
`No eval files matched any provided paths or globs: ${includePatterns.join(
', ',
)}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`,
)}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/eval.yaml", "evals/**/*.eval.ts").`,
);
}

Expand Down
49 changes: 49 additions & 0 deletions apps/cli/test/commands/eval/shared.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,53 @@ describe('resolveEvalPaths', () => {
resolveEvalPaths(['evals/**/*.eval.yaml', 'evals/**/eval.yaml'], tempDir),
).rejects.toThrow('No eval files matched any provided paths or globs');
});

// Passing a directory (rather than a file or glob) must pick up a
// TypeScript eval file that lives in a nested sub-folder.
it('discovers *.eval.ts files from directory auto-expansion', async () => {
  const nestedDir = path.join(tempDir, 'evals');
  const expectedFile = path.join(nestedDir, 'greeting.eval.ts');

  // Arrange: one *.eval.ts file below the directory handed to the resolver.
  mkdirSync(nestedDir, { recursive: true });
  writeFileSync(expectedFile, 'export default { tests: [] }');

  // Act: the directory itself is the only input path.
  const discovered = await resolveEvalPaths([tempDir], tempDir);

  // Assert: exactly that one file, in normalized form.
  expect(discovered).toEqual([path.normalize(expectedFile)]);
});

// An explicit path to a .mts file is accepted as-is, without any directory
// expansion or glob matching being involved.
it('accepts a direct .mts file path', async () => {
  const mtsPath = path.join(tempDir, 'custom.eval.mts');
  writeFileSync(mtsPath, 'export default { tests: [] }');

  const inputPaths = [mtsPath];
  const discovered = await resolveEvalPaths(inputPaths, tempDir);

  const expected = [path.normalize(mtsPath)];
  expect(discovered).toEqual(expected);
});

// An explicit path to a .ts file is accepted as-is, mirroring the .mts case.
it('accepts a direct .ts file path', async () => {
  const tsPath = path.join(tempDir, 'custom.eval.ts');
  writeFileSync(tsPath, 'export default { tests: [] }');

  const inputPaths = [tsPath];
  const discovered = await resolveEvalPaths(inputPaths, tempDir);

  const expected = [path.normalize(tsPath)];
  expect(discovered).toEqual(expected);
});

// Directory expansion must return YAML suites (both `*.eval.yaml` and a bare
// `eval.yaml`) alongside TypeScript suites, and nothing else.
it('discovers both .yaml and .ts files from directory', async () => {
  const nestedDir = path.join(tempDir, 'evals');
  mkdirSync(nestedDir, { recursive: true });

  // [file name, file contents] for every fixture expected to be discovered.
  const fixtures: ReadonlyArray<readonly [string, string]> = [
    ['suite.eval.yaml', 'tests:\n - id: sample\n input: test\n'],
    ['eval.yaml', 'tests:\n - id: sample2\n input: test\n'],
    ['suite.eval.ts', 'export default { tests: [] }'],
  ];

  // Write each fixture to disk and remember the normalized path we expect back.
  const expectedPaths = fixtures.map(([fileName, contents]) => {
    const fullPath = path.join(nestedDir, fileName);
    writeFileSync(fullPath, contents);
    return path.normalize(fullPath);
  });

  const discovered = await resolveEvalPaths([tempDir], tempDir);

  // Order of the result is not asserted; only membership and total count.
  for (const expectedPath of expectedPaths) {
    expect(discovered).toContain(expectedPath);
  }
  expect(discovered).toHaveLength(3);
});
});
Loading
Loading