diff --git a/apps/cli/src/commands/import/promptfoo.test.ts b/apps/cli/src/commands/import/promptfoo.test.ts index 1e31df42..39b1e889 100644 --- a/apps/cli/src/commands/import/promptfoo.test.ts +++ b/apps/cli/src/commands/import/promptfoo.test.ts @@ -51,7 +51,8 @@ tests: expect(suite.tests[0]).toMatchObject({ id: 'capital', criteria: 'Capital answer stays deterministic', - input: 'Answer clearly: What is the capital of France?', + input: 'Answer clearly: {{question}}', + vars: { question: 'What is the capital of France?' }, assertions: [{ type: 'equals', value: 'Paris' }], metadata: { promptfoo: { @@ -95,7 +96,8 @@ tests: file://./tests.jsonl const yaml = await convertPromptfooToAgentvYaml(configPath); expect(yaml).toContain('# Converted from promptfoo config:'); expect(yaml).toContain('id: math'); - expect(yaml).toContain('input: "Please answer: What is 2 + 2?"'); + expect(yaml).toContain('input: "Please answer: {{question}}"'); + expect(yaml).toContain('vars:'); expect(yaml).toContain('type: equals'); }); @@ -129,7 +131,10 @@ tests: file://./tests.csv expect(suite.tests[0]).toMatchObject({ id: 'capital-question', criteria: 'Capital question', - input: 'Question: What is the capital of France?', + input: 'Question: {{question}}', + vars: { + question: 'What is the capital of France?', + }, assertions: [ { type: 'equals', value: 'Paris' }, { type: 'contains', value: 'Paris' }, diff --git a/apps/cli/src/commands/import/promptfoo.ts b/apps/cli/src/commands/import/promptfoo.ts index 968a733e..db7b45d2 100644 --- a/apps/cli/src/commands/import/promptfoo.ts +++ b/apps/cli/src/commands/import/promptfoo.ts @@ -74,6 +74,7 @@ interface AgentvAssertion { interface AgentvTest { readonly id: string; readonly input: AgentvInput; + readonly vars?: Record; readonly assertions?: readonly AgentvAssertion[]; readonly [key: string]: unknown; } @@ -825,7 +826,8 @@ async function buildAgentvTests(options: { } for (const prompt of promptSelection) { - const renderedInput = renderPrompt(prompt, effectiveVars, testOptions); + const importedVars = testOptions.disableVarExpansion ? undefined : effectiveVars; + const templatedInput = buildPromptTemplate(prompt, testOptions); const promptSuffix = promptSelection.length > 1 ? `--${sanitizeName(prompt.key || prompt.label)}` : ''; const metadata = buildPromptfooMetadata(rawTest, effectiveVars, prompt, effectiveTargets); @@ -838,7 +840,8 @@ async function buildAgentvTests(options: { const test: AgentvTest = { id: `${explicitId ?? baseId}${promptSuffix}`, ...(typeof rawTest.description === 'string' ? { criteria: rawTest.description } : {}), - input: renderedInput, + input: templatedInput, + ...(importedVars && Object.keys(importedVars).length > 0 ? { vars: importedVars } : {}), ...(convertedCaseAssertions.length > 0 ? { assertions: convertedCaseAssertions } : {}), ...(metadata ? { metadata } : {}), ...(execution ? { execution } : {}), @@ -970,52 +973,30 @@ function filterProviders( return matched.map((provider) => provider.targetName); } -function renderPrompt( +function buildPromptTemplate( prompt: PromptfooPrompt, - vars: Record, testOptions: PromptfooTestOptions, ): AgentvInput { const prefix = testOptions.prefix ?? ''; const suffix = testOptions.suffix ?? ''; if (typeof prompt.content === 'string') { - return `${prefix}${renderTemplate(prompt.content, vars)}${suffix}`; + return `${prefix}${preserveTemplate(prompt.content)}${suffix}`; } return prompt.content.map((message, index, allMessages) => ({ role: message.role, - content: `${index === 0 ? prefix : ''}${renderTemplate(message.content, vars)}${index === allMessages.length - 1 ? suffix : ''}`, + content: `${index === 0 ? prefix : ''}${preserveTemplate(message.content)}${index === allMessages.length - 1 ? suffix : ''}`, })); } -function renderTemplate(template: string, vars: Record) { +function preserveTemplate(template: string) { if (template.includes('{%') || template.includes('{#') || /\{\{[^}]*\|/.test(template)) { throw new Error( `Unsupported Nunjucks syntax in prompt '${template.slice(0, 80)}'. Use simple {{var}} templates or migrate manually`, ); } - - return template.replace(/\{\{\s*([^}]+?)\s*\}\}/g, (_match, expression: string) => { - const value = lookupPath(vars, expression.trim()); - if (value === undefined) { - return ''; - } - if (typeof value === 'string') return value; - return JSON.stringify(value); - }); -} - -function lookupPath( - value: JsonValue | Record, - expression: string, -): JsonValue | undefined { - if (!expression) return undefined; - return expression.split('.').reduce((current, part) => { - if (!current || typeof current !== 'object' || Array.isArray(current)) { - return undefined; - } - return (current as Record)[part]; - }, value as JsonValue); + return template; } function buildPromptfooMetadata( diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index dac72a07..c2b6e681 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -264,6 +264,34 @@ For local sources, omit `checkout.resolve`. If you need to pin the local clone t MY_REPO_LOCAL_PATH=/home/dev/repos/my-repo ``` +## Per-Test Template Variables + +Eval YAML also supports per-test `vars` for data-driven prompt templates. Use `{{name}}` placeholders in test-facing text fields, and AgentV resolves them when the suite loads. + +```yaml +input: "Answer clearly: {{question}}" + +tests: + - id: capital + vars: + question: What is the capital of France? + expected_answer: Paris + criteria: "Answers {{question}} correctly" + input: + - role: user + content: "Question: {{question}}" + expected_output: "{{expected_answer}}" +``` + +### Behavior + +- `vars` is defined per test as an object +- `{{name}}` and dotted paths like `{{ user.name }}` are supported +- Substitution applies to suite-level `input`, test `input`, `input_files`, `criteria`, `expected_output`, and conversation turn `input` / `expected_output` +- When the whole string is a single placeholder, the original JSON value is preserved +- Missing variables are left unchanged, so unrelated template syntax is not silently blanked out +- `vars` interpolation is separate from environment interpolation: `{{question}}` uses test data, `${{ PROJECT_NAME }}` uses environment variables + ## JSONL Format For large-scale evaluations, AgentV supports JSONL (JSON Lines) format. Each line is a single test: diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index 7655eadd..cece3d0f 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -46,6 +46,7 @@ Default output: `EVAL.yaml` beside the promptfoo config file. - inline tests and external YAML / JSON / JSONL / CSV test files - `defaultTest.assert` promoted to suite-level `assertions` - per-test `vars`, `description`, `threshold`, `metadata`, prompt filters, and provider filters +- simple prompt templates are preserved as AgentV `{{var}}` input templates instead of being eagerly flattened - deterministic assertions that map directly to AgentV: `equals`, `contains`, `icontains`, `regex`, `starts-with`, `ends-with`, `contains-any`, `contains-all`, `icontains-any`, `icontains-all`, `is-json`, `latency`, `cost` - rubric-style assertions mapped to `llm-grader`: `llm-rubric`, `g-eval`, `factuality`, `context-faithfulness`, `context-recall` diff --git a/packages/core/src/evaluation/interpolation.ts b/packages/core/src/evaluation/interpolation.ts index 7bd2dbc1..e8b39622 100644 --- a/packages/core/src/evaluation/interpolation.ts +++ b/packages/core/src/evaluation/interpolation.ts @@ -1,6 +1,8 @@ import type { EnvLookup } from './providers/types.js'; const ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g; +const TEMPLATE_VAR_PATTERN = /\{\{\s*([A-Za-z_][A-Za-z0-9_.]*)\s*\}\}/g; +const WHOLE_TEMPLATE_VAR_PATTERN = /^\{\{\s*([A-Za-z_][A-Za-z0-9_.]*)\s*\}\}$/; /** * Regex that matches a string consisting of exactly one `${{ VAR }}` reference @@ -29,6 +31,42 @@ function coercePrimitive(value: string): unknown { return value; } +function isPlainObject(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function cloneTemplateValue(value: unknown): unknown { + if (Array.isArray(value)) { + return value.map((item) => cloneTemplateValue(item)); + } + if (isPlainObject(value)) { + const result: Record = {}; + for (const [key, nested] of Object.entries(value)) { + result[key] = cloneTemplateValue(nested); + } + return result; + } + return value; +} + +function stringifyTemplateValue(value: unknown): string { + if (typeof value === 'string') return value; + return JSON.stringify(value); +} + +function lookupTemplateVar( + vars: Readonly>, + expression: string, +): unknown | undefined { + if (!expression) return undefined; + return expression.split('.').reduce((current, segment) => { + if (!isPlainObject(current)) { + return undefined; + } + return current[segment]; + }, vars); +} + /** * Recursively interpolate `${{ VAR }}` references in all string values. * Missing variables resolve to empty string. @@ -71,3 +109,40 @@ export function interpolateEnv(value: unknown, env: EnvLookup): unknown { } return value; } + +/** + * Recursively interpolate `{{ var }}` references in string values using per-test vars. + * Missing variables are left unchanged so unrelated template syntaxes remain intact. + * When the whole string is a single variable reference, the original JSON value is preserved. + */ +export function interpolateTemplateVars( + value: unknown, + vars: Readonly>, +): unknown { + if (typeof value === 'string') { + const wholeMatch = WHOLE_TEMPLATE_VAR_PATTERN.exec(value); + if (wholeMatch) { + const resolved = lookupTemplateVar(vars, wholeMatch[1] as string); + return resolved === undefined ? value : cloneTemplateValue(resolved); + } + + return value.replace(TEMPLATE_VAR_PATTERN, (match, expression: string) => { + const resolved = lookupTemplateVar(vars, expression); + return resolved === undefined ? match : stringifyTemplateValue(resolved); + }); + } + + if (Array.isArray(value)) { + return value.map((item) => interpolateTemplateVars(item, vars)); + } + + if (isPlainObject(value)) { + const result: Record = {}; + for (const [key, nested] of Object.entries(value)) { + result[key] = interpolateTemplateVars(nested, vars); + } + return result; + } + + return value; +} diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 66946323..81c2e59c 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -25,11 +25,13 @@ const MessageSchema = z.object({ content: MessageContentSchema, }); +const JsonObjectSchema = z.object({}).catchall(z.unknown()); + /** Input: string shorthand or message array */ const InputSchema = z.union([z.string(), z.array(MessageSchema)]); /** Expected output: string, object, or message array */ -const ExpectedOutputSchema = z.union([z.string(), z.record(z.unknown()), z.array(MessageSchema)]); +const ExpectedOutputSchema = z.union([z.string(), JsonObjectSchema, z.array(MessageSchema)]); // --------------------------------------------------------------------------- // Grader schemas (YAML input format) @@ -389,6 +391,7 @@ const ConversationTurnSchema = z.object({ const EvalTestSchema = z.object({ id: z.string().min(1), + vars: JsonObjectSchema.optional(), criteria: z.string().optional(), input: InputSchema.optional(), input_files: z.array(z.string()).optional(), diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index ba25e993..18c863cc 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -3,7 +3,7 @@ import path from 'node:path'; import micromatch from 'micromatch'; import { collectResolvedInputFilePaths } from './input-message-utils.js'; -import { interpolateEnv } from './interpolation.js'; +import { interpolateEnv, interpolateTemplateVars } from './interpolation.js'; import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js'; import { expandFileReferences, @@ -129,6 +129,7 @@ type RawTestSuite = JsonObject & { type RawEvalCase = JsonObject & { readonly id?: JsonValue; + readonly vars?: JsonValue; readonly conversation_id?: JsonValue; readonly criteria?: JsonValue; /** @deprecated Use `criteria` instead */ @@ -137,6 +138,7 @@ type RawEvalCase = JsonObject & { /** Shorthand: list of file paths to prepend as type:file content blocks in the user message. */ readonly input_files?: JsonValue; readonly expected_output?: JsonValue; + readonly evaluator?: JsonValue; readonly execution?: JsonValue; readonly evaluators?: JsonValue; readonly assertions?: JsonValue; @@ -145,6 +147,13 @@ type RawEvalCase = JsonObject & { readonly rubrics?: JsonValue; readonly workspace?: JsonValue; readonly metadata?: JsonValue; + readonly depends_on?: JsonValue; + readonly on_dependency_failure?: JsonValue; + readonly mode?: JsonValue; + readonly turns?: JsonValue; + readonly aggregation?: JsonValue; + readonly on_turn_failure?: JsonValue; + readonly window_size?: JsonValue; }; function resolveTests(suite: RawTestSuite): JsonValue | undefined { @@ -160,6 +169,59 @@ function resolveTests(suite: RawTestSuite): JsonValue | undefined { return undefined; } +function interpolateCaseField( + value: T, + vars: JsonObject | undefined, +): T { + if (!vars || value === undefined) { + return value; + } + return interpolateTemplateVars(value, vars as Record) as T; +} + +function interpolateCaseTurns( + turns: JsonValue | undefined, + vars: JsonObject | undefined, +): JsonValue | undefined { + if (!vars || !Array.isArray(turns)) { + return turns; + } + + return turns.map((rawTurn) => { + if (!isJsonObject(rawTurn)) { + return rawTurn; + } + + return { + ...rawTurn, + input: interpolateCaseField(rawTurn.input, vars), + expected_output: interpolateCaseField(rawTurn.expected_output, vars), + } satisfies JsonObject; + }); +} + +function interpolateRawEvalCase(raw: RawEvalCase, vars: JsonObject | undefined): RawEvalCase { + if (!vars) { + return raw; + } + + return { + ...raw, + ...(raw.criteria !== undefined ? { criteria: interpolateCaseField(raw.criteria, vars) } : {}), + ...(raw.expected_outcome !== undefined + ? { expected_outcome: interpolateCaseField(raw.expected_outcome, vars) } + : {}), + ...(raw.input !== undefined ? { input: interpolateCaseField(raw.input, vars) } : {}), + ...(raw.input_files !== undefined + ? { input_files: interpolateCaseField(raw.input_files, vars) } + : {}), + ...(raw.expected_output !== undefined + ? { expected_output: interpolateCaseField(raw.expected_output, vars) } + : {}), + ...(raw.turns !== undefined ? { turns: interpolateCaseTurns(raw.turns, vars) } : {}), + }; +} + /** * Read metadata from a test suite file (like target name). * This is a convenience function for CLI tools that need metadata without loading all tests. @@ -366,11 +428,8 @@ async function loadTestsFromYaml( // Merged into each case's `metadata.governance` via mergeSuiteMetadataPayload. const suiteGovernance = extractSuiteGovernance(suite); - // Resolve suite-level input (prepended to each test's input messages) - const suiteInputMessages = expandInputShorthand(suite.input); - - // Suite-level input_files: passed to resolveInputMessages for each test - const suiteInputFiles = suite.input_files; + const rawSuiteInput = suite.input; + const rawSuiteInputFiles = suite.input_files; // Extract global target from execution.target (or legacy root-level target) const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : undefined; @@ -403,21 +462,22 @@ async function loadTestsFromYaml( continue; } - const conversationId = asString(testCaseConfig.conversation_id); - let outcome = asString(testCaseConfig.criteria); - if (!outcome && testCaseConfig.expected_outcome !== undefined) { - outcome = asString(testCaseConfig.expected_outcome); + const caseVars = isJsonObject(testCaseConfig.vars) ? testCaseConfig.vars : undefined; + const renderedCase = interpolateRawEvalCase(testCaseConfig, caseVars); + + const conversationId = asString(renderedCase.conversation_id); + let outcome = asString(renderedCase.criteria); + if (!outcome && renderedCase.expected_outcome !== undefined) { + outcome = asString(renderedCase.expected_outcome); if (outcome) { logWarning( - `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, ); } } // Extract per-case execution config early (reused below for skip_defaults) - const caseExecution = isJsonObject(testCaseConfig.execution) - ? testCaseConfig.execution - : undefined; + const caseExecution = isJsonObject(renderedCase.execution) ? renderedCase.execution : undefined; const skipDefaults = caseExecution?.skip_defaults === true; const caseThreshold = typeof caseExecution?.threshold === 'number' && @@ -427,18 +487,21 @@ async function loadTestsFromYaml( : undefined; // Resolve input with shorthand support (pass suite-level input_files for merge) - const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : undefined; - const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles); + const effectiveSuiteInputFiles = + rawSuiteInputFiles && !skipDefaults + ? interpolateCaseField(rawSuiteInputFiles, caseVars) + : undefined; + const testInputMessages = resolveInputMessages(renderedCase, effectiveSuiteInputFiles); // Resolve expected_output with shorthand support - const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? []; + const expectedMessages = resolveExpectedMessages(renderedCase) ?? []; // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || - testCaseConfig.assertions !== undefined || - testCaseConfig.assert !== undefined || - (Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0); + renderedCase.assertions !== undefined || + renderedCase.assert !== undefined || + (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0); if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { logError( `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`, @@ -447,8 +510,9 @@ async function loadTestsFromYaml( } // Prepend suite-level input to test input (respecting skip_defaults) - const effectiveSuiteInputMessages = - suiteInputMessages && !skipDefaults ? suiteInputMessages : undefined; + const effectiveSuiteInputValue = + rawSuiteInput && !skipDefaults ? interpolateCaseField(rawSuiteInput, caseVars) : undefined; + const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); // expected_output is optional - for outcome-only evaluation const hasExpectedMessages = expectedMessages.length > 0; @@ -513,11 +577,11 @@ async function loadTestsFromYaml( .filter((part) => part.length > 0) .join(' '); - const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator; + const testCaseEvaluatorKind = coerceEvaluator(renderedCase.evaluator, id) ?? globalEvaluator; let evaluators: Awaited>; try { evaluators = await parseGraders( - testCaseConfig, + renderedCase, globalExecution, searchRoots, id ?? 'unknown', @@ -531,7 +595,7 @@ async function loadTestsFromYaml( } // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) - const inlineRubrics = testCaseConfig.rubrics; + const inlineRubrics = renderedCase.rubrics; if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { const rubricEvaluator = parseInlineRubrics(inlineRubrics); if (rubricEvaluator) { @@ -545,28 +609,28 @@ async function loadTestsFromYaml( const userFilePaths = collectResolvedInputFilePaths(inputMessages); // Parse per-case workspace config and merge with suite-level - const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir); + const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir); const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace); // Parse per-case metadata, then merge suite-level metadata payload. // Arrays concatenate (suite-first, deduplicated), scalars on the case win. - const rawCaseMetadata = isJsonObject(testCaseConfig.metadata) - ? (testCaseConfig.metadata as Record) + const rawCaseMetadata = isJsonObject(renderedCase.metadata) + ? (renderedCase.metadata as Record) : undefined; const suitePayload = suiteGovernance !== undefined ? { governance: suiteGovernance } : undefined; const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload); // Extract per-test targets override (matrix evaluation) - const caseTargets = extractTargetsFromTestCase(testCaseConfig as JsonObject); + const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject); // Extract dependency fields - const dependsOn = Array.isArray(testCaseConfig.depends_on) - ? (testCaseConfig.depends_on as readonly string[]).filter( + const dependsOn = Array.isArray(renderedCase.depends_on) + ? (renderedCase.depends_on as readonly string[]).filter( (v): v is string => typeof v === 'string', ) : undefined; - const onDependencyFailureRaw = asString(testCaseConfig.on_dependency_failure); + const onDependencyFailureRaw = asString(renderedCase.on_dependency_failure); const onDependencyFailure = onDependencyFailureRaw === 'skip' || onDependencyFailureRaw === 'fail' || @@ -575,23 +639,23 @@ async function loadTestsFromYaml( : undefined; // Extract conversation mode fields - const modeRaw = asString(testCaseConfig.mode); + const modeRaw = asString(renderedCase.mode); const mode: ConversationMode | undefined = modeRaw === 'conversation' ? 'conversation' : undefined; - const turns = Array.isArray(testCaseConfig.turns) - ? parseTurns(testCaseConfig.turns as readonly unknown[]) + const turns = Array.isArray(renderedCase.turns) + ? parseTurns(renderedCase.turns as readonly unknown[]) : undefined; - const aggregationRaw = asString(testCaseConfig.aggregation); + const aggregationRaw = asString(renderedCase.aggregation); const aggregation: ConversationAggregation | undefined = aggregationRaw === 'mean' || aggregationRaw === 'min' || aggregationRaw === 'max' ? aggregationRaw : undefined; - const onTurnFailureRaw = asString(testCaseConfig.on_turn_failure); + const onTurnFailureRaw = asString(renderedCase.on_turn_failure); const onTurnFailure: TurnFailurePolicy | undefined = onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' ? onTurnFailureRaw : undefined; const windowSize = - typeof testCaseConfig.window_size === 'number' && testCaseConfig.window_size >= 1 - ? (testCaseConfig.window_size as number) + typeof renderedCase.window_size === 'number' && renderedCase.window_size >= 1 + ? (renderedCase.window_size as number) : undefined; const testCase: EvalTest = { diff --git a/packages/core/test/evaluation/interpolation.test.ts b/packages/core/test/evaluation/interpolation.test.ts index d416f7d0..ecaccf29 100644 --- a/packages/core/test/evaluation/interpolation.test.ts +++ b/packages/core/test/evaluation/interpolation.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest'; -import { interpolateEnv } from '../../src/evaluation/interpolation.js'; +import { interpolateEnv, interpolateTemplateVars } from '../../src/evaluation/interpolation.js'; describe('interpolateEnv', () => { const env = { HOME: '/home/user', PROJECT: 'agentv', EMPTY: '' }; @@ -135,3 +135,31 @@ describe('interpolateEnv', () => { expect(interpolateEnv('${{ MY_VAR_2 }}', envWithSpecial)).toBe('value'); }); }); + +describe('interpolateTemplateVars', () => { + const vars = { + question: 'What is 2 + 2?', + nested: { topic: 'math' }, + expected: { answer: '4' }, + }; + + it('replaces {{ var }} in strings', () => { + expect(interpolateTemplateVars('Answer clearly: {{question}}', vars)).toBe( + 'Answer clearly: What is 2 + 2?', + ); + }); + + it('supports dotted paths', () => { + expect(interpolateTemplateVars('Topic: {{ nested.topic }}', vars)).toBe('Topic: math'); + }); + + it('preserves missing variables instead of blanking them out', () => { + expect(interpolateTemplateVars('Answer clearly: {{missing}}', vars)).toBe( + 'Answer clearly: {{missing}}', + ); + }); + + it('returns the original JSON value for whole-value substitutions', () => { + expect(interpolateTemplateVars('{{expected}}', vars)).toEqual({ answer: '4' }); + }); +}); diff --git a/packages/core/test/evaluation/suite-level-input.test.ts b/packages/core/test/evaluation/suite-level-input.test.ts index d23a77fe..eebf53f8 100644 --- a/packages/core/test/evaluation/suite-level-input.test.ts +++ b/packages/core/test/evaluation/suite-level-input.test.ts @@ -191,4 +191,78 @@ tests: expect(tests[0].input[2]).toEqual({ role: 'assistant', content: 'I understand.' }); expect(tests[0].input[3]).toEqual({ role: 'user', content: 'Follow-up question' }); }); + + it('applies per-test vars to suite and test input templates', async () => { + await writeFile( + path.join(tempDir, 'templated-input.eval.yaml'), + `input: "Answer clearly: {{question}}" +tests: + - id: templated + vars: + question: "What is the capital of France?" + criteria: "Answers {{question}} correctly" + input: + - role: user + content: "Question: {{question}}" + - role: assistant + content: "Thinking about {{question}}" + - role: user + content: "Final answer only." + expected_output: "{{expected_answer}}" + metadata: + untouched: "{{question}}" +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-input.eval.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].criteria).toBe('Answers What is the capital of France? correctly'); + expect(tests[0].question).toContain('Answer clearly: What is the capital of France?'); + expect(tests[0].input[0]).toEqual({ + role: 'user', + content: 'Answer clearly: What is the capital of France?', + }); + expect(tests[0].input[1]).toEqual({ + role: 'user', + content: 'Question: What is the capital of France?', + }); + expect(tests[0].input[2]).toEqual({ + role: 'assistant', + content: 'Thinking about What is the capital of France?', + }); + expect(tests[0].expected_output).toEqual([ + { role: 'assistant', content: '{{expected_answer}}' }, + ]); + expect(tests[0].metadata).toEqual({ untouched: '{{question}}' }); + }); + + it('applies per-test vars inside conversation turns', async () => { + await writeFile( + path.join(tempDir, 'templated-turns.eval.yaml'), + `tests: + - id: conversation + vars: + bug: parser null check + mode: conversation + input: "Fix {{bug}}" + turns: + - input: "Fix {{bug}}" + expected_output: "Fixed {{bug}}" + assertions: + - "Mentions {{bug}}" +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-turns.eval.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].turns).toEqual([ + { + input: 'Fix parser null check', + expected_output: 'Fixed parser null check', + assertions: ['Mentions {{bug}}'], + }, + ]); + }); }); diff --git a/skills-data/agentv-eval-writer/references/eval-schema.json b/skills-data/agentv-eval-writer/references/eval-schema.json index 2f6fd88a..3a26739e 100644 --- a/skills-data/agentv-eval-writer/references/eval-schema.json +++ b/skills-data/agentv-eval-writer/references/eval-schema.json @@ -106,6 +106,11 @@ "type": "string", "minLength": 1 }, + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, "criteria": { "type": "string" }, @@ -167,6 +172,7 @@ }, { "type": "object", + "properties": {}, "additionalProperties": {} }, { @@ -6634,6 +6640,11 @@ "type": "string", "minLength": 1 }, + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, "criteria": { "type": "string" }, @@ -6695,6 +6706,7 @@ }, { "type": "object", + "properties": {}, "additionalProperties": {} }, {