diff --git a/examples/features/sdk-programmatic-api-advanced/README.md b/examples/features/sdk-programmatic-api-advanced/README.md new file mode 100644 index 00000000..badba03e --- /dev/null +++ b/examples/features/sdk-programmatic-api-advanced/README.md @@ -0,0 +1,16 @@ +# SDK Programmatic API — Advanced + +Demonstrates the advanced programmatic API features added in [#1115](https://github.com/anthropics/agentv/issues/1115): + +- **`beforeAll`** — run setup commands before the suite starts +- **`budgetUsd`** — cap total LLM spend +- **`turns`** — multi-turn conversation evaluation +- **`aggregation`** — control how turn scores combine (`mean`, `min`, `max`) + +## Run + +```bash +bun run evaluate.ts +``` + +See also: [`sdk-programmatic-api`](../sdk-programmatic-api/) for the basic API. diff --git a/examples/features/sdk-programmatic-api-advanced/evaluate.ts b/examples/features/sdk-programmatic-api-advanced/evaluate.ts new file mode 100644 index 00000000..d1e3ac64 --- /dev/null +++ b/examples/features/sdk-programmatic-api-advanced/evaluate.ts @@ -0,0 +1,56 @@ +/** + * Advanced Programmatic API Example + * + * Demonstrates evaluate() with beforeAll, budgetUsd, multi-turn conversations, + * and aggregation — all defined in TypeScript with full type safety. 
+ * + * Run: bun run evaluate.ts + */ +import { evaluate } from '@agentv/core'; + +const { results, summary } = await evaluate({ + // Run a setup command before the suite starts + beforeAll: 'echo "Setting up test environment"', + + // Cap total LLM spend at $5 + budgetUsd: 5.0, + + tests: [ + // Standard single-turn test (unchanged from basic API) + { + id: 'greeting', + input: 'Say hello.', + assert: [{ type: 'contains', value: 'Hello' }], + }, + + // Multi-turn conversation test + { + id: 'multi-turn-memory', + mode: 'conversation', + turns: [ + { + input: 'Hi, my name is Alice.', + assert: [{ type: 'contains', value: 'Alice' }], + }, + { + input: 'What is my name?', + expectedOutput: 'Your name is Alice.', + assert: [{ type: 'contains', value: 'Alice' }], + }, + ], + // Use weakest-link scoring: final score = lowest turn score + aggregation: 'min', + }, + ], + + onResult: (result) => { + console.log(` ${result.testId}: score=${result.score.toFixed(2)}`); + }, +}); + +console.log('\n--- Summary ---'); +console.log(`Total: ${summary.total}`); +console.log(`Passed: ${summary.passed}`); +console.log(`Failed: ${summary.failed}`); +console.log(`Mean score: ${summary.meanScore.toFixed(2)}`); +console.log(`Duration: ${summary.durationMs}ms`); diff --git a/examples/features/sdk-programmatic-api-advanced/package.json b/examples/features/sdk-programmatic-api-advanced/package.json new file mode 100644 index 00000000..8311e3fe --- /dev/null +++ b/examples/features/sdk-programmatic-api-advanced/package.json @@ -0,0 +1,8 @@ +{ + "name": "agentv-example-sdk-programmatic-api-advanced", + "private": true, + "type": "module", + "dependencies": { + "@agentv/core": "file:../../../packages/core" + } +} diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index 1aab6886..ff59670d 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -69,10 +69,13 @@ import { type ResolvedTarget, 
resolveTargetDefinition } from './providers/target import type { TargetDefinition } from './providers/types.js'; import { INLINE_ASSERT_FN } from './registry/builtin-graders.js'; import type { + ConversationAggregation, + ConversationTurn, EvalTest, EvaluationResult, GraderConfig, InlineAssertEvaluatorConfig, + WorkspaceHookConfig, } from './types.js'; import { loadTests } from './yaml-parser.js'; @@ -85,8 +88,8 @@ export interface EvalTestInput { readonly id: string; /** What the response should accomplish */ readonly criteria?: string; - /** Input to the agent (string or message array) */ - readonly input: string | readonly { role: string; content: string }[]; + /** Input to the agent (string or message array). Omit when using turns[]. */ + readonly input?: string | readonly { role: string; content: string }[]; /** Expected reference output (camelCase preferred) */ readonly expectedOutput?: string; /** @deprecated Use `expectedOutput` instead */ @@ -95,6 +98,27 @@ export interface EvalTestInput { readonly assert?: readonly AssertEntry[]; /** Arbitrary metadata */ readonly metadata?: Record; + /** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */ + readonly mode?: 'conversation'; + /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */ + readonly turns?: readonly ConversationTurnInput[]; + /** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */ + readonly aggregation?: ConversationAggregation; +} + +/** + * A single turn in a multi-turn conversation evaluation (programmatic API). + * Mirrors the YAML `turns` structure with camelCase naming. 
+ */ +export interface ConversationTurnInput { + /** Input for this turn (string or message array) */ + readonly input: string | readonly { role: string; content: string }[]; + /** Expected reference output for this turn */ + readonly expectedOutput?: string; + /** @deprecated Use `expectedOutput` instead */ + readonly expected_output?: string; + /** Per-turn assertions (string criteria or grader config) */ + readonly assert?: readonly AssertEntry[]; } /** @@ -162,6 +186,10 @@ export interface EvalConfig { readonly onResult?: (result: EvaluationResult) => void; /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */ readonly threshold?: number; + /** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */ + readonly beforeAll?: string | readonly string[]; + /** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */ + readonly budgetUsd?: number; } /** @@ -279,17 +307,27 @@ export async function evaluate(config: EvalConfig): Promise { filter: config.filter, }); } else { + // Build workspace config with before_all hook if beforeAll is provided + const suiteWorkspace = config.beforeAll + ? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } } + : undefined; + // Inline mode: convert EvalTestInput[] to EvalTest[] evalCases = (config.tests ?? []).map((test): EvalTest => { - const input = - typeof test.input === 'string' - ? ([{ role: 'user' as const, content: test.input }] as EvalTest['input']) - : (test.input as unknown as EvalTest['input']); + // Conversation mode: use turns[] for input/question derivation + const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0); + + if (!isConversation && !test.input) { + throw new Error(`Test '${test.id}': input is required for non-conversation tests`); + } + + const input = isConversation + ? toMessageArray(test.turns?.[0]?.input ?? '') + : toMessageArray(test.input ?? 
''); - const question = - typeof test.input === 'string' - ? test.input - : (test.input.find((m) => m.role === 'user')?.content ?? ''); + const question = isConversation + ? extractQuestion(test.turns?.[0]?.input ?? '') + : extractQuestion(test.input ?? ''); const expectedOutputValue = test.expectedOutput ?? test.expected_output; const expectedOutput = expectedOutputValue @@ -300,24 +338,19 @@ export async function evaluate(config: EvalConfig): Promise { // Convert inline assertions to evaluator config format const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])]; - const assertConfigs = allAssertions.map((entry, i) => { - if (typeof entry === 'function') { - // Wrap AssertFn as InlineAssertEvaluatorConfig with function attached via Symbol - const base: InlineAssertEvaluatorConfig = { - type: 'inline-assert', - name: `inline-assert-${i}`, - }; - return Object.assign(base, { - [INLINE_ASSERT_FN]: entry as AssertFn, - }) as unknown as GraderConfig; - } - const a = entry as EvalAssertionInput; - const { type: rawType, ...rest } = a; + const assertConfigs = convertAssertions(allAssertions); + + // Convert conversation turns if present — keep input/expected_output as + // TestMessageContent (matching YAML parser behavior), not wrapped in message arrays. + const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => { + const turnExpected = turn.expectedOutput ?? turn.expected_output; return { - ...rest, - name: a.name ?? `${rawType}_${i}`, - type: mapAssertionType(rawType), - } as unknown as GraderConfig; + input: turn.input as ConversationTurn['input'], + ...(turnExpected !== undefined && { + expected_output: turnExpected as ConversationTurn['expected_output'], + }), + assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined, + }; }); return { @@ -330,6 +363,10 @@ export async function evaluate(config: EvalConfig): Promise { file_paths: [], assertions: assertConfigs.length > 0 ? 
assertConfigs : undefined, metadata: test.metadata, + ...(suiteWorkspace && { workspace: suiteWorkspace }), + ...(isConversation && { mode: 'conversation' as const }), + ...(turns && { turns }), + ...(test.aggregation && { aggregation: test.aggregation }), }; }); } @@ -348,6 +385,7 @@ export async function evaluate(config: EvalConfig): Promise { filter: config.filter, threshold: config.threshold, evalCases, + ...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }), onResult: async (result) => { collectedResults.push(result); config.onResult?.(result); @@ -363,6 +401,59 @@ export async function evaluate(config: EvalConfig): Promise { }; } +/** + * Convert a flexible input (string or message array) to the internal TestMessage[] format. + */ +function toMessageArray( + input: string | readonly { role: string; content: string }[], +): EvalTest['input'] { + if (typeof input === 'string') { + return [{ role: 'user' as const, content: input }] as EvalTest['input']; + } + return input as unknown as EvalTest['input']; +} + +/** + * Extract the user-facing question string from a flexible input. + */ +function extractQuestion(input: string | readonly { role: string; content: string }[]): string { + if (typeof input === 'string') return input; + return input.find((m) => m.role === 'user')?.content ?? ''; +} + +/** + * Convert programmatic API beforeAll (string | string[]) to internal WorkspaceHookConfig. + * Accepts a shell command string or an array of command tokens. + */ +function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookConfig { + const command = typeof beforeAll === 'string' ? ['sh', '-c', beforeAll] : [...beforeAll]; + return { command }; +} + +/** + * Convert an array of assert entries (inline functions or config objects) to GraderConfig[]. 
+ */ +function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] { + return entries.map((entry, i) => { + if (typeof entry === 'function') { + const base: InlineAssertEvaluatorConfig = { + type: 'inline-assert', + name: `inline-assert-${i}`, + }; + return Object.assign(base, { + [INLINE_ASSERT_FN]: entry as AssertFn, + }) as unknown as GraderConfig; + } + const a = entry as EvalAssertionInput; + const { type: rawType, ...rest } = a; + return { + ...rest, + name: a.name ?? `${rawType}_${i}`, + type: mapAssertionType(rawType), + } as unknown as GraderConfig; + }); +} + /** * Map user-facing assertion type names to internal grader type names. * Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader'). diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 3fcae757..2719b941 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -28,6 +28,7 @@ export * from './evaluation/orchestrator.js'; export { evaluate, type AssertEntry, + type ConversationTurnInput, type EvalConfig, type EvalTestInput, type EvalAssertionInput, diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts new file mode 100644 index 00000000..9a91c9e6 --- /dev/null +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -0,0 +1,241 @@ +/** + * Tests for the programmatic API extensions: beforeAll, budgetUsd, turns, aggregation. + * + * Validates that the new EvalConfig and EvalTestInput fields are accepted by + * evaluate() and correctly converted to internal EvalTest / RunEvaluationOptions. 
+ */ + +import { describe, expect, it } from 'bun:test'; +import { evaluate } from '../../src/evaluation/evaluate.js'; + +describe('evaluate() — programmatic API extensions', () => { + // --------------------------------------------------------------------------- + // budgetUsd + // --------------------------------------------------------------------------- + + it('accepts budgetUsd and passes it to the orchestrator', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'budget-test', + input: 'hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + budgetUsd: 10.0, + }); + expect(summary.passed).toBe(1); + }); + + // --------------------------------------------------------------------------- + // turns + mode: 'conversation' + // --------------------------------------------------------------------------- + + it('accepts turns with explicit conversation mode', async () => { + const { summary, results } = await evaluate({ + tests: [ + { + id: 'conversation-explicit', + mode: 'conversation', + turns: [ + { + input: 'Hello', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'How are you?', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + expect(results.length).toBe(1); + }); + + it('infers conversation mode when turns[] is provided without explicit mode', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'conversation-inferred', + turns: [ + { + input: 'First turn', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }); + + it('supports expectedOutput on individual turns', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 
'turn-expected-output', + turns: [ + { + input: 'Say hello', + expectedOutput: 'Hello!', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }); + + it('supports message array input in turns', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'turn-message-array', + turns: [ + { + input: [ + { role: 'system', content: 'You are helpful' }, + { role: 'user', content: 'Hello' }, + ], + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }); + + // --------------------------------------------------------------------------- + // aggregation + // --------------------------------------------------------------------------- + + it('accepts aggregation on conversation tests', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'aggregation-min', + turns: [ + { + input: 'Turn 1', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'Turn 2', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + aggregation: 'min', + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }); + expect(summary.total).toBe(1); + }); + + // --------------------------------------------------------------------------- + // beforeAll + // --------------------------------------------------------------------------- + + it('accepts beforeAll as a string', async () => { + // beforeAll requires a workspace to execute in; without repos it just attaches + // the hook config. This test verifies the type is accepted without throwing. 
+ const { summary } = await evaluate({ + tests: [ + { + id: 'before-all-string', + input: 'hello', + assert: [{ type: 'contains', value: 'test' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'test output' }, + beforeAll: 'echo "setup complete"', + }); + expect(summary.total).toBe(1); + }); + + it('accepts beforeAll as a string array', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'before-all-array', + input: 'hello', + assert: [{ type: 'contains', value: 'test' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'test output' }, + beforeAll: ['echo', 'setup complete'], + }); + expect(summary.total).toBe(1); + }); + + // --------------------------------------------------------------------------- + // Combined usage + // --------------------------------------------------------------------------- + + it('supports all new fields together', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'combined-test', + turns: [ + { + input: 'Hello', + expectedOutput: 'Hi there', + assert: [{ type: 'contains', value: 'mock' }], + }, + { + input: 'Goodbye', + assert: [{ type: 'contains', value: 'mock' }], + }, + ], + aggregation: 'mean', + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + budgetUsd: 5.0, + beforeAll: 'echo "setup"', + }); + expect(summary.total).toBe(1); + }); + + // --------------------------------------------------------------------------- + // Backwards compatibility: input still works as before + // --------------------------------------------------------------------------- + + it('still works with standard single-turn input', async () => { + const { summary } = await evaluate({ + tests: [ + { + id: 'standard-input', + input: 'hello', + assert: [{ type: 'contains', value: 'hello' }], + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + }); + expect(summary.passed).toBe(1); + }); + + // 
--------------------------------------------------------------------------- + // Validation — evaluate() is async, so errors surface as promise rejections + // --------------------------------------------------------------------------- + + it('throws when input is missing on a non-conversation test', async () => { + await expect( + evaluate({ + // biome-ignore lint/suspicious/noExplicitAny: intentionally testing invalid input + tests: [{ id: 'no-input', assert: [{ type: 'contains', value: 'x' }] } as any], + target: { name: 'default', provider: 'mock', response: 'hello' }, + }), + ).rejects.toThrow("Test 'no-input': input is required for non-conversation tests"); + }); +});