Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/features/sdk-programmatic-api-advanced/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# SDK Programmatic API — Advanced

Demonstrates the advanced programmatic API features added in [#1115](https://github.com/anthropics/agentv/issues/1115):

- **`beforeAll`** — run setup commands before the suite starts
- **`budgetUsd`** — cap total LLM spend
- **`turns`** — multi-turn conversation evaluation
- **`aggregation`** — control how turn scores combine (`mean`, `min`, `max`)

## Run

```bash
bun run evaluate.ts
```

See also: [`sdk-programmatic-api`](../sdk-programmatic-api/) for the basic API.
56 changes: 56 additions & 0 deletions examples/features/sdk-programmatic-api-advanced/evaluate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
 * Advanced Programmatic API Example
 *
 * Exercises the evaluate() options introduced for advanced suites:
 * a beforeAll setup hook, a budgetUsd spend cap, multi-turn conversation
 * tests, and per-test score aggregation — all with full type safety.
 *
 * Run: bun run evaluate.ts
 */
import { evaluate } from '@agentv/core';

const { results, summary } = await evaluate({
  // Setup hook: runs once before any test is dispatched.
  beforeAll: 'echo "Setting up test environment"',

  // Stop dispatching new tests once total LLM spend reaches $5.
  budgetUsd: 5,

  tests: [
    // Plain single-turn test — same shape as the basic API.
    {
      id: 'greeting',
      input: 'Say hello.',
      assert: [{ type: 'contains', value: 'Hello' }],
    },

    // Conversation test: turns are evaluated in order against shared history.
    {
      id: 'multi-turn-memory',
      mode: 'conversation',
      turns: [
        {
          input: 'Hi, my name is Alice.',
          assert: [{ type: 'contains', value: 'Alice' }],
        },
        {
          input: 'What is my name?',
          expectedOutput: 'Your name is Alice.',
          assert: [{ type: 'contains', value: 'Alice' }],
        },
      ],
      // 'min' aggregation = weakest-link scoring: final score is the lowest turn score.
      aggregation: 'min',
    },
  ],

  onResult: (result) => {
    const score = result.score.toFixed(2);
    console.log(`  ${result.testId}: score=${score}`);
  },
});

const { total, passed, failed, meanScore, durationMs } = summary;
console.log('\n--- Summary ---');
console.log(`Total: ${total}`);
console.log(`Passed: ${passed}`);
console.log(`Failed: ${failed}`);
console.log(`Mean score: ${meanScore.toFixed(2)}`);
console.log(`Duration: ${durationMs}ms`);
8 changes: 8 additions & 0 deletions examples/features/sdk-programmatic-api-advanced/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"name": "agentv-example-sdk-programmatic-api-advanced",
"private": true,
"type": "module",
"dependencies": {
"@agentv/core": "file:../../../packages/core"
}
}
145 changes: 118 additions & 27 deletions packages/core/src/evaluation/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,13 @@ import { type ResolvedTarget, resolveTargetDefinition } from './providers/target
import type { TargetDefinition } from './providers/types.js';
import { INLINE_ASSERT_FN } from './registry/builtin-graders.js';
import type {
ConversationAggregation,
ConversationTurn,
EvalTest,
EvaluationResult,
GraderConfig,
InlineAssertEvaluatorConfig,
WorkspaceHookConfig,
} from './types.js';
import { loadTests } from './yaml-parser.js';

Expand All @@ -85,8 +88,8 @@ export interface EvalTestInput {
readonly id: string;
/** What the response should accomplish */
readonly criteria?: string;
/** Input to the agent (string or message array) */
readonly input: string | readonly { role: string; content: string }[];
/** Input to the agent (string or message array). Omit when using turns[]. */
readonly input?: string | readonly { role: string; content: string }[];
/** Expected reference output (camelCase preferred) */
readonly expectedOutput?: string;
/** @deprecated Use `expectedOutput` instead */
Expand All @@ -95,6 +98,27 @@ export interface EvalTestInput {
readonly assert?: readonly AssertEntry[];
/** Arbitrary metadata */
readonly metadata?: Record<string, unknown>;
/** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
readonly mode?: 'conversation';
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
readonly turns?: readonly ConversationTurnInput[];
/** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
readonly aggregation?: ConversationAggregation;
}

/**
 * A single turn in a multi-turn conversation evaluation (programmatic API).
 * Mirrors the YAML `turns` structure with camelCase naming; `expected_output`
 * is kept only for parity with the deprecated snake_case spelling.
 */
export interface ConversationTurnInput {
  /** Input for this turn: a plain string or an array of { role, content } messages */
  readonly input: string | readonly { role: string; content: string }[];
  /** Expected reference output for this turn (used as the grading reference) */
  readonly expectedOutput?: string;
  /** @deprecated Use `expectedOutput` instead; ignored when `expectedOutput` is set */
  readonly expected_output?: string;
  /** Per-turn assertions (string criteria or grader config), evaluated against this turn's response */
  readonly assert?: readonly AssertEntry[];
}

/**
Expand Down Expand Up @@ -162,6 +186,10 @@ export interface EvalConfig {
readonly onResult?: (result: EvaluationResult) => void;
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
readonly threshold?: number;
/** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
readonly beforeAll?: string | readonly string[];
/** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
readonly budgetUsd?: number;
}

/**
Expand Down Expand Up @@ -279,17 +307,27 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
filter: config.filter,
});
} else {
// Build workspace config with before_all hook if beforeAll is provided
const suiteWorkspace = config.beforeAll
? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } }
: undefined;

// Inline mode: convert EvalTestInput[] to EvalTest[]
evalCases = (config.tests ?? []).map((test): EvalTest => {
const input =
typeof test.input === 'string'
? ([{ role: 'user' as const, content: test.input }] as EvalTest['input'])
: (test.input as unknown as EvalTest['input']);
// Conversation mode: use turns[] for input/question derivation
const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0);

if (!isConversation && !test.input) {
throw new Error(`Test '${test.id}': input is required for non-conversation tests`);
}

const input = isConversation
? toMessageArray(test.turns?.[0]?.input ?? '')
: toMessageArray(test.input ?? '');

const question =
typeof test.input === 'string'
? test.input
: (test.input.find((m) => m.role === 'user')?.content ?? '');
const question = isConversation
? extractQuestion(test.turns?.[0]?.input ?? '')
: extractQuestion(test.input ?? '');

const expectedOutputValue = test.expectedOutput ?? test.expected_output;
const expectedOutput = expectedOutputValue
Expand All @@ -300,24 +338,19 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {

// Convert inline assertions to evaluator config format
const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])];
const assertConfigs = allAssertions.map((entry, i) => {
if (typeof entry === 'function') {
// Wrap AssertFn as InlineAssertEvaluatorConfig with function attached via Symbol
const base: InlineAssertEvaluatorConfig = {
type: 'inline-assert',
name: `inline-assert-${i}`,
};
return Object.assign(base, {
[INLINE_ASSERT_FN]: entry as AssertFn,
}) as unknown as GraderConfig;
}
const a = entry as EvalAssertionInput;
const { type: rawType, ...rest } = a;
const assertConfigs = convertAssertions(allAssertions);

// Convert conversation turns if present — keep input/expected_output as
// TestMessageContent (matching YAML parser behavior), not wrapped in message arrays.
const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => {
const turnExpected = turn.expectedOutput ?? turn.expected_output;
return {
...rest,
name: a.name ?? `${rawType}_${i}`,
type: mapAssertionType(rawType),
} as unknown as GraderConfig;
input: turn.input as ConversationTurn['input'],
...(turnExpected !== undefined && {
expected_output: turnExpected as ConversationTurn['expected_output'],
}),
assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined,
};
});

return {
Expand All @@ -330,6 +363,10 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
file_paths: [],
assertions: assertConfigs.length > 0 ? assertConfigs : undefined,
metadata: test.metadata,
...(suiteWorkspace && { workspace: suiteWorkspace }),
...(isConversation && { mode: 'conversation' as const }),
...(turns && { turns }),
...(test.aggregation && { aggregation: test.aggregation }),
};
});
}
Expand All @@ -348,6 +385,7 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
filter: config.filter,
threshold: config.threshold,
evalCases,
...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }),
onResult: async (result) => {
collectedResults.push(result);
config.onResult?.(result);
Expand All @@ -363,6 +401,59 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
};
}

/**
 * Normalize a flexible input (plain string or message array) into the
 * internal TestMessage[] shape used by EvalTest. A bare string becomes a
 * single user message; an array is passed through unchanged.
 */
function toMessageArray(
  input: string | readonly { role: string; content: string }[],
): EvalTest['input'] {
  const messages =
    typeof input === 'string' ? [{ role: 'user' as const, content: input }] : input;
  return messages as unknown as EvalTest['input'];
}

/**
 * Pull the human-readable question text out of a flexible input: the string
 * itself, or the content of the first `user` message (empty string if the
 * array contains no user message).
 */
function extractQuestion(input: string | readonly { role: string; content: string }[]): string {
  if (typeof input !== 'string') {
    const userMessage = input.find((m) => m.role === 'user');
    return userMessage?.content ?? '';
  }
  return input;
}

/**
 * Build the internal before_all hook config from the programmatic API's
 * beforeAll value: a shell command string is wrapped in `sh -c`, while an
 * explicit argv-style token array is copied defensively.
 */
function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookConfig {
  if (typeof beforeAll === 'string') {
    return { command: ['sh', '-c', beforeAll] };
  }
  return { command: [...beforeAll] };
}

/**
 * Convert assert entries (inline predicate functions or plain config objects)
 * into the internal GraderConfig[] format.
 *
 * An inline function is wrapped as an 'inline-assert' grader with the function
 * itself attached under the INLINE_ASSERT_FN symbol key. A config object keeps
 * its fields, gains a derived name when none is given, and has its type
 * normalized through mapAssertionType.
 */
function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] {
  const toGrader = (entry: AssertEntry, index: number): GraderConfig => {
    if (typeof entry === 'function') {
      const wrapped: InlineAssertEvaluatorConfig = {
        type: 'inline-assert',
        name: `inline-assert-${index}`,
      };
      return Object.assign(wrapped, {
        [INLINE_ASSERT_FN]: entry as AssertFn,
      }) as unknown as GraderConfig;
    }
    const config = entry as EvalAssertionInput;
    const { type: rawType, ...rest } = config;
    return {
      ...rest,
      name: config.name ?? `${rawType}_${index}`,
      type: mapAssertionType(rawType),
    } as unknown as GraderConfig;
  };
  return entries.map(toGrader);
}

/**
* Map user-facing assertion type names to internal grader type names.
* Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader').
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export * from './evaluation/orchestrator.js';
export {
evaluate,
type AssertEntry,
type ConversationTurnInput,
type EvalConfig,
type EvalTestInput,
type EvalAssertionInput,
Expand Down
Loading
Loading