Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 1 addition & 28 deletions apps/cli/test/commands/eval/pipeline/bench.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ describe('pipeline bench', () => {
await rm(OUT_DIR, { recursive: true, force: true });
});

it('writes grading.json with merged scores and pass_rate', async () => {
// Write LLM grader result to disk (the default flow)
it('writes grading, index, and benchmark artifacts', async () => {
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
Expand All @@ -76,19 +75,6 @@ describe('pipeline bench', () => {
expect(grading.summary.pass_rate).toBeGreaterThan(0);
expect(grading.assertions.length).toBeGreaterThan(0);
expect(grading.graders).toHaveLength(2);
}, 30_000);

it('writes index.jsonl with one entry per test', async () => {
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
score: 0.8,
assertions: [{ text: 'Relevant', passed: true }],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const lines = indexContent
Expand All @@ -98,19 +84,6 @@ describe('pipeline bench', () => {
expect(lines).toHaveLength(1);
expect(lines[0].test_id).toBe('test-01');
expect(lines[0].score).toBeGreaterThan(0);
}, 30_000);

it('writes benchmark.json with run_summary', async () => {
await writeFile(
join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'),
JSON.stringify({
score: 0.8,
assertions: [{ text: 'ok', passed: true }],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.targets).toContain('test-target');
Expand Down
47 changes: 12 additions & 35 deletions apps/cli/test/commands/eval/pipeline/grade.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ describe('pipeline grade', () => {
await rm(OUT_DIR, { recursive: true, force: true });
});

it('writes code_grader_results/<name>.json with score', async () => {
it('writes code_grader_results/<name>.json with score and assertions', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]);

Expand All @@ -55,15 +55,6 @@ describe('pipeline grade', () => {
);
expect(result.score).toBe(1);
expect(result.name).toBe('always_pass');
}, 30_000);

it('includes assertions from code grader output', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]);

const result = JSON.parse(
await readFile(join(OUT_DIR, 'test-01', 'code_grader_results', 'always_pass.json'), 'utf8'),
);
expect(result.assertions).toHaveLength(1);
expect(result.assertions[0].passed).toBe(true);
}, 30_000);
Expand All @@ -83,7 +74,6 @@ describe('pipeline grade — builtin assertions', () => {
JSON.stringify({ input: [{ role: 'user', content: 'say hello' }] }),
);

// contains assertion — should pass
await writeFile(
join(builtinGradersDir, 'has_hello.json'),
JSON.stringify({
Expand All @@ -95,7 +85,6 @@ describe('pipeline grade — builtin assertions', () => {
}),
);

// regex assertion — should pass
await writeFile(
join(builtinGradersDir, 'matches_pattern.json'),
JSON.stringify({
Expand All @@ -107,7 +96,6 @@ describe('pipeline grade — builtin assertions', () => {
}),
);

// contains assertion — should fail
await writeFile(
join(builtinGradersDir, 'has_goodbye.json'),
JSON.stringify({
Expand All @@ -134,48 +122,37 @@ describe('pipeline grade — builtin assertions', () => {
await rm(BUILTIN_OUT, { recursive: true, force: true });
});

it('evaluates contains assertion and writes result', async () => {
it('evaluates builtin assertions and writes results', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);

const result = JSON.parse(
const containsResult = JSON.parse(
await readFile(join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_hello.json'), 'utf8'),
);
expect(result.score).toBe(1);
expect(result.type).toBe('contains');
expect(result.assertions[0].passed).toBe(true);
}, 30_000);

it('evaluates regex assertion and writes result', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);
expect(containsResult.score).toBe(1);
expect(containsResult.type).toBe('contains');
expect(containsResult.assertions[0].passed).toBe(true);

const result = JSON.parse(
const regexResult = JSON.parse(
await readFile(
join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'matches_pattern.json'),
'utf8',
),
);
expect(result.score).toBe(1);
expect(result.type).toBe('regex');
}, 30_000);
expect(regexResult.score).toBe(1);
expect(regexResult.type).toBe('regex');

it('scores 0 when contains assertion does not match', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);

const result = JSON.parse(
const failingContainsResult = JSON.parse(
await readFile(
join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'),
'utf8',
),
);
expect(result.score).toBe(0);
expect(result.assertions[0].passed).toBe(false);
expect(failingContainsResult.score).toBe(0);
expect(failingContainsResult.assertions[0].passed).toBe(false);
}, 30_000);

it('applies negate to invert score', async () => {
// Overwrite has_goodbye with negate: true — "not contains goodbye" should pass
await writeFile(
join(BUILTIN_OUT, 'test-01', 'code_graders', 'has_goodbye.json'),
JSON.stringify({
Expand Down
48 changes: 8 additions & 40 deletions apps/cli/test/commands/eval/pipeline/input.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,65 +12,41 @@ describe('pipeline input', () => {
await rm(OUT_DIR, { recursive: true, force: true });
});

it('writes manifest.json with test_ids and eval_file', async () => {
it('materializes the default input workspace', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
expect(manifest.test_ids).toEqual(['test-01']);
expect(manifest.eval_file).toContain('input-test.eval.yaml');
}, 30_000);

it('writes per-test input.json with input and input_files', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);
expect(manifest.experiment).toBeUndefined();

const input = JSON.parse(
await readFile(join(OUT_DIR, 'input-test', 'test-01', 'input.json'), 'utf8'),
);
expect(input.input).toHaveLength(1);
expect(input.input[0].content).toBe('hello world');
}, 30_000);

it('writes code_graders/<name>.json with resolved command', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const grader = JSON.parse(
const codeGrader = JSON.parse(
await readFile(
join(OUT_DIR, 'input-test', 'test-01', 'code_graders', 'contains_hello.json'),
'utf8',
),
);
expect(grader.command).toBeDefined();
expect(grader.name).toBe('contains_hello');
}, 30_000);
expect(codeGrader.command).toBeDefined();
expect(codeGrader.name).toBe('contains_hello');

it('writes llm_graders/<name>.json with prompt content', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const grader = JSON.parse(
const llmGrader = JSON.parse(
await readFile(
join(OUT_DIR, 'input-test', 'test-01', 'llm_graders', 'relevance.json'),
'utf8',
),
);
expect(grader.prompt_content).toBeDefined();
expect(grader.name).toBe('relevance');
}, 30_000);

it('writes criteria.md', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);
expect(llmGrader.prompt_content).toBeDefined();
expect(llmGrader.name).toBe('relevance');

const criteria = await readFile(join(OUT_DIR, 'input-test', 'test-01', 'criteria.md'), 'utf8');
expect(criteria).toContain('Response echoes the input');
}, 30_000);

it('writes invoke.json', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const invoke = JSON.parse(
await readFile(join(OUT_DIR, 'input-test', 'test-01', 'invoke.json'), 'utf8'),
Expand All @@ -95,14 +71,6 @@ describe('pipeline input', () => {
expect(manifest.experiment).toBe('without_skills');
}, 30_000);

it('omits experiment from manifest when --experiment is not provided', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
expect(manifest.experiment).toBeUndefined();
}, 30_000);

it('writes code_graders/<name>.json for deterministic assertions', async () => {
const { execa } = await import('execa');
const builtinEvalPath = join(FIXTURE_DIR, 'builtin-test.eval.yaml');
Expand Down
31 changes: 1 addition & 30 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -563,15 +563,12 @@ describe('serve app', () => {
});
});

it('computes pass_rate using the configured dashboard threshold (strict threshold yields lower rate)', async () => {
it('computes pass_rate using the configured dashboard threshold', async () => {
const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
mkdirSync(runsDir, { recursive: true });
const filename = '2026-03-25T10-00-00-000Z';
const runDir = path.join(runsDir, filename);
mkdirSync(runDir, { recursive: true });
// Two results: score=0.8 and score=0.6
// With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50%
// With threshold=0.9: neither passes → 0%
const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 };
const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 };
writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow));
Expand All @@ -584,35 +581,9 @@ describe('serve app', () => {
expect(res.status).toBe(200);
const data = (await res.json()) as { runs: Array<{ pass_rate: number }> };
expect(data.runs).toHaveLength(1);
// With threshold=0.9: neither 0.8 nor 0.6 passes → 0%
expect(data.runs[0].pass_rate).toBe(0);
});

it('computes pass_rate using the configured dashboard threshold (lenient threshold yields higher rate)', async () => {
const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
mkdirSync(runsDir, { recursive: true });
const filename = '2026-03-25T12-00-00-000Z';
const runDir = path.join(runsDir, filename);
mkdirSync(runDir, { recursive: true });
// Two results: score=0.8 and score=0.6
// With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50%
// With threshold=0.5: both pass → 2/2 = 100%
const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 };
const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 };
writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow));

mkdirSync(path.join(tempDir, '.agentv'), { recursive: true });
writeFileSync(path.join(tempDir, '.agentv', 'config.yaml'), 'dashboard:\n threshold: 0.5\n');

const app = createApp([], tempDir, tempDir, undefined, { studioDir });
const res = await app.request('/api/runs');
expect(res.status).toBe(200);
const data = (await res.json()) as { runs: Array<{ pass_rate: number }> };
expect(data.runs).toHaveLength(1);
// With threshold=0.5: both 0.8 and 0.6 pass → 100%
expect(data.runs[0].pass_rate).toBe(1);
});

it('infers the experiment name from the run id when live results have not written it yet', async () => {
const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'issue-1198-live-name');
mkdirSync(runsDir, { recursive: true });
Expand Down
Loading
Loading