Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ tests:
```

The `metadata` field is included in the stdin JSON passed to lifecycle commands as `case_metadata`.
Operational checkout state belongs under `workspace.repos[].checkout.base_commit`; `metadata.base_commit` is informational only. `workspace.docker.base_commit` is retained as a deprecated compatibility bridge for legacy Docker-backed evals.
Operational checkout state belongs under `workspace.repos[].checkout.base_commit`; `metadata.base_commit` is informational only.

## Per-Test Assertions

Expand Down
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/docs/tools/import.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ uv run scripts/import-huggingface.py \
Each instance becomes an EVAL.yaml with:
- `input` — the problem statement
- `workspace.docker.image` — the pre-built SWE-bench Docker image (`ghcr.io/epoch-research/swe-bench.eval.x86_64.<instance_id>:latest`)
- `workspace.docker.base_commit` — the commit to reset to before the agent runs
- `workspace.repos[].checkout.base_commit` — the commit to reset to before the agent runs
- `assertions` — `code-grader` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container

Run an imported SWE-bench eval against any coding agent target:
Expand Down
14 changes: 13 additions & 1 deletion examples/features/docker-workspace/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,19 @@ workspace:
cpus: 2 # optional Docker CPU limit
```

For evals that need a repo pinned to a dataset snapshot, prefer `workspace.repos[].checkout.base_commit`. `workspace.docker.base_commit` still works as a compatibility bridge for existing Docker-backed SWE-bench configs, but new configs should keep checkout state in the repo model rather than in the Docker block.
For evals that need a repo pinned to a dataset snapshot, use `workspace.repos[].checkout.base_commit`:

```yaml
workspace:
  docker:
    image: swebench/sweb.eval.x86_64.django__django-15180
  repos:
    - path: /testbed
      checkout:
        base_commit: abc123def
```

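Repos defined without `source` are assumed to already exist inside the container (e.g., in SWE-bench prebuilt images), so they are not cloned or materialized into the workspace. Source-less repos are only valid when `workspace.docker` is configured; without it, validation reports an error.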

## Running

Expand Down
8 changes: 5 additions & 3 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -527,8 +527,9 @@ export async function runEvaluation(
for (const ec of filteredEvalCases) {
if (ec.workspace?.repos) {
for (const repo of ec.workspace.repos) {
// Deduplicate by repo path + source path
const key = `${repo.path}::${repo.source.type === 'local' ? repo.source.path : ''}`;
// Deduplicate by repo path + source path (skip source-less Docker repos)
if (!repo.source) continue;
const key = `${repo.path ?? ''}::${repo.source.type === 'local' ? repo.source.path : ''}`;
if (!allRepos.has(key)) {
allRepos.set(key, repo);
}
Expand All @@ -543,7 +544,7 @@ export async function runEvaluation(
// Store invalid repo paths so affected tests can be failed with execution_error
const invalidLocalRepoPaths = new Set(localPathErrors.map((e) => e.repoPath));
// If suite-level repos have invalid paths, fail the entire run early
if (suiteWorkspace?.repos?.some((r) => invalidLocalRepoPaths.has(r.path))) {
if (suiteWorkspace?.repos?.some((r) => r.path && invalidLocalRepoPaths.has(r.path))) {
throw new Error(message);
}
}
Expand Down Expand Up @@ -735,6 +736,7 @@ export async function runEvaluation(
if (needsPerRepoCheck) {
// Static workspace with existing content: materialize only missing repos
for (const repo of suiteWorkspace.repos) {
if (!repo.path || !repo.source) continue;
const targetDir = path.join(sharedWorkspacePath, repo.path);
if (existsSync(targetDir)) {
setupLog(`reusing existing repo at: ${targetDir}`);
Expand Down
8 changes: 4 additions & 4 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,10 @@ export type RepoClone = {
};

export type RepoConfig = {
readonly path: string;
readonly source: RepoSource;
/** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
readonly path?: string;
/** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
readonly source?: RepoSource;
readonly checkout?: RepoCheckout;
readonly clone?: RepoClone;
};
Expand Down Expand Up @@ -292,8 +294,6 @@ export type DockerWorkspaceConfig = {
readonly memory?: string;
/** CPU limit (e.g. 2, 0.5) */
readonly cpus?: number;
/** @deprecated Prefer workspace.repos[].checkout.base_commit as the checkout source of truth */
readonly base_commit?: string;
};

export type WorkspaceConfig = {
Expand Down
5 changes: 2 additions & 3 deletions packages/core/src/evaluation/validation/eval-file.schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ const RepoCloneSchema = z.object({
});

const RepoSchema = z.object({
path: z.string(),
source: RepoSourceSchema,
path: z.string().optional(),
source: RepoSourceSchema.optional(),
checkout: RepoCheckoutSchema.optional(),
clone: RepoCloneSchema.optional(),
});
Expand All @@ -311,7 +311,6 @@ const DockerWorkspaceSchema = z.object({
timeout: z.number().int().min(1).optional(),
memory: z.string().optional(),
cpus: z.number().min(0.1).optional(),
base_commit: z.string().min(1).optional(),
});

const WorkspaceSchema = z
Expand Down
29 changes: 16 additions & 13 deletions packages/core/src/evaluation/validation/eval-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,8 @@ function validateWorkspaceRepoConfig(
const afterEachHook = isObject(hooks) ? hooks.after_each : undefined;
const isolation = workspace.isolation;

const docker = workspace.docker;

// Depth vs ancestor warning
if (Array.isArray(repos)) {
for (const repo of repos) {
Expand All @@ -410,14 +412,26 @@ function validateWorkspaceRepoConfig(
const checkout = repo.checkout;
const clone = repo.clone;

// Source-less repos are only valid with Docker (repo exists inside container)
if (!isObject(source) && !isObject(docker)) {
errors.push({
severity: 'error',
filePath,
location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
message:
'repos[].source is required for non-Docker workspaces. ' +
'Source-less repos are only valid when workspace.docker is configured (repo exists inside the container).',
});
}

if (isObject(source) && isObject(checkout)) {
const sourceType = source.type;
const resolve = checkout.resolve;
if (sourceType === 'local' && typeof resolve === 'string') {
errors.push({
severity: 'warning',
filePath,
location: `workspace.repos[path=${repo.path}]`,
location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
message:
'checkout.resolve has no effect for a local source. ' +
'Use source.type to choose where the repo comes from; keep checkout.ref, checkout.base_commit, or checkout.ancestor only when pinning a local source.',
Expand All @@ -432,7 +446,7 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'warning',
filePath,
location: `workspace.repos[path=${repo.path}]`,
location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
message:
`clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). ` +
`Recommend depth >= ${ancestor + 1}.`,
Expand All @@ -454,17 +468,6 @@ function validateWorkspaceRepoConfig(
}
}

const docker = workspace.docker;
if (isObject(docker) && typeof docker.base_commit === 'string') {
errors.push({
severity: 'warning',
filePath,
location: 'workspace.docker.base_commit',
message:
'workspace.docker.base_commit is deprecated. Prefer workspace.repos[].checkout.base_commit so checkout state remains backend-agnostic.',
});
}

// after_each reset with per_test isolation warning
if (isObject(afterEachHook) && afterEachHook.reset && isolation === 'per_test') {
errors.push({
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/evaluation/workspace/deps-scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ export async function scanRepoDeps(evalFilePaths: readonly string[]): Promise<De
try {
const repos = await extractReposFromEvalFile(filePath);
for (const repo of repos) {
if (repo.source.type !== 'git') continue;
if (!repo.source || repo.source.type !== 'git') continue;
const ref = repo.checkout?.ref;
const key = `${normalizeGitUrl(repo.source.url)}\0${ref ?? ''}`;
const existing = seen.get(key);
Expand Down
13 changes: 3 additions & 10 deletions packages/core/src/evaluation/workspace/docker-workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,26 +167,19 @@ export class DockerWorkspaceProvider {
}

/**
* Reset the container checkout to the configured base commit, if present.
* Reset the container checkout to the specified target refs, if any.
* This is used for SWE-bench images where the repo state must match the
* dataset's base snapshot before grading begins.
*/
async resetContainerCheckout(
containerId: string,
repoCheckouts?: readonly RepoCheckoutTarget[],
): Promise<void> {
const checkoutTargets =
repoCheckouts && repoCheckouts.length > 0
? repoCheckouts
: this.config.base_commit
? [{ ref: this.config.base_commit }]
: [];

if (checkoutTargets.length === 0) {
if (!repoCheckouts || repoCheckouts.length === 0) {
return;
}

for (const target of checkoutTargets) {
for (const target of repoCheckouts) {
const resetResult = await this.execInContainer({
containerId,
command: buildGitCommand(target, ['reset', '--hard', target.ref]),
Expand Down
45 changes: 27 additions & 18 deletions packages/core/src/evaluation/workspace/pool-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,20 @@ interface PoolMetadata {
* Git URLs are lowercased with .git suffix stripped; local paths are kept as-is.
*/
function normalizeRepoForFingerprint(repo: RepoConfig): Record<string, unknown> {
const source =
repo.source.type === 'git'
? { type: 'git', url: repo.source.url.toLowerCase().replace(/\.git$/, '') }
: { type: 'local', path: repo.source.path };

const result: Record<string, unknown> = {
path: repo.path,
source,
ref: getRepoCheckoutRef(repo.checkout),
};
const result: Record<string, unknown> = {};

if (repo.path) {
result.path = repo.path;
}

if (repo.source) {
result.source =
repo.source.type === 'git'
? { type: 'git', url: repo.source.url.toLowerCase().replace(/\.git$/, '') }
: { type: 'local', path: repo.source.path };
}

result.ref = getRepoCheckoutRef(repo.checkout);

if (repo.clone?.depth !== undefined) {
result.depth = repo.clone.depth;
Expand All @@ -99,7 +103,9 @@ function normalizeRepoForFingerprint(repo: RepoConfig): Record<string, unknown>
*/
export function computeWorkspaceFingerprint(repos: readonly RepoConfig[]): string {
const canonical = {
repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint),
repos: [...repos]
.sort((a, b) => (a.path ?? '').localeCompare(b.path ?? ''))
.map(normalizeRepoForFingerprint),
};

return createHash('sha256').update(JSON.stringify(canonical)).digest('hex');
Expand Down Expand Up @@ -364,8 +370,9 @@ export class WorkspacePoolManager {
repos: readonly RepoConfig[],
poolReset: 'none' | 'fast' | 'strict' = 'fast',
): Promise<void> {
// Reset each repo
// Reset each repo (skip source-less repos — they live inside Docker only)
for (const repo of repos) {
if (!repo.path || !repo.source) continue;
const repoDir = path.join(slotPath, repo.path);
if (!existsSync(repoDir)) {
continue;
Expand Down Expand Up @@ -398,12 +405,14 @@ export class WorkspacePoolManager {
// Re-copy template files, skipping repo directories
if (templatePath) {
const repoDirNames = new Set(
repos.map((r) => {
// Get the top-level directory name from the repo path
// e.g., './my-repo' -> 'my-repo', 'repos/foo' -> 'repos'
const normalized = r.path.replace(/^\.\//, '');
return normalized.split('/')[0];
}),
repos
.filter((r) => r.path)
.map((r) => {
// Get the top-level directory name from the repo path
// e.g., './my-repo' -> 'my-repo', 'repos/foo' -> 'repos'
const normalized = (r.path ?? '').replace(/^\.\//, '');
return normalized.split('/')[0];
}),
);
await copyDirectoryRecursive(templatePath, slotPath, repoDirNames);
}
Expand Down
7 changes: 4 additions & 3 deletions packages/core/src/evaluation/workspace/repo-config-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@ export function parseRepoConfig(raw: unknown): RepoConfig | undefined {
const obj = raw as Record<string, unknown>;
const repoPath = typeof obj.path === 'string' ? obj.path : undefined;
const source = parseRepoSource(obj.source);
if (!repoPath || !source) return undefined;
const checkout = parseRepoCheckout(obj.checkout);
const clone = parseRepoClone(obj.clone);
// At least one meaningful field must be present
if (!repoPath && !source && !checkout && !clone) return undefined;
return {
path: repoPath,
source,
...(repoPath !== undefined && { path: repoPath }),
...(source !== undefined && { source }),
...(checkout !== undefined && { checkout }),
...(clone !== undefined && { clone }),
};
Expand Down
24 changes: 17 additions & 7 deletions packages/core/src/evaluation/workspace/repo-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,18 @@ export class RepoManager {
static validateLocalPaths(repos: readonly RepoConfig[]): readonly LocalPathValidationError[] {
const errors: LocalPathValidationError[] = [];
for (const repo of repos) {
if (repo.source.type !== 'local') continue;
if (!repo.source || repo.source.type !== 'local') continue;

const sourcePath = repo.source.path;
if (!sourcePath || sourcePath.trim() === '') {
errors.push({
repoPath: repo.path,
repoPath: repo.path ?? '(none)',
resolvedSourcePath: sourcePath ?? '',
reason: 'empty_path',
});
} else if (!existsSync(sourcePath)) {
errors.push({
repoPath: repo.path,
repoPath: repo.path ?? '(none)',
resolvedSourcePath: sourcePath,
reason: 'not_found',
});
Expand Down Expand Up @@ -124,6 +124,12 @@ export class RepoManager {
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
*/
async materialize(repo: RepoConfig, workspacePath: string): Promise<void> {
if (!repo.source || !repo.path) {
if (this.verbose) {
console.log(`[repo] materialize skip path=${repo.path ?? '(none)'} (no source or path)`);
}
return;
}
const targetDir = path.join(workspacePath, repo.path);
const sourceUrl = getSourceUrl(repo.source);
const startedAt = Date.now();
Expand Down Expand Up @@ -225,27 +231,31 @@ export class RepoManager {
}
}

/** Materialize all repos into the workspace. */
/** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
async materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void> {
const materializableRepos = repos.filter((r) => r.source);
if (this.verbose) {
console.log(`[repo] materializeAll count=${repos.length} workspace=${workspacePath}`);
console.log(
`[repo] materializeAll count=${materializableRepos.length} (${repos.length - materializableRepos.length} skipped, no source) workspace=${workspacePath}`,
);
}
for (const repo of repos) {
for (const repo of materializableRepos) {
await this.materialize(repo, workspacePath);
}
if (this.verbose) {
console.log('[repo] materializeAll complete');
}
}

/** Reset repos in workspace to their checkout state. */
/** Reset repos in workspace to their checkout state. Skips repos without path or source. */
async reset(
repos: readonly RepoConfig[],
workspacePath: string,
reset: 'fast' | 'strict',
): Promise<void> {
const cleanFlag = reset === 'strict' ? '-fdx' : '-fd';
for (const repo of repos) {
if (!repo.path || !repo.source) continue;
const targetDir = path.join(workspacePath, repo.path);
await this.runGit(['reset', '--hard', 'HEAD'], { cwd: targetDir });
await this.runGit(['clean', cleanFlag], { cwd: targetDir });
Expand Down
1 change: 0 additions & 1 deletion packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,6 @@ function parseDockerWorkspaceConfig(raw: unknown): DockerWorkspaceConfig | undef
...(typeof obj.timeout === 'number' && { timeout: obj.timeout }),
...(typeof obj.memory === 'string' && { memory: obj.memory }),
...(typeof obj.cpus === 'number' && { cpus: obj.cpus }),
...(typeof obj.base_commit === 'string' && { base_commit: obj.base_commit }),
};
}

Expand Down
Loading
Loading