# Core API Reference

*Abhishek Gahlot edited this page Mar 27, 2026 · 1 revision*
Module: deepgym.core
from deepgym import DeepGym

DeepGym(
    api_key: str | None = None,
    mode: Literal['auto', 'daytona', 'local'] = 'auto',
)

| Parameter | Default | Description |
|---|---|---|
| `api_key` | `None` | Daytona API key (falls back to `DAYTONA_API_KEY` env var) |
| `mode` | `'auto'` | Execution mode: `auto`, `daytona`, or `local` |
Run a single solution against an environment.
result = dg.run(env, model_output='def two_sum(nums, target): ...')

Raises: VerifierError, SandboxError, TimeoutError
Run multiple solutions in parallel.
batch = dg.run_batch(env, solutions, max_parallel=8)
print(f'{batch.passed}/{batch.total} passed, avg score: {batch.avg_score:.2f}')

| Parameter | Type | Default |
|---|---|---|
| `env` | `Environment` | required |
| `outputs` | `Sequence[str]` | required |
| `max_parallel` | `int` | `10` |
Evaluate across a whole suite of environments.
result = dg.eval('medium', {'coin_change': code1, 'two_sum': code2}, max_parallel=100)
print(f'Pass rate: {result.pass_rate:.1%}')

Suite names: `easy`, `medium`, `hard`, `all`, or family names like `dynamic-programming`.
Module: deepgym.async_core
from deepgym import AsyncDeepGym

AsyncDeepGym(
    api_key: str | None = None,
    api_url: str | None = None,
    default_timeout: int = 30,
    mode: Literal['auto', 'daytona', 'local'] = 'auto',
)

Extra parameters over the sync client:
| Parameter | Default | Description |
|---|---|---|
| `api_url` | `None` | Remote DeepGym API endpoint (HTTP mode) |
| `default_timeout` | `30` | Fallback timeout in seconds |
Same methods as DeepGym, all async:
result = await dg.run(env, model_output=solution)
batch = await dg.run_batch(env, solutions, max_parallel=10)
eval_result = await dg.eval('medium', outputs, max_parallel=50)

Module: deepgym.models
from deepgym import Environment
env = Environment(
task='Write a function that...',
type='coding',
verifier_code='...', # or verifier_path=Path('verifier.py')
language='python',
timeout=30,
difficulty='medium',
tags=['dp', 'array'],
test_cases=[{'input': [1,2,3], 'expected': 6}],
snapshot='python', # Daytona image name
env_vars={'DEBUG': '1'},
)

| Field | Type | Default |
|---|---|---|
| `task` | `str` | required |
| `type` | `Literal['coding','computer-use','tool-use']` | `'coding'` |
| `verifier_code` | `str` | `''` |
| `verifier_path` | `Path \| None` | `None` |
| `language` | `str` | `'python'` |
| `timeout` | `int` | `30` |
| `difficulty` | `Literal['easy','medium','hard']` | `'medium'` |
| `tags` | `list[str]` | `[]` |
| `test_cases` | `list[dict] \| None` | `None` |
| `snapshot` | `str \| None` | `None` |
| `env_vars` | `dict[str, str] \| None` | `None` |
Must provide either verifier_code or verifier_path.
class RunResult(BaseModel):
score: float # 0.0 to 1.0
passed: bool
output: str # verifier stdout
stderr: str
exit_code: int # 0=pass, 1=fail, 2=error
execution_time_ms: float
sandbox_id: str
reward_components: dict[str, float] | None
metrics: dict[str, Any] | None
seed: int | None
truncated: bool # True if timed out
error_type: str | None
cases: list[CaseResult] | None

class CaseResult(BaseModel):
id: str
passed: bool
score: float # 0.0 to 1.0
input_summary: str
expected_summary: str
actual_summary: str
error: str | None
execution_time_ms: float

class BatchResult(BaseModel):
results: list[RunResult]
total: int
passed: int
failed: int
avg_score: float
execution_time_ms: float

class EvalResult(BaseModel):
suite: str
model_name: str
pass_rate: float
results: list[RunResult]
total: int
passed: int
avg_score: float

Module: deepgym.exceptions
DeepGymError (base)
|-- VerifierError # verifier crashed or bad JSON
|-- SandboxError # sandbox creation/execution failure
|-- TimeoutError # exceeded env.timeout
from deepgym import load_environment, list_environments, load_suite
env = load_environment('coin_change') # load by name
envs = list_environments() # list all available
easy = load_suite('easy') # load a suite