450 changes: 450 additions & 0 deletions MODEL-CONFIGS.md

Large diffs are not rendered by default.

53 changes: 48 additions & 5 deletions config/gni/devtools_grd_files.gni
@@ -313,6 +313,19 @@ grd_files_bundled_sources = [
"front_end/Images/whatsnew.svg",
"front_end/Images/width.svg",
"front_end/Images/zoom-in.svg",
"front_end/Images/asana-mcp.svg",
"front_end/Images/atlassian-mcp.svg",
"front_end/Images/github-mcp.svg",
"front_end/Images/google-drive-mcp.svg",
"front_end/Images/google-sheets-mcp.svg",
"front_end/Images/huggingface-mcp.svg",
"front_end/Images/intercom-mcp.svg",
"front_end/Images/invideo-mcp.svg",
"front_end/Images/linear-mcp.svg",
"front_end/Images/notion-mcp.svg",
"front_end/Images/sentry-mcp.svg",
"front_end/Images/slack-mcp.svg",
"front_end/Images/socket-mcp.svg",
"front_end/Tests.js",
"front_end/application_tokens.css",
"front_end/core/common/common.js",
@@ -635,13 +648,17 @@ grd_files_bundled_sources = [
"front_end/panels/ai_chat/ui/AgentSessionHeaderComponent.js",
"front_end/panels/ai_chat/ui/ToolDescriptionFormatter.js",
"front_end/panels/ai_chat/ui/chatView.css.js",
"front_end/panels/ai_chat/ui/mcp/mcpConnectorsCatalogDialog.css.js",
"front_end/panels/ai_chat/ui/HelpDialog.js",
"front_end/panels/ai_chat/ui/PromptEditDialog.js",
"front_end/panels/ai_chat/ui/SettingsDialog.js",
"front_end/panels/ai_chat/ui/mcp/MCPConnectionsDialog.js",
"front_end/panels/ai_chat/ui/mcp/MCPConnectorsCatalogDialog.js",
"front_end/panels/ai_chat/ui/EvaluationDialog.js",
"front_end/panels/ai_chat/core/AgentService.js",
"front_end/panels/ai_chat/core/State.js",
"front_end/panels/ai_chat/core/Graph.js",
"front_end/panels/ai_chat/core/BuildConfig.js",
"front_end/panels/ai_chat/core/Types.js",
"front_end/panels/ai_chat/core/Constants.js",
"front_end/panels/ai_chat/core/ConfigurableGraph.js",
@@ -655,6 +672,7 @@
"front_end/panels/ai_chat/core/StateGraph.js",
"front_end/panels/ai_chat/core/Logger.js",
"front_end/panels/ai_chat/core/AgentErrorHandler.js",
"front_end/panels/ai_chat/core/AgentDescriptorRegistry.js",
"front_end/panels/ai_chat/core/Version.js",
"front_end/panels/ai_chat/core/VersionChecker.js",
"front_end/panels/ai_chat/LLM/LLMTypes.js",
@@ -720,6 +738,20 @@ grd_files_bundled_sources = [
"front_end/panels/ai_chat/agent_framework/AgentSessionTypes.js",
"front_end/panels/ai_chat/agent_framework/ConfigurableAgentTool.js",
"front_end/panels/ai_chat/agent_framework/implementation/ConfiguredAgents.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ActionVerificationAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/AgentVersion.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ClickActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ContentWriterAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/DirectURLNavigatorAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/EcommerceProductInfoAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/FormFillActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/HoverActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/KeyboardInputActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ResearchAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/ScrollActionAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/WebTaskAgent.js",
"front_end/panels/ai_chat/agent_framework/implementation/agents/SearchAgent.js",
"front_end/panels/ai_chat/common/MarkdownViewerUtil.js",
"front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.js",
"front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.js",
@@ -880,6 +912,7 @@ grd_files_bundled_sources = [
"front_end/third_party/lighthouse/report/report.js",
"front_end/third_party/lit/lit.js",
"front_end/third_party/mcp-sdk/mcp-sdk.js",
"front_end/third_party/mcp-sdk/mcp-sdk-v2.js",
"front_end/third_party/marked/marked.js",
"front_end/third_party/puppeteer-replay/puppeteer-replay.js",
"front_end/third_party/puppeteer/puppeteer.js",
@@ -2239,15 +2272,25 @@ grd_files_unbundled_sources = [
"front_end/third_party/lit/lib/static-html.js",
"front_end/third_party/marked/package/lib/marked.esm.js",
"front_end/third_party/mcp-sdk/ajv/dist/ajv.js",
"front_end/third_party/mcp-sdk/ajv/dist/ajv.bundle.js",
"front_end/third_party/mcp-sdk/ajv/dist/ajv-esm.js",
"front_end/third_party/mcp-sdk/eventsource-parser/package/dist/index.js",
"front_end/third_party/mcp-sdk/eventsource-parser/package/dist/stream.js",
"front_end/third_party/mcp-sdk/package/dist/client/index.js",
"front_end/third_party/mcp-sdk/package/dist/client/sse.js",
"front_end/third_party/mcp-sdk/package/dist/shared/protocol.js",
"front_end/third_party/mcp-sdk/package/dist/shared/transport.js",
"front_end/third_party/mcp-sdk/package/dist/types.js",
"front_end/third_party/mcp-sdk/dist/esm/client/auth.js",
"front_end/third_party/mcp-sdk/dist/esm/client/index.js",
"front_end/third_party/mcp-sdk/dist/esm/client/sse.js",
"front_end/third_party/mcp-sdk/dist/esm/client/streamableHttp.js",
"front_end/third_party/mcp-sdk/dist/esm/server/index.js",
"front_end/third_party/mcp-sdk/dist/esm/server/auth/errors.js",
"front_end/third_party/mcp-sdk/dist/esm/shared/auth.js",
"front_end/third_party/mcp-sdk/dist/esm/shared/auth-utils.js",
"front_end/third_party/mcp-sdk/dist/esm/shared/protocol.js",
"front_end/third_party/mcp-sdk/dist/esm/shared/transport.js",
"front_end/third_party/mcp-sdk/dist/esm/types.js",
"front_end/third_party/mcp-sdk/dist/zod/zod-esm.js",
"front_end/third_party/mcp-sdk/zod/lib/index.js",
"front_end/third_party/mcp-sdk/zod/lib/index.mjs",
"front_end/third_party/mcp-sdk/zod/zod-esm.js",
"front_end/third_party/puppeteer-replay/package/lib/main.js",
"front_end/third_party/puppeteer/package/lib/esm/puppeteer/api/Browser.js",
"front_end/third_party/puppeteer/package/lib/esm/puppeteer/api/BrowserContext.js",
13 changes: 13 additions & 0 deletions config/gni/devtools_image_files.gni
@@ -308,6 +308,19 @@ devtools_svg_sources = [
"whatsnew.svg",
"width.svg",
"zoom-in.svg",
"asana-mcp.svg",
"atlassian-mcp.svg",
"github-mcp.svg",
"google-drive-mcp.svg",
"google-sheets-mcp.svg",
"huggingface-mcp.svg",
"intercom-mcp.svg",
"invideo-mcp.svg",
"linear-mcp.svg",
"notion-mcp.svg",
"sentry-mcp.svg",
"slack-mcp.svg",
"socket-mcp.svg",
]

devtools_src_svg_files = []
45 changes: 45 additions & 0 deletions eval-server/nodejs/.env.example
@@ -0,0 +1,45 @@
# Evaluation Server Configuration
# Copy this file to .env and configure your settings

# Server Configuration
PORT=8080
HOST=127.0.0.1

# LLM Provider API Keys
# Configure one or more providers for evaluation

# OpenAI Configuration
OPENAI_API_KEY=sk-your-openai-api-key-here

# LiteLLM Configuration (if using a LiteLLM server)
LITELLM_ENDPOINT=http://localhost:4000
LITELLM_API_KEY=your-litellm-api-key-here

# Groq Configuration
GROQ_API_KEY=gsk_your-groq-api-key-here

# OpenRouter Configuration
OPENROUTER_API_KEY=sk-or-v1-your-openrouter-api-key-here

# Default LLM Configuration for Evaluations
# These will be used as fallbacks when not specified in evaluation requests
DEFAULT_PROVIDER=openai
DEFAULT_MAIN_MODEL=gpt-4
DEFAULT_MINI_MODEL=gpt-4-mini
DEFAULT_NANO_MODEL=gpt-3.5-turbo

# Logging Configuration
LOG_LEVEL=info
LOG_DIR=./logs

# Client Configuration
CLIENTS_DIR=./clients
EVALS_DIR=./evals

# RPC Configuration
RPC_TIMEOUT=30000

# Security
# Set this to enable authentication for client connections
# Leave empty to disable authentication
AUTH_SECRET_KEY=
73 changes: 71 additions & 2 deletions eval-server/nodejs/CLAUDE.md
@@ -22,6 +22,16 @@ bo-eval-server is a WebSocket-based evaluation server for LLM agents that implem
- `OPENAI_API_KEY` - OpenAI API key for LLM judge functionality
- `PORT` - WebSocket server port (default: 8080)

### LLM Provider Configuration (Optional)
- `GROQ_API_KEY` - Groq API key for Groq provider support
- `OPENROUTER_API_KEY` - OpenRouter API key for OpenRouter provider support
- `LITELLM_ENDPOINT` - LiteLLM server endpoint URL
- `LITELLM_API_KEY` - LiteLLM API key for LiteLLM provider support
- `DEFAULT_PROVIDER` - Default LLM provider (openai, groq, openrouter, litellm)
- `DEFAULT_MAIN_MODEL` - Default main model name
- `DEFAULT_MINI_MODEL` - Default mini model name
- `DEFAULT_NANO_MODEL` - Default nano model name
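
As a rough sketch of how these settings might compose (the exact precedence implemented in `src/config.js` is not shown here, so the ordering below is an assumption): per-evaluation model settings first, then per-client `configure_llm` settings, then the `DEFAULT_*` environment variables.

```javascript
// Illustrative only: assumed fallback order for resolving a model tier.
// Names like resolveModelConfig and clientConfig are hypothetical.
function resolveModelConfig(requestModel, clientConfig, tier /* 'main' | 'mini' | 'nano' */) {
  const envDefaults = {
    provider: process.env.DEFAULT_PROVIDER || 'openai',
    main: process.env.DEFAULT_MAIN_MODEL,
    mini: process.env.DEFAULT_MINI_MODEL,
    nano: process.env.DEFAULT_NANO_MODEL,
  };
  return {
    provider: requestModel?.provider ?? clientConfig?.provider ?? envDefaults.provider,
    model: requestModel?.model ?? clientConfig?.models?.[tier] ?? envDefaults[tier],
    apiKey: requestModel?.api_key ?? clientConfig?.apiKey ?? process.env.OPENAI_API_KEY,
  };
}
```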

## Architecture

### Core Components
@@ -33,10 +43,11 @@ bo-eval-server is a WebSocket-based evaluation server for LLM agents that implem
- Handles bidirectional RPC communication

**RPC Client** (`src/rpc-client.js`)
- Implements JSON-RPC 2.0 protocol for server-to-client calls
- Implements JSON-RPC 2.0 protocol for bidirectional communication
- Manages request/response correlation with unique IDs
- Handles timeouts and error conditions
- Calls `Evaluate(request: String) -> String` method on connected agents
- Supports `configure_llm` method for dynamic LLM provider configuration
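
The ID correlation and timeout handling can be sketched roughly as follows; this is an illustrative pattern, not the actual `src/rpc-client.js` code, and the `ws` argument is assumed to be a standard WebSocket connection.

```javascript
// Illustrative only: server-to-client JSON-RPC call with ID correlation and a timeout.
import { randomUUID } from 'node:crypto';

const pending = new Map(); // request id -> { resolve, reject, timer }

function callClient(ws, method, params, timeoutMs = 30000) {
  const id = randomUUID();
  return new Promise((resolve, reject) => {
    const timer = setTimeout(() => {
      pending.delete(id);
      reject(new Error(`RPC ${method} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
    pending.set(id, { resolve, reject, timer });
    ws.send(JSON.stringify({ jsonrpc: '2.0', method, params, id }));
  });
}

// Settle the matching request when a response arrives on the socket.
function handleMessage(raw) {
  const msg = JSON.parse(raw);
  const entry = msg.id !== undefined ? pending.get(msg.id) : undefined;
  if (!entry) return; // notification or unknown id
  clearTimeout(entry.timer);
  pending.delete(msg.id);
  if (msg.error) entry.reject(new Error(msg.error.message));
  else entry.resolve(msg.result);
}
```

With a helper like this, an evaluation call is roughly `await callClient(ws, 'evaluate', {...})`, and `configure_llm` goes through the same path with the parameters described below.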

**LLM Evaluator** (`src/evaluator.js`)
- Integrates with OpenAI API for LLM-as-a-judge functionality
@@ -78,7 +89,10 @@ logs/ # Log files (created automatically)
### Key Features

- **Bidirectional RPC**: Server can call methods on connected clients
- **LLM-as-a-Judge**: Automated evaluation of agent responses using GPT-4
- **Multi-Provider LLM Support**: Support for OpenAI, Groq, OpenRouter, and LiteLLM providers
- **Dynamic LLM Configuration**: Runtime configuration via `configure_llm` JSON-RPC method
- **Per-Client Configuration**: Each connected client can have different LLM settings
- **LLM-as-a-Judge**: Automated evaluation of agent responses using configurable LLM providers
- **Concurrent Evaluations**: Support for multiple agents and parallel evaluations
- **Structured Logging**: All interactions logged as JSON for analysis
- **Interactive CLI**: Built-in CLI for testing and server management
@@ -93,6 +107,61 @@ Agents must implement:
- `Evaluate(task: string) -> string` method
- "ready" message to signal availability for evaluations

### LLM Configuration Protocol

The server supports dynamic LLM configuration via the `configure_llm` JSON-RPC method:

```json
{
"jsonrpc": "2.0",
"method": "configure_llm",
"params": {
"provider": "openai|groq|openrouter|litellm",
"apiKey": "your-api-key",
"endpoint": "endpoint-url-for-litellm",
"models": {
"main": "main-model-name",
"mini": "mini-model-name",
"nano": "nano-model-name"
},
"partial": false
},
"id": "config-request-id"
}
```

### Evaluation Model Configuration

Evaluations support nested model configuration for flexible per-tier settings:

```json
{
"jsonrpc": "2.0",
"method": "evaluate",
"params": {
"tool": "chat",
"input": {"message": "Hello"},
"model": {
"main_model": {
"provider": "openai",
"model": "gpt-4",
"api_key": "sk-main-key"
},
"mini_model": {
"provider": "openai",
"model": "gpt-4-mini",
"api_key": "sk-mini-key"
},
"nano_model": {
"provider": "groq",
"model": "llama-3.1-8b-instant",
"api_key": "gsk-nano-key"
}
}
}
}
```

### Configuration

All configuration is managed through environment variables and `src/config.js`. Key settings:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 60000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 30000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 45000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 30000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 60000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 45000

Comment on lines +12 to 14
🛠️ Refactor suggestion

🧩 Analysis chain

Tool rename to "extract_data" and longer timeout look good; verify end‑to‑end support.

Confirm server/tool registry, schemas, and tests accept "extract_data" (no lingering "extract_schema_data" usage).


🏁 Script executed:

#!/bin/bash
# Check for any remaining references to the old tool name
rg -nP '\bextract_schema_data\b' -C2

# Spot YAML evals still using the old tool field
rg -nP --type=yaml -n -C2 '\btool:\s*"?extract_schema_data"?'

# Sanity-check that the new name is present in configs and schema enums
rg -nP -n -C2 '\btool:\s*"?extract_data"?'
rg -nP -n -C3 '"evaluation"\s*:.*\{[^}]*"tool"[^}]*\benum\b[^}]*\bextract_data\b'

Length of output: 9746


Replace remaining "extract_schema_data" references with "extract_data" and confirm server/tool registry + tests accept the new name.

Docs/examples still use the old tool name while eval YAMLs/templates use "extract_data" — update the docs/snippets and ensure any server enums/switches or tests accept "extract_data".

Affected files (examples found):

  • eval-server/nodejs/docs/PROTOCOL.md
  • eval-server/nodejs/docs/YAML_SCHEMA.md
  • eval-server/nodejs/docs/CLIENT_SETUP.md
  • eval-server/nodejs/docs/TRIGGERING_EVALUATIONS.md
🤖 Prompt for AI Agents
In eval-server/nodejs/evals/schema-extractor/google-search-001.yaml around lines
12 to 14, the YAML uses the new tool name "extract_data" but other docs and code
still reference "extract_schema_data"; search and replace all remaining
"extract_schema_data" occurrences with "extract_data" in the listed docs
(eval-server/nodejs/docs/PROTOCOL.md, YAML_SCHEMA.md, CLIENT_SETUP.md,
TRIGGERING_EVALUATIONS.md) and update any server-side registries/enums/switch
statements and unit/integration tests that validate tool names to accept
"extract_data" (or map the old name to the new one). After changes, run the test
suite and CI to ensure no references remain and the new name is accepted.

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 60000

input:
2 changes: 1 addition & 1 deletion eval-server/nodejs/evals/schema-extractor/macys-001.yaml
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 60000

input:
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ target:
wait_for: "networkidle"
wait_timeout: 5000

tool: "extract_schema_data"
tool: "extract_data"
timeout: 30000

input:
2 changes: 1 addition & 1 deletion eval-server/nodejs/evals/web-task-agent/jobs-001.yaml
@@ -46,7 +46,7 @@ validation:
- "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction"
- "If using direct URL: constructed proper LinkedIn job search URL with keywords and location"
- "If using forms: delegated keyword and location input to action_agent"
- "Extracted job listings using schema_based_extractor"
- "Extracted job listings using extract_data"
- "Returned structured job data in readable text format (not JSON)"
- "Each job listing includes title, company, location, and other relevant fields"
- "Results are numbered or organized clearly for easy reading"
Original file line number Diff line number Diff line change
@@ -49,7 +49,7 @@ validation:
- "Delegated price filter setting to action_agent"
- "Coordinated property type selection through action_agent"
- "Applied search filters through proper action_agent calls"
- "Extracted property listings with schema_based_extractor"
- "Extracted property listings with extract_data"
🛠️ Refactor suggestion

🧩 Analysis chain

Ensure repo-wide extractor rename alignment.

Double-check that client schema/tool enums and templates now use extract_data.

Run:


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Find lingering old extractor names..."
rg -n -C1 -g '!**/node_modules/**' -e 'schema_based_extractor' -e 'extract_schema_data' || true

echo
echo "Validate schema/tool enums reference extract_data:"
fd default-client.yaml | xargs -I{} rg -n -C1 'extract_data' {}
fd client.schema.json | xargs -I{} rg -n -C2 'extract_data|extract_schema_data|schema_based_extractor' {}

Length of output: 465


Ensure repo-wide extractor rename alignment

default-client.yaml already uses "extract_data" (lines 29–31) but client.schema.json still enumerates "extract_schema_streamlined" alongside "extract_data" (lines 131–135); update/remove the legacy enum value and align any templates/libs to "extract_data", then run a repo-wide search for other old extractor names.

🤖 Prompt for AI Agents
In eval-server/nodejs/evals/web-task-agent/realestate-001.yaml around line 52
the extractor name is "extract_data" but client.schema.json (around lines
131–135) still includes the legacy enum value "extract_schema_streamlined";
remove that legacy enum entry from client.schema.json (leaving only
"extract_data"), update any templates or library references that still use
"extract_schema_streamlined" to "extract_data", and run a repo-wide
search/replace to ensure all extractor names are aligned and update tests or
docs if required.

- "Returned structured property data in readable text format (not JSON)"
- "Each property includes address, price, bedrooms, bathrooms, and other key details"
- "Properties are clearly numbered or organized for easy comparison"
Original file line number Diff line number Diff line change
@@ -46,7 +46,7 @@ validation:
- "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction"
- "If using direct URL: constructed proper LinkedIn job search URL with keywords and location"
- "If using forms: delegated keyword and location input to action_agent"
- "Extracted job listings using schema_based_extractor"
- "Extracted job listings using extract_data"
- "Returned structured job data in readable text format (not JSON)"
- "Each job listing includes title, company, location, and other relevant fields"
- "Results are numbered or organized clearly for easy reading"
Original file line number Diff line number Diff line change
@@ -49,7 +49,7 @@ validation:
- "Delegated price filter setting to action_agent"
- "Coordinated property type selection through action_agent"
- "Applied search filters through proper action_agent calls"
- "Extracted property listings with schema_based_extractor"
- "Extracted property listings with extract_data"
🛠️ Refactor suggestion

🧩 Analysis chain

Consistency check: fully migrate to extract_data across configs/schemas.

Ensure no lingering references to schema_based_extractor or extract_schema_data remain in evals, templates, or schemas.

Run to verify:


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Searching for old extractor names..."
rg -n -C1 -g '!**/node_modules/**' -e 'schema_based_extractor' -e 'extract_schema_data' || true

echo
echo "Confirming extract_data is used for evaluation.tool values..."
rg -n -C1 -g '!**/node_modules/**' -e 'tool:\s*"extract_data"' -e '"tool"\s*:\s*"extract_data"' || true

echo
echo "Check client schema enum still doesn't reference old names..."
fd client.schema.json | xargs -I{} rg -n -C2 -e 'extract_schema_data|schema_based_extractor|extract_data' {}

Length of output: 629


Consistency check — fully migrate to extract_data across configs/schemas.
No matches for schema_based_extractor or extract_schema_data, but client.schema.json's enum still contains "extract_schema_streamlined" alongside "extract_data" — remove/rename that legacy entry so only "extract_data" remains.
client.schema.json (enum, ~lines 131–135)

🤖 Prompt for AI Agents
In eval-server/nodejs/evals/web-task-agent/web-task-agent-realestate-001.yaml
around line 52 and in client.schema.json around lines 131–135, the enum still
contains the legacy value "extract_schema_streamlined" alongside "extract_data";
remove (or rename) that legacy enum entry so the enum only includes
"extract_data", and scan the repo/configs for any remaining references to
"extract_schema_streamlined" (rename them to "extract_data" or delete) to
complete the migration to extract_data.

- "Returned structured property data in readable text format (not JSON)"
- "Each property includes address, price, bedrooms, bathrooms, and other key details"
- "Properties are clearly numbered or organized for easy comparison"